|
@@ -118,19 +118,19 @@ public final class BasicTokenFilter extends TokenFilter {
|
|
}
|
|
}
|
|
current = null; // not really needed, but for safety
|
|
current = null; // not really needed, but for safety
|
|
if (input.incrementToken()) {
|
|
if (input.incrementToken()) {
|
|
- if (isStripAccents) {
|
|
|
|
- stripAccent();
|
|
|
|
- }
|
|
|
|
if (neverSplitSet.contains(termAtt)) {
|
|
if (neverSplitSet.contains(termAtt)) {
|
|
return true;
|
|
return true;
|
|
}
|
|
}
|
|
// split punctuation and maybe cjk chars!!!
|
|
// split punctuation and maybe cjk chars!!!
|
|
LinkedList<DelimitedToken> splits = split();
|
|
LinkedList<DelimitedToken> splits = split();
|
|
- // There is nothing to merge, nothing to store, simply return
|
|
|
|
- if (splits.size() == 1) {
|
|
|
|
- return true;
|
|
|
|
|
|
+ LinkedList<DelimitedToken> delimitedTokens = mergeSplits(splits);
|
|
|
|
+ if (isStripAccents) {
|
|
|
|
+ for (DelimitedToken token : delimitedTokens) {
|
|
|
|
+ tokens.add(stripAccent(token));
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ tokens.addAll(delimitedTokens);
|
|
}
|
|
}
|
|
- tokens.addAll(mergeSplits(splits));
|
|
|
|
this.current = captureState();
|
|
this.current = captureState();
|
|
DelimitedToken token = tokens.removeFirst();
|
|
DelimitedToken token = tokens.removeFirst();
|
|
termAtt.setEmpty().append(token.charSequence());
|
|
termAtt.setEmpty().append(token.charSequence());
|
|
@@ -140,14 +140,14 @@ public final class BasicTokenFilter extends TokenFilter {
|
|
return false;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
|
|
- private void stripAccent() {
|
|
|
|
|
|
+ private DelimitedToken stripAccent(DelimitedToken token) {
|
|
accentBuffer.setLength(0);
|
|
accentBuffer.setLength(0);
|
|
boolean changed = false;
|
|
boolean changed = false;
|
|
- if (normalizer.quickCheck(termAtt) != Normalizer.YES) {
|
|
|
|
- normalizer.normalize(termAtt, accentBuffer);
|
|
|
|
|
|
+ if (normalizer.quickCheck(token.charSequence()) != Normalizer.YES) {
|
|
|
|
+ normalizer.normalize(token.charSequence(), accentBuffer);
|
|
changed = true;
|
|
changed = true;
|
|
} else {
|
|
} else {
|
|
- accentBuffer.append(termAtt);
|
|
|
|
|
|
+ accentBuffer.append(token.charSequence());
|
|
}
|
|
}
|
|
List<Integer> badIndices = new ArrayList<>();
|
|
List<Integer> badIndices = new ArrayList<>();
|
|
List<Integer> charCount = new ArrayList<>();
|
|
List<Integer> charCount = new ArrayList<>();
|
|
@@ -172,8 +172,9 @@ public final class BasicTokenFilter extends TokenFilter {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (changed) {
|
|
if (changed) {
|
|
- termAtt.setEmpty().append(accentBuffer);
|
|
|
|
|
|
+ return new DelimitedToken(accentBuffer.toString(), token.startOffset(), token.endOffset());
|
|
}
|
|
}
|
|
|
|
+ return token;
|
|
}
|
|
}
|
|
|
|
|
|
private LinkedList<DelimitedToken> split() {
|
|
private LinkedList<DelimitedToken> split() {
|
|
@@ -210,6 +211,9 @@ public final class BasicTokenFilter extends TokenFilter {
|
|
}
|
|
}
|
|
|
|
|
|
private LinkedList<DelimitedToken> mergeSplits(LinkedList<DelimitedToken> splits) {
|
|
private LinkedList<DelimitedToken> mergeSplits(LinkedList<DelimitedToken> splits) {
|
|
|
|
+ if (splits.size() == 1) {
|
|
|
|
+ return splits;
|
|
|
|
+ }
|
|
LinkedList<DelimitedToken> mergedTokens = new LinkedList<>();
|
|
LinkedList<DelimitedToken> mergedTokens = new LinkedList<>();
|
|
List<DelimitedToken> matchingTokens = new ArrayList<>();
|
|
List<DelimitedToken> matchingTokens = new ArrayList<>();
|
|
CharSeqTokenTrieNode current = neverSplit;
|
|
CharSeqTokenTrieNode current = neverSplit;
|