@@ -146,13 +146,20 @@ public final class UnigramTokenizer extends Tokenizer {
         offsetAtt.setOffset(correctOffset(whitespaceTokenizer.finalOffset), correctOffset(whitespaceTokenizer.finalOffset));
     }
 
-    @Override
-    public boolean incrementToken() throws IOException {
-        clearAttributes();
+    private void popFromTokens() {
         if (tokens.isEmpty() == false) {
             DelimitedToken.Encoded token = tokens.removeFirst();
+            tokenizedValues.add(token);
             termAtt.setEmpty().append(token.charSequence());
             offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        }
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+        clearAttributes();
+        if (tokens.isEmpty() == false) {
+            popFromTokens();
             return true;
         }
         // First, whitespace tokenize
@@ -160,7 +167,7 @@ public final class UnigramTokenizer extends Tokenizer {
         if (whitespaceToken != null) {
             if (neverSplitHash.contains(whitespaceToken.charSequence())) {
                 Integer maybeTokenized = vocabToId.get(new BytesRef(whitespaceToken.charSequence()));
-                tokenizedValues.add(
+                tokens.add(
                     new DelimitedToken.Encoded(
                         whitespaceToken.charSequence().toString(),
                         Objects.requireNonNullElse(maybeTokenized, unknownTokenId),
@@ -168,7 +175,7 @@ public final class UnigramTokenizer extends Tokenizer {
                         correctOffset(whitespaceToken.endOffset())
                     )
                 );
-                offsetAtt.setOffset(correctOffset(whitespaceToken.startOffset()), correctOffset(whitespaceToken.endOffset()));
+                popFromTokens();
                 return true;
             }
             int inputOffsetStart = whitespaceToken.startOffset();
@@ -217,12 +224,9 @@ public final class UnigramTokenizer extends Tokenizer {
                     MultiCharSequence.from(PREFIX, token.charSequence()),
                     offsetCorrectorFunction
                 );
-                tokenizedValues.addAll(tokenList);
                 tokens.addAll(tokenList);
             }
-            DelimitedToken.Encoded token = tokens.removeFirst();
-            termAtt.setEmpty().append(token.charSequence());
-            offsetAtt.setOffset(token.startOffset(), token.endOffset());
+            popFromTokens();
             return true;
         }
         return false;
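
Taken together, the diff funnels every emit path of incrementToken() through the new popFromTokens() helper, so the pending queue, the tokenizedValues record, and the term/offset attributes are always updated in one place; previously the never-split path wrote to tokenizedValues and set the offsets by hand. Below is a minimal, self-contained sketch of the same pattern using illustrative names and plain strings instead of the Lucene attribute APIs; it is not the actual Elasticsearch class:

    import java.util.ArrayDeque;
    import java.util.ArrayList;
    import java.util.Deque;
    import java.util.List;

    final class PendingTokenSketch {
        private final Deque<String> tokens = new ArrayDeque<>();
        private final List<String> tokenizedValues = new ArrayList<>();
        private String currentTerm;

        // Analogous to popFromTokens(): pop the next pending token, record it,
        // and publish it as the current term, all in one place.
        private void popFromTokens() {
            if (tokens.isEmpty() == false) {
                String token = tokens.removeFirst();
                tokenizedValues.add(token); // recorded exactly once, at emission time
                currentTerm = token;
            }
        }

        // Analogous to incrementToken(): returns true while a token was emitted.
        boolean incrementToken(List<String> newlyTokenized) {
            currentTerm = null;
            if (tokens.isEmpty() == false) {
                popFromTokens();
                return true;
            }
            if (newlyTokenized.isEmpty() == false) {
                tokens.addAll(newlyTokenized); // queue first, then emit via the helper
                popFromTokens();
                return true;
            }
            return false;
        }
    }

Because emission and recording happen in the same helper, tokenizedValues can only ever contain tokens that were actually surfaced to the consumer, in emission order, which is the invariant the removed ad hoc tokenizedValues.add/addAll calls could not guarantee.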