|
|
@@ -101,12 +101,14 @@ public class BasicTokenizer {
|
|
|
|
|
|
// At this point text has been tokenized by whitespace
|
|
|
// but one of the special never split tokens could be adjacent
|
|
|
- // to a punctuation character.
|
|
|
- if (isCommonPunctuation(token.codePointAt(token.length() -1)) &&
|
|
|
- neverSplit.contains(token.substring(0, token.length() -1))) {
|
|
|
- processedTokens.add(token.substring(0, token.length() -1));
|
|
|
- processedTokens.add(token.substring(token.length() -1));
|
|
|
- continue;
|
|
|
+ // to one or more punctuation characters.
|
|
|
+ if (isCommonPunctuation(token.codePointAt(token.length() -1))) {
|
|
|
+ int lastNonPunctuationIndex = findLastNonPunctuationIndex(token);
|
|
|
+ if (lastNonPunctuationIndex >= 0 && neverSplit.contains(token.substring(0, lastNonPunctuationIndex + 1))) {
|
|
|
+ processedTokens.add(token.substring(0, lastNonPunctuationIndex + 1));
|
|
|
+ processedTokens.addAll(splitOnPunctuation(token.substring(lastNonPunctuationIndex + 1)));
|
|
|
+ continue;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
if (isLowerCase) {
|
|
|
@@ -121,6 +123,17 @@ public class BasicTokenizer {
|
|
|
return processedTokens;
|
|
|
}
|
|
|
|
|
|
+ private int findLastNonPunctuationIndex(String token) {
|
|
|
+ int i = token.length() - 1;
|
|
|
+ while (i >= 0) {
|
|
|
+ if (isCommonPunctuation(token.codePointAt(i)) == false) {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ i--;
|
|
|
+ }
|
|
|
+ return i;
|
|
|
+ }
|
|
|
+
|
|
|
public boolean isLowerCase() {
|
|
|
return isLowerCase;
|
|
|
}
|