
[ML] fix tokenization bug when handling normalization in BERT and MPNet (#92329)

There is a small bug in how we handle normalization and accent stripping in BERT and MPNet. 

When triggered, the bug surfaces as an error like this:
```
'java.lang.IllegalArgumentException: startOffset must be non-negative, and endOffset must be >= startOffset; got startOffset=553,endOffset=552', 'caused_by': {'type': 'illegal_argument_exception', 'reason': 'startOffset must be non-negative, and endOffset must be >= startOffset; got startOffset=553,endOffset=552'
```

The bug occurs when we normalize a string and then attempt to split it on a punctuation mark. Since we don't adjust offsets to account for normalization, splitting on punctuation after normalizing can produce invalid offsets. Take the token `Br창n's` as an example: the old code normalized the string first (which decomposes `창` and thus increases the character count) and then split on `'`. Because the decomposition changed the character count, the offsets we calculate from character positions no longer matched the original input.
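To make the offset drift concrete, here is a minimal, self-contained sketch (my own illustration, not code from this PR) using the JDK's `java.text.Normalizer`; the tokenizer itself uses ICU's normalizer, but the decomposition is the same. Under NFD, the Hangul syllable `창` decomposes into three jamo, so the normalized string is longer than the input it came from:

```java
import java.text.Normalizer;

public class DecompositionDemo {
    public static void main(String[] args) {
        String token = "Br창n's"; // 6 chars: the syllable 창 is a single code unit
        // NFD splits 창 (U+CC3D) into the jamo ᄎ + ᅡ + ᆼ (three code units)
        String nfd = Normalizer.normalize(token, Normalizer.Form.NFD);
        System.out.println(token.length()); // 6
        System.out.println(nfd.length());   // 8
        // Any offset computed against the normalized text is now shifted by 2
        // relative to the original input.
    }
}
```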

To simplify the logic, I changed normalization to be done AFTER our split on punctuation. This way the offsets still refer to the original input, and it does not matter whether normalization adds or removes characters, because the punctuation split has already been handled.
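As a rough sketch of the new ordering (again my own illustration: a hypothetical `Tok` record stands in for the real `DelimitedToken`, and only the apostrophe is treated as punctuation), splitting first pins the offsets to the original input, so normalizing each piece afterwards cannot disturb them:

```java
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;

public class SplitThenNormalize {
    // Hypothetical stand-in for DelimitedToken: text plus offsets into the ORIGINAL input.
    record Tok(String text, int start, int end) {}

    public static void main(String[] args) {
        String input = "Br창n's";
        List<Tok> toks = new ArrayList<>();
        int start = 0;
        // 1) Split on punctuation against the original input, so offsets are exact.
        for (int i = 0; i < input.length(); i++) {
            if (input.charAt(i) == '\'') { // simplified: only splits on the apostrophe
                toks.add(new Tok(input.substring(start, i), start, i));
                toks.add(new Tok("'", i, i + 1));
                start = i + 1;
            }
        }
        if (start < input.length()) {
            toks.add(new Tok(input.substring(start), start, input.length()));
        }
        // 2) Normalize AFTER splitting; start/end still point into the original
        //    input no matter how many characters decomposition adds or removes.
        for (Tok t : toks) {
            String norm = Normalizer.normalize(t.text(), Normalizer.Form.NFD);
            System.out.println(norm + " -> [" + t.start() + "," + t.end() + ")");
        }
        // Prints: Br창n -> [0,4)   ' -> [4,5)   s -> [5,6)
    }
}
```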

closes: https://github.com/elastic/elasticsearch/issues/92243
Benjamin Trent · commit 143fe5b1c7

+ 5 - 0
docs/changelog/92329.yaml

@@ -0,0 +1,5 @@
+pr: 92329
+summary: Fix tokenization bug when handling normalization in BERT and MPNet
+area: Machine Learning
+type: bug
+issues: []

+ 16 - 12
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenFilter.java

@@ -118,19 +118,19 @@ public final class BasicTokenFilter extends TokenFilter {
         }
         current = null; // not really needed, but for safety
         if (input.incrementToken()) {
-            if (isStripAccents) {
-                stripAccent();
-            }
             if (neverSplitSet.contains(termAtt)) {
                 return true;
             }
             // split punctuation and maybe cjk chars!!!
             LinkedList<DelimitedToken> splits = split();
-            // There is nothing to merge, nothing to store, simply return
-            if (splits.size() == 1) {
-                return true;
+            LinkedList<DelimitedToken> delimitedTokens = mergeSplits(splits);
+            if (isStripAccents) {
+                for (DelimitedToken token : delimitedTokens) {
+                    tokens.add(stripAccent(token));
+                }
+            } else {
+                tokens.addAll(delimitedTokens);
             }
-            tokens.addAll(mergeSplits(splits));
             this.current = captureState();
             DelimitedToken token = tokens.removeFirst();
             termAtt.setEmpty().append(token.charSequence());
@@ -140,14 +140,14 @@ public final class BasicTokenFilter extends TokenFilter {
         return false;
     }
 
-    private void stripAccent() {
+    private DelimitedToken stripAccent(DelimitedToken token) {
         accentBuffer.setLength(0);
         boolean changed = false;
-        if (normalizer.quickCheck(termAtt) != Normalizer.YES) {
-            normalizer.normalize(termAtt, accentBuffer);
+        if (normalizer.quickCheck(token.charSequence()) != Normalizer.YES) {
+            normalizer.normalize(token.charSequence(), accentBuffer);
             changed = true;
         } else {
-            accentBuffer.append(termAtt);
+            accentBuffer.append(token.charSequence());
         }
         List<Integer> badIndices = new ArrayList<>();
         List<Integer> charCount = new ArrayList<>();
@@ -172,8 +172,9 @@ public final class BasicTokenFilter extends TokenFilter {
             }
         }
         if (changed) {
-            termAtt.setEmpty().append(accentBuffer);
+            return new DelimitedToken(accentBuffer.toString(), token.startOffset(), token.endOffset());
         }
+        return token;
     }
 
     private LinkedList<DelimitedToken> split() {
@@ -210,6 +211,9 @@ public final class BasicTokenFilter extends TokenFilter {
     }
 
     private LinkedList<DelimitedToken> mergeSplits(LinkedList<DelimitedToken> splits) {
+        if (splits.size() == 1) {
+            return splits;
+        }
         LinkedList<DelimitedToken> mergedTokens = new LinkedList<>();
         List<DelimitedToken> matchingTokens = new ArrayList<>();
         CharSeqTokenTrieNode current = neverSplit;

+ 29 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BertTokenizerTests.java

@@ -67,6 +67,35 @@ public class BertTokenizerTests extends ESTestCase {
         }
     }
 
+    public void testTokenizeFailureCaseAccentFilter() {
+        List<String> testingVocab = List.of(
+            "[CLS]",
+            "br",
+            "##ᄎ",
+            "##ᅡ",
+            "##ᆼ",
+            "##n",
+            "'",
+            "s",
+            "[SEP]",
+            BertTokenizer.MASK_TOKEN,
+            BertTokenizer.UNKNOWN_TOKEN,
+            BertTokenizer.PAD_TOKEN
+        );
+        try (
+            BertTokenizer tokenizer = BertTokenizer.builder(
+                testingVocab,
+                new BertTokenization(true, true, 512, Tokenization.Truncate.FIRST, -1)
+            ).build()
+        ) {
+            TokenizationResult.Tokens tokenization = tokenizer.tokenize("Br창n's", Tokenization.Truncate.NONE, -1, 0).get(0);
+            assertThat(tokenization.tokenIds(), equalTo(new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 }));
+
+            tokenization = tokenizer.tokenize("Br창n", Tokenization.Truncate.NONE, -1, 0).get(0);
+            assertThat(tokenization.tokenIds(), equalTo(new int[] { 0, 1, 2, 3, 4, 5, 8 }));
+        }
+    }
+
     public void testTokenizeLargeInputNoTruncation() {
         try (
             BertTokenizer tokenizer = BertTokenizer.builder(