Browse Source

[ML] Fix deberta tokenizer bug caused by bug in normalizer (#117189) (#117254)

* Fix deberta tokenizer bug caused by a bug in the normalizer, which caused offsets to be negative

* Update docs/changelog/117189.yaml
Max Hniebergall 10 months ago
parent
commit
e0733f305c

+ 5 - 0
docs/changelog/117189.yaml

@@ -0,0 +1,5 @@
+pr: 117189
+summary: Fix deberta tokenizer bug caused by bug in normalizer
+area: Machine Learning
+type: bug
+issues: []

+ 1 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java

@@ -194,7 +194,7 @@ public class PrecompiledCharMapNormalizer extends BaseCharFilter {
                     if (charDelta < 0) {
                         // normalised form is shorter
                         int lastDiff = getLastCumulativeDiff();
-                        addOffCorrectMap(normalizedCharPos, lastDiff + charDelta);
+                        addOffCorrectMap(normalizedCharPos, lastDiff - charDelta);
                     } else if (charDelta > 0) {
                         // inserted chars, add the offset in the output stream
                         int lastDiff = getLastCumulativeDiff();

+ 14 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java

@@ -94,6 +94,20 @@ public class DebertaV2TokenizerTests extends ESTestCase {
         }
     }
 
+    public void testTokenizeWithHiddenControlCharacters() throws IOException {
+        try (
+            DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(
+                TEST_CASE_VOCAB,
+                TEST_CASE_SCORES,
+                new DebertaV2Tokenization(false, false, null, Tokenization.Truncate.NONE, -1)
+            ).build()
+        ) {
+            TokenizationResult.Tokens tokenization = tokenizer.tokenize("\u009F\u008Fz", Tokenization.Truncate.NONE, -1, 0, null).get(0);
+            assertThat(tokenStrings(tokenization.tokens().get(0)), contains("▁", "z"));
+
+        }
+    }
+
     public void testSurrogatePair() throws IOException {
         try (
             DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(