Explorar o código

[ML] fix minor tokenization bug when using fill_mask tasks with roberta tokenizer (#88825)

Depending on where your <mask> resides when using byte-pair encoding (e.g. roberta), it could be split out erroneously.

This commit fixes that bug.

NOTE, this bug existed before: #88737
Benjamin Trent hai 3 anos
pai
achega
d2fe21dc81

+ 5 - 0
docs/changelog/88825.yaml

@@ -0,0 +1,5 @@
+pr: 88825
+summary: fix minor tokenization bug when using fill_mask task with roberta tokenizer
+area: Machine Learning
+type: bug
+issues: []

+ 5 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/TokenizerUtils.java

@@ -39,7 +39,11 @@ final class TokenizerUtils {
                 if (neverSplitSet.contains(maybeNeverSplit)) {
                     if (windowStart < neverSplitStart) {
                         bigTokens.add(
-                            new DelimitedToken(new CharSequenceRef(input, windowStart, neverSplitStart), windowStart, neverSplitStart)
+                            new DelimitedToken(
+                                new CharSequenceRef(input, windowStart, neverSplitStart - windowStart),
+                                windowStart,
+                                neverSplitStart
+                            )
                         );
                     }
                     bigTokens.add(new DelimitedToken(maybeNeverSplit, neverSplitStart, i + 1));

+ 5 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BpeTokenizerTests.java

@@ -375,6 +375,11 @@ public class BpeTokenizerTests extends BaseTokenStreamTestCase {
         assertAnalyzesToNoCharFilter(analyzer, "Elasticsearch <<mask>.", new String[] { "Elast", "icsearch", "Ġ", "<", "<mask>", "." });
         assertAnalyzesToNoCharFilter(analyzer, "Elasticsearch < red", new String[] { "Elast", "icsearch", "Ġ", "<", "Ġred" });
         assertAnalyzesToNoCharFilter(analyzer, "Elasticsearch <mask>.", new String[] { "Elast", "icsearch", "<mask>", "." });
+        assertAnalyzesToNoCharFilter(
+            analyzer,
+            "Elasticsearch<mask><mask>~redElasticsearch",
+            new String[] { "Elast", "icsearch", "<mask>", "<mask>", "~", "red", "Elast", "icsearch" }
+        );
         assertAnalyzesToNoCharFilter(
             analyzer,
             "Elasticsearch red~<mask>.",