
[ML] Update Deberta tokenizer (#116358) (#117194)

* Was using the byte position for the end offset, but it seems like using the char position is correct

* Update docs/changelog/116358.yaml

* Update UnigramTokenizer.java

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Max Hniebergall 11 months ago
parent
revision
6cb39b17fc

+ 5 - 0
docs/changelog/116358.yaml

@@ -0,0 +1,5 @@
+pr: 116358
+summary: Update Deberta tokenizer
+area: Machine Learning
+type: bug
+issues: []

+ 3 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/UnigramTokenizer.java

@@ -367,8 +367,10 @@ public final class UnigramTokenizer extends Tokenizer {
                         new DelimitedToken.Encoded(
                             Strings.format("<0x%02X>", bytes[i]),
                             pieces[i],
+                            // even though we are changing the number of characters in the output, we don't
+                            // need to change the offsets. The offsets refer to the input characters
                             offsetCorrection.apply(node.startsAtCharPos),
-                            offsetCorrection.apply(startsAtBytes + i)
+                            offsetCorrection.apply(endsAtChars)
                         )
                     );
                 }
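
The distinction this fix relies on is easiest to see with multi-byte input. The sketch below is illustrative only and is not part of the Elasticsearch code base (the class name `ByteVsCharOffsets` and the sample string are made up): it shows how byte positions and char positions diverge once a character needs more than one UTF-8 byte, which is why the end offset should come from the char position rather than the byte position.

```java
import java.nio.charset.StandardCharsets;

public class ByteVsCharOffsets {
    public static void main(String[] args) {
        // "é" is a single Java char but encodes to two bytes in UTF-8,
        // so byte positions and char positions stop agreeing after it.
        String prefix = "hé";
        int endAtChars = prefix.length();                                // 2
        int endAtBytes = prefix.getBytes(StandardCharsets.UTF_8).length; // 3

        System.out.println("char end position = " + endAtChars);
        System.out.println("byte end position = " + endAtBytes);
        // Token offsets refer to positions in the input String (its chars),
        // so the char position is the value to report as the end offset.
    }
}
```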