1
0
Эх сурвалжийг харах

[ML] Fix out of bounds error chunking an empty string (#110033)

An empty string tokenises to 0 tokens (excluding the special marker tokens), this 
case must be handled when restoring the chunked inputs
David Kyle 1 жил өмнө
parent
commit
e63f0c535d

+ 12 - 4
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/TextEmbeddingProcessor.java

@@ -62,10 +62,18 @@ public class TextEmbeddingProcessor extends NlpTask.Processor {
         if (chunkResults) {
         if (chunkResults) {
             var embeddings = new ArrayList<MlChunkedTextEmbeddingFloatResults.EmbeddingChunk>();
             var embeddings = new ArrayList<MlChunkedTextEmbeddingFloatResults.EmbeddingChunk>();
             for (int i = 0; i < pyTorchResult.getInferenceResult()[0].length; i++) {
             for (int i = 0; i < pyTorchResult.getInferenceResult()[0].length; i++) {
-                int startOffset = tokenization.getTokenization(i).tokens().get(0).get(0).startOffset();
-                int lastIndex = tokenization.getTokenization(i).tokens().get(0).size() - 1;
-                int endOffset = tokenization.getTokenization(i).tokens().get(0).get(lastIndex).endOffset();
-                String matchedText = tokenization.getTokenization(i).input().get(0).substring(startOffset, endOffset);
+                String matchedText;
+                if (tokenization.getTokenization(i).tokens().get(0).isEmpty() == false) {
+                    int startOffset = tokenization.getTokenization(i).tokens().get(0).get(0).startOffset();
+                    int lastIndex = tokenization.getTokenization(i).tokens().get(0).size() - 1;
+                    int endOffset = tokenization.getTokenization(i).tokens().get(0).get(lastIndex).endOffset();
+                    matchedText = tokenization.getTokenization(i).input().get(0).substring(startOffset, endOffset);
+
+                } else {
+                    // No tokens in the input, this should only happen with an empty string
+                    assert tokenization.getTokenization(i).input().get(0).isEmpty();
+                    matchedText = "";
+                }
 
 
                 embeddings.add(
                 embeddings.add(
                     new MlChunkedTextEmbeddingFloatResults.EmbeddingChunk(matchedText, pyTorchResult.getInferenceResult()[0][i])
                     new MlChunkedTextEmbeddingFloatResults.EmbeddingChunk(matchedText, pyTorchResult.getInferenceResult()[0][i])

+ 11 - 4
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/TextExpansionProcessor.java

@@ -75,10 +75,17 @@ public class TextExpansionProcessor extends NlpTask.Processor {
             var chunkedResults = new ArrayList<MlChunkedTextExpansionResults.ChunkedResult>();
             var chunkedResults = new ArrayList<MlChunkedTextExpansionResults.ChunkedResult>();
 
 
             for (int i = 0; i < pyTorchResult.getInferenceResult()[0].length; i++) {
             for (int i = 0; i < pyTorchResult.getInferenceResult()[0].length; i++) {
-                int startOffset = tokenization.getTokenization(i).tokens().get(0).get(0).startOffset();
-                int lastIndex = tokenization.getTokenization(i).tokens().get(0).size() - 1;
-                int endOffset = tokenization.getTokenization(i).tokens().get(0).get(lastIndex).endOffset();
-                String matchedText = tokenization.getTokenization(i).input().get(0).substring(startOffset, endOffset);
+                String matchedText;
+                if (tokenization.getTokenization(i).tokens().get(0).isEmpty() == false) {
+                    int startOffset = tokenization.getTokenization(i).tokens().get(0).get(0).startOffset();
+                    int lastIndex = tokenization.getTokenization(i).tokens().get(0).size() - 1;
+                    int endOffset = tokenization.getTokenization(i).tokens().get(0).get(lastIndex).endOffset();
+                    matchedText = tokenization.getTokenization(i).input().get(0).substring(startOffset, endOffset);
+                } else {
+                // No tokens in the input, this should only happen with an empty string
+                    assert tokenization.getTokenization(i).input().get(0).isEmpty();
+                    matchedText = "";
+                }
 
 
                 var weightedTokens = sparseVectorToTokenWeights(pyTorchResult.getInferenceResult()[0][i], tokenization, replacementVocab);
                 var weightedTokens = sparseVectorToTokenWeights(pyTorchResult.getInferenceResult()[0][i], tokenization, replacementVocab);
                 weightedTokens.sort((t1, t2) -> Float.compare(t2.weight(), t1.weight()));
                 weightedTokens.sort((t1, t2) -> Float.compare(t2.weight(), t1.weight()));

+ 27 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextEmbeddingProcessorTests.java

@@ -9,6 +9,7 @@ package org.elasticsearch.xpack.ml.inference.nlp;
 
 
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.xpack.core.ml.inference.results.MlChunkedTextEmbeddingFloatResults;
 import org.elasticsearch.xpack.core.ml.inference.results.MlChunkedTextEmbeddingFloatResults;
+import org.elasticsearch.xpack.core.ml.inference.results.MlChunkedTextExpansionResults;
 import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults;
 import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
@@ -16,9 +17,13 @@ import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizationResul
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizer;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizer;
 import org.elasticsearch.xpack.ml.inference.pytorch.results.PyTorchInferenceResult;
 import org.elasticsearch.xpack.ml.inference.pytorch.results.PyTorchInferenceResult;
 
 
+import java.util.Map;
+
+import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.core.IsNot.not;
 
 
 public class TextEmbeddingProcessorTests extends ESTestCase {
 public class TextEmbeddingProcessorTests extends ESTestCase {
 
 
@@ -67,4 +72,26 @@ public class TextEmbeddingProcessorTests extends ESTestCase {
             assertThat(chunkedResult.getChunks().get(1).embedding().length, greaterThan(0));
             assertThat(chunkedResult.getChunks().get(1).embedding().length, greaterThan(0));
         }
         }
     }
     }
+
+    // Verifies the empty-string chunking fix in TextEmbeddingProcessor: an empty
+    // input tokenises to 0 tokens and must still produce a single empty chunk.
+    // (The originally committed test invoked TextExpansionProcessor here — a
+    // copy-paste from TextExpansionProcessorTests — so it never exercised the
+    // embedding processor's fix.)
+    public void testChunkingWithEmptyString() {
+        try (
+            BertTokenizer tokenizer = BertTokenizer.builder(
+                TextExpansionProcessorTests.TEST_CASED_VOCAB,
+                new BertTokenization(null, false, 5, Tokenization.Truncate.NONE, 0)
+            ).build()
+        ) {
+            var pytorchResult = new PyTorchInferenceResult(new double[][][] { { { 1.0, 2.0, 3.0, 4.0, 5.0 } } });
+
+            var input = "";
+            var tokenization = tokenizer.tokenize(input, Tokenization.Truncate.NONE, 0, 0, null);
+            var tokenizationResult = new BertTokenizationResult(TextExpansionProcessorTests.TEST_CASED_VOCAB, tokenization, 0);
+            // NOTE(review): TextEmbeddingProcessor.processResult is assumed to take no
+            // replacement-vocab map (unlike TextExpansionProcessor) — confirm signature.
+            var inferenceResult = TextEmbeddingProcessor.processResult(tokenizationResult, pytorchResult, "foo", true);
+            assertThat(inferenceResult, instanceOf(MlChunkedTextEmbeddingFloatResults.class));
+
+            var chunkedResult = (MlChunkedTextEmbeddingFloatResults) inferenceResult;
+            assertThat(chunkedResult.getChunks(), hasSize(1));
+            assertEquals("", chunkedResult.getChunks().get(0).matchedText());
+            assertThat(chunkedResult.getChunks().get(0).embedding().length, greaterThan(0));
+        }
+    }
 }
 }

+ 22 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextExpansionProcessorTests.java

@@ -147,4 +147,26 @@ public class TextExpansionProcessorTests extends ESTestCase {
             assertThat(chunkedResult.getChunks().get(1).weightedTokens(), not(empty()));
             assertThat(chunkedResult.getChunks().get(1).weightedTokens(), not(empty()));
         }
         }
     }
     }
+
+    // An empty input tokenises to zero (non-special) tokens; chunked text
+    // expansion must still yield exactly one chunk with empty matched text
+    // instead of throwing an out-of-bounds error.
+    public void testChunkingWithEmptyString() {
+        try (
+            BertTokenizer bertTokenizer = BertTokenizer.builder(
+                TEST_CASED_VOCAB,
+                new BertTokenization(null, false, 5, Tokenization.Truncate.NONE, 0)
+            ).build()
+        ) {
+            var inference = new PyTorchInferenceResult(new double[][][] { { { 1.0, 2.0, 3.0, 4.0, 5.0 } } });
+
+            var emptyInput = "";
+            var tokens = bertTokenizer.tokenize(emptyInput, Tokenization.Truncate.NONE, 0, 0, null);
+            var tokenResult = new BertTokenizationResult(TEST_CASED_VOCAB, tokens, 0);
+            var result = TextExpansionProcessor.processResult(tokenResult, inference, Map.of(), "foo", true);
+            assertThat(result, instanceOf(MlChunkedTextExpansionResults.class));
+
+            var chunked = (MlChunkedTextExpansionResults) result;
+            assertThat(chunked.getChunks(), hasSize(1));
+            assertEquals("", chunked.getChunks().get(0).matchedText());
+            assertThat(chunked.getChunks().get(0).weightedTokens(), not(empty()));
+        }
+    }
 }
 }