Pārlūkot izejas kodu

Handle empty input inference (#123763)

* Added check for blank string to skip generating embeddings with unit test

* Adding yaml tests for skipping embedding generation

* dynamic update not required if model_settings stays null

* Updating node feature for handling empty input name and description

* Update yaml tests with refresh=true

* Update unit test to follow more accurate behavior

* Added yaml tests for multi chunks

* [CI] Auto commit changes from spotless

* Adding highlighter yaml tests for empty input

* Update docs/changelog/123763.yaml

* Update changelog and test reason to have more polished documentation

* adding input value into the response source and fixing unit tests by reformatting

* Adding highlighter test for backward compatibility and refactoring existing test

* Added bwc tests for empty input and multi chunks

* Removed reindex for empty input from bwc

* [CI] Auto commit changes from spotless

* Fixing yaml test

* Update unit test helper function to support both formats

* [CI] Auto commit changes from spotless

* Adding cluster features for bwc

* Centralize logic for assertInference helper

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
Samiul Monir 7 mēneši atpakaļ
vecāks
revīzija
f0d5220178

+ 5 - 0
docs/changelog/123763.yaml

@@ -0,0 +1,5 @@
+pr: 123763
+summary: Skip semantic_text embedding generation when no content is provided.
+area: Relevance
+type: enhancement
+issues: []

+ 2 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

@@ -50,7 +50,8 @@ public class InferenceFeatures implements FeatureSpecification {
             SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT,
             SEMANTIC_KNN_FILTER_FIX,
             TEST_RERANKING_SERVICE_PARSE_TEXT_AS_SCORE,
-            SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT
+            SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT,
+            SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT
         );
     }
 }

+ 8 - 2
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilter.java

@@ -563,7 +563,7 @@ public class ShardBulkInferenceActionFilter implements MappedActionFilter {
                             }
                             continue;
                         }
-                        ensureResponseAccumulatorSlot(itemIndex);
+                        var slot = ensureResponseAccumulatorSlot(itemIndex);
                         final List<String> values;
                         try {
                             values = SemanticTextUtils.nodeStringValues(field, valueObj);
@@ -580,7 +580,13 @@ public class ShardBulkInferenceActionFilter implements MappedActionFilter {
                         List<FieldInferenceRequest> fieldRequests = fieldRequestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
                         int offsetAdjustment = 0;
                         for (String v : values) {
-                            fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+                            if (v.isBlank()) {
+                                slot.addOrUpdateResponse(
+                                    new FieldInferenceResponse(field, sourceField, v, order++, 0, null, EMPTY_CHUNKED_INFERENCE)
+                                );
+                            } else {
+                                fieldRequests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+                            }
 
                             // When using the inference metadata fields format, all the input values are concatenated so that the
                             // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment

+ 2 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

@@ -117,6 +117,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
     public static final NodeFeature SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX = new NodeFeature(
         "semantic_text.always_emit_inference_id_fix"
     );
+    public static final NodeFeature SEMANTIC_TEXT_HANDLE_EMPTY_INPUT = new NodeFeature("semantic_text.handle_empty_input");
     public static final NodeFeature SEMANTIC_TEXT_SKIP_INFERENCE_FIELDS = new NodeFeature("semantic_text.skip_inference_fields");
     public static final NodeFeature SEMANTIC_TEXT_BIT_VECTOR_SUPPORT = new NodeFeature("semantic_text.bit_vector_support");
 
@@ -403,7 +404,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
         }
 
         final SemanticTextFieldMapper mapper;
-        if (fieldType().getModelSettings() == null) {
+        if (fieldType().getModelSettings() == null && field.inference().modelSettings() != null) {
             mapper = addDynamicUpdate(context, field);
         } else {
             Conflicts conflicts = new Conflicts(fullFieldName);

+ 62 - 8
x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterTests.java

@@ -335,7 +335,7 @@ public class ShardBulkInferenceActionFilterTests extends ESTestCase {
                 // item 3
                 assertNull(bulkShardRequest.items()[3].getPrimaryResponse());
                 actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[3].request());
-                assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, 0);
+                assertInferenceResults(useLegacyFormat, actualRequest, "obj.field1", EXPLICIT_NULL, null);
 
                 // item 4
                 assertNull(bulkShardRequest.items()[4].getPrimaryResponse());
@@ -368,6 +368,59 @@ public class ShardBulkInferenceActionFilterTests extends ESTestCase {
         awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
     }
 
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    public void testHandleEmptyInput() throws Exception {
+        StaticModel model = StaticModel.createRandomInstance();
+        ShardBulkInferenceActionFilter filter = createFilter(
+            threadPool,
+            Map.of(model.getInferenceEntityId(), model),
+            randomIntBetween(1, 10),
+            useLegacyFormat,
+            true
+        );
+
+        CountDownLatch chainExecuted = new CountDownLatch(1);
+        ActionFilterChain actionFilterChain = (task, action, request, listener) -> {
+            try {
+                BulkShardRequest bulkShardRequest = (BulkShardRequest) request;
+                assertNull(bulkShardRequest.getInferenceFieldMap());
+                assertThat(bulkShardRequest.items().length, equalTo(3));
+
+                // Create with Empty string
+                assertNull(bulkShardRequest.items()[0].getPrimaryResponse());
+                IndexRequest actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[0].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", "", 0);
+
+                // Create with whitespace only
+                assertNull(bulkShardRequest.items()[1].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[1].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", " ", 0);
+
+                // Update with multiple Whitespaces
+                assertNull(bulkShardRequest.items()[2].getPrimaryResponse());
+                actualRequest = getIndexRequestOrNull(bulkShardRequest.items()[2].request());
+                assertInferenceResults(useLegacyFormat, actualRequest, "semantic_text_field", "  ", 0);
+            } finally {
+                chainExecuted.countDown();
+            }
+        };
+        ActionListener actionListener = mock(ActionListener.class);
+        Task task = mock(Task.class);
+        Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
+            "semantic_text_field",
+            new InferenceFieldMetadata("semantic_text_field", model.getInferenceEntityId(), new String[] { "semantic_text_field" })
+        );
+
+        BulkItemRequest[] items = new BulkItemRequest[3];
+        items[0] = new BulkItemRequest(0, new IndexRequest("index").source(Map.of("semantic_text_field", "")));
+        items[1] = new BulkItemRequest(1, new IndexRequest("index").source(Map.of("semantic_text_field", " ")));
+        items[2] = new BulkItemRequest(2, new UpdateRequest().doc(new IndexRequest("index").source(Map.of("semantic_text_field", "  "))));
+        BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
+        request.setInferenceFieldMap(inferenceFieldMap);
+        filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
+        awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);
+    }
+
     @SuppressWarnings({ "unchecked", "rawtypes" })
     public void testManyRandomDocs() throws Exception {
         Map<String, StaticModel> inferenceModelMap = new HashMap<>();
@@ -591,7 +644,7 @@ public class ShardBulkInferenceActionFilterTests extends ESTestCase {
         IndexRequest request,
         String fieldName,
         Object expectedOriginalValue,
-        int expectedChunkCount
+        Integer expectedChunkCount
     ) {
         final Map<String, Object> requestMap = request.sourceAsMap();
         if (useLegacyFormat) {
@@ -601,13 +654,11 @@ public class ShardBulkInferenceActionFilterTests extends ESTestCase {
             );
 
             List<Object> chunks = (List<Object>) XContentMapValues.extractValue(getChunksFieldName(fieldName), requestMap);
-            if (expectedChunkCount > 0) {
+            if (expectedChunkCount == null) {
+                assertNull(chunks);
+            } else {
                 assertNotNull(chunks);
                 assertThat(chunks.size(), equalTo(expectedChunkCount));
-            } else {
-                // If the expected chunk count is 0, we expect that no inference has been performed. In this case, the source should not be
-                // transformed, and thus the semantic text field structure should not be created.
-                assertNull(chunks);
             }
         } else {
             assertThat(XContentMapValues.extractValue(fieldName, requestMap, EXPLICIT_NULL), equalTo(expectedOriginalValue));
@@ -627,8 +678,11 @@ public class ShardBulkInferenceActionFilterTests extends ESTestCase {
                 inferenceMetadataFields,
                 EXPLICIT_NULL
             );
+
+            // When using the new format, the chunks field should always exist
+            int expectedSize = expectedChunkCount == null ? 0 : expectedChunkCount;
             assertNotNull(chunks);
-            assertThat(chunks.size(), equalTo(expectedChunkCount));
+            assertThat(chunks.size(), equalTo(expectedSize));
         }
     }
 

+ 171 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference.yml

@@ -1005,3 +1005,174 @@ setup:
   - match: { hits.hits.0._source.dense_field: "another inference test" }
   - match: { hits.hits.0._source.non_inference_field: "non inference test" }
   - exists: hits.hits.0._source._inference_fields
+
+---
+"Empty semantic_text field skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ""
+        refresh: true
+
+  - do:
+      index:
+        index: test-index
+        id: doc_2
+        body:
+          sparse_field: "   "
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._source.sparse_field: "" }
+  - match: { hits.hits.1._source.sparse_field: "   " }
+  - not_exists: hits.hits.0._source._inference_fields
+  - not_exists: hits.hits.1._source._inference_fields
+
+---
+"Reindexing with empty or whitespace semantic_text skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: "  "
+        refresh: true
+
+  - do:
+      indices.create:
+        index: destination-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              sparse_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      reindex:
+        wait_for_completion: true
+        body:
+          source:
+            index: test-index
+          dest:
+            index: destination-index
+        refresh: true
+
+  - do:
+      get:
+        index: destination-index
+        id: doc_1
+
+  - match: { _source.sparse_field: "  " }
+
+  - do:
+      search:
+        index: destination-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - not_exists: hits.hits.0._source._inference_fields
+
+---
+"Empty Multi-Field skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      indices.create:
+        index: test-multi-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                fields:
+                  sparse:
+                    type: semantic_text
+                    inference_id: sparse-inference-id
+
+  - do:
+      bulk:
+        index: test-multi-index
+        refresh: true
+        body: |
+          {"index":{"_id": "1"}}
+          {"field": ["you know, for testing", "now with chunks"]}
+          {"index":{"_id": "2"}}
+          {"field": ["", "  "]}
+
+  - do:
+      search:
+        index: test-multi-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - exists: hits.hits.0._source._inference_fields
+  - not_exists: hits.hits.1._source._inference_fields
+
+---
+"Multi chunks skips empty input embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ["some test data", "    ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          fields: [ _inference_fields ]
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks: 1 }
+  - length: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field: 2 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.start_offset: 0 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.0.end_offset: 14 }
+  - exists: hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.embeddings
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.start_offset: 20 }
+  - match: { hits.hits.0._source._inference_fields.sparse_field.inference.chunks.sparse_field.1.end_offset: 35 }

+ 64 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/30_semantic_text_inference_bwc.yml

@@ -675,3 +675,67 @@ setup:
 
   - match: { hits.total.value: 1 }
   - not_exists: hits.hits.0._source._inference_fields
+
+---
+"Empty semantic_text field skips embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ""
+        refresh: true
+
+  - do:
+      index:
+        index: test-index
+        id: doc_2
+        body:
+          sparse_field: "  "
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._source.sparse_field.text: "" }
+  - length: { hits.hits.0._source.sparse_field.inference.chunks: 0 }
+  - match: { hits.hits.1._source.sparse_field.text: "  " }
+  - length: { hits.hits.1._source.sparse_field.inference.chunks: 0 }
+
+---
+"Multi chunks skips empty input embedding generation":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-index
+        id: doc_1
+        body:
+          sparse_field: ["some test data", "    ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-index
+        body:
+          query:
+            match_all: { }
+
+  - match: { hits.total.value: 1 }
+
+  - length: { hits.hits.0._source.sparse_field.inference.chunks: 2 }
+  - match: { hits.hits.0._source.sparse_field.inference.chunks.0.text: "some test data" }
+  - exists: hits.hits.0._source.sparse_field.inference.chunks.0.embeddings
+  - match: { hits.hits.0._source.sparse_field.inference.chunks.1.text: "now with chunks" }
+  - exists: hits.hits.0._source.sparse_field.inference.chunks.1.embeddings

+ 45 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

@@ -291,3 +291,48 @@ setup:
   - match:      { hits.hits.0._id: "doc_1" }
   - not_exists: hits.hits.0.highlight.title
 
+---
+"Highlighting and multi chunks with empty input":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
+
+  - do:
+      indices.create:
+        index: test-multi-chunk-index
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: false
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      index:
+        index: test-multi-chunk-index
+        id: doc_1
+        body:
+          semantic_text_field: ["some test data", "    ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-multi-chunk-index
+        body:
+          query:
+            semantic:
+              field: "semantic_text_field"
+              query: "test"
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 3
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

+ 44 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml

@@ -243,4 +243,48 @@ setup:
   - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
   - match:  { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
 
+---
+"Highlighting and multi chunks with empty input":
+  - requires:
+      cluster_features: "semantic_text.handle_empty_input"
+      reason: Skips embedding generation when semantic_text is empty or contains only whitespace, effective from 8.19 and 9.1.0.
 
+  - do:
+      indices.create:
+        index: test-multi-chunk-index
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: true
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      index:
+        index: test-multi-chunk-index
+        id: doc_1
+        body:
+          semantic_text_field: ["some test data", "    ", "now with chunks"]
+        refresh: true
+
+  - do:
+      search:
+        index: test-multi-chunk-index
+        body:
+          query:
+            semantic:
+              field: "semantic_text_field"
+              query: "test"
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 3
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }