Forráskód Böngészése

Semantic_text match_all with Highlighter (#128702)

* Initial implementation for match_all

* reformat

* [CI] Auto commit changes from spotless

* Excluding matchAll interceptor

* Adding matchAllDocs support for vector fields

* [CI] Auto commit changes from spotless

* Remove previous implementation

* Adding yaml tests for match_all

* fixed yaml tests

* Update docs/changelog/128702.yaml

* Update changelog

* changelog - update summary

* Fix wrong inference names for the yaml tests

---------

Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Samiul Monir 4 hónapja
szülő
commit
d1b5532dbf

+ 5 - 0
docs/changelog/128702.yaml

@@ -0,0 +1,5 @@
+pr: 128702
+summary: Fix missing highlighting in `match_all` queries for `semantic_text` fields
+area: Search
+type: bug
+issues: []

+ 3 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

@@ -34,6 +34,7 @@ public class InferenceFeatures implements FeatureSpecification {
     private static final NodeFeature TEST_RULE_RETRIEVER_WITH_INDICES_THAT_DONT_RETURN_RANK_DOCS = new NodeFeature(
         "test_rule_retriever.with_indices_that_dont_return_rank_docs"
     );
+    private static final NodeFeature SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER = new NodeFeature("semantic_text.match_all_highlighter");
 
     @Override
     public Set<NodeFeature> getTestFeatures() {
@@ -57,7 +58,8 @@ public class InferenceFeatures implements FeatureSpecification {
             SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT,
             SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT,
             TEST_RULE_RETRIEVER_WITH_INDICES_THAT_DONT_RETURN_RANK_DOCS,
-            SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG
+            SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG,
+            SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER
         );
     }
 }

+ 10 - 0
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java

@@ -15,6 +15,7 @@ import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.KnnByteVectorQuery;
 import org.apache.lucene.search.KnnFloatVectorQuery;
+import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.ScoreMode;
@@ -267,6 +268,8 @@ public class SemanticTextHighlighter implements Highlighter {
                     queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null));
                 } else if (query instanceof KnnByteVectorQuery knnQuery) {
                     queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null));
+                } else if (query instanceof MatchAllDocsQuery) {
+                    queries.add(new MatchAllDocsQuery());
                 }
             }
         });
@@ -293,6 +296,13 @@ public class SemanticTextHighlighter implements Highlighter {
                 }
                 return this;
             }
+
+            @Override
+            public void visitLeaf(Query query) {
+                if (query instanceof MatchAllDocsQuery) {
+                    queries.add(new MatchAllDocsQuery());
+                }
+            }
         });
         return queries;
     }

+ 130 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

@@ -336,3 +336,133 @@ setup:
   - length: { hits.hits.0.highlight.semantic_text_field: 2 }
   - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
   - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
+
+---
+"Highlighting with match_all query":
+  - requires:
+      cluster_features: "semantic_text.match_all_highlighter"
+      reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match:  { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match:  { hits.hits.0.highlight.body.1: "You Know, for Search!" }
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
+
+---
+"Highlighting with match_all and multi chunks with empty input":
+  - requires:
+      cluster_features: "semantic_text.match_all_highlighter"
+      reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
+
+  - do:
+      indices.create:
+        index: test-index-sparse
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: false
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              text_field:
+                type: text
+
+  - do:
+      index:
+        index: test-index-sparse
+        id: doc_1
+        body:
+          semantic_text_field: [ "some test data", "    ", "now with chunks" ]
+          text_field: "some test data"
+        refresh: true
+
+  - do:
+      search:
+        index: test-index-sparse
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match:  { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match:  { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
+
+  - do:
+      indices.create:
+        index: test-index-dense
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: false
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+              text_field:
+                type: text
+
+  - do:
+      index:
+        index: test-index-dense
+        id: doc_1
+        body:
+          semantic_text_field: [ "some test data", "    ", "now with chunks" ]
+          text_field: "some test data"
+        refresh: true
+
+  - do:
+      search:
+        index: test-index-dense
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

+ 147 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter_bwc.yml

@@ -288,3 +288,150 @@ setup:
   - length: { hits.hits.0.highlight.semantic_text_field: 2 }
   - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
   - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
+
+---
+"Highlighting with match_all query":
+  - requires:
+      cluster_features: "semantic_text.match_all_highlighter"
+      reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
+
+  - do:
+      index:
+        index: test-sparse-index
+        id: doc_1
+        body:
+          body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
+        refresh: true
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match:  { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match:  { hits.hits.0.highlight.body.1: "You Know, for Search!" }
+
+  - do:
+      index:
+        index: test-dense-index
+        id: doc_1
+        body:
+          body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
+        refresh: true
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
+
+---
+"Highlighting with match_all and multi chunks with empty input":
+  - requires:
+      cluster_features: "semantic_text.match_all_highlighter"
+      reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
+
+  - do:
+      indices.create:
+        index: test-index-sparse
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: true
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              text_field:
+                type: text
+
+  - do:
+      index:
+        index: test-index-sparse
+        id: doc_1
+        body:
+          semantic_text_field: [ "some test data", "    ", "now with chunks" ]
+          text_field: "some test data"
+        refresh: true
+
+  - do:
+      search:
+        index: test-index-sparse
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match:  { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match:  { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
+
+  - do:
+      indices.create:
+        index: test-index-dense
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: true
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+              text_field:
+                type: text
+
+  - do:
+      index:
+        index: test-index-dense
+        id: doc_1
+        body:
+          semantic_text_field: [ "some test data", "    ", "now with chunks" ]
+          text_field: "some test data"
+        refresh: true
+
+  - do:
+      search:
+        index: test-index-dense
+        body:
+          query:
+            match_all: {}
+          highlight:
+            fields:
+              semantic_text_field:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.semantic_text_field: 2 }
+  - match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
+