Browse Source

Enable Mapped Field Types to Override Default Highlighter (#121176) (#121237)

This commit introduces `MappedFieldType#getDefaultHighlighter`, allowing a specific highlighter to be enforced for a field.
The semantic field mapper utilizes this new functionality to set the `semantic` highlighter as the default.
All other fields will continue to use the `unified` highlighter by default.
Jim Ferenczi 8 months ago
parent
commit
0db2f0a027

+ 29 - 3
docs/reference/mapping/types/semantic-text.asciidoc

@@ -133,14 +133,13 @@ You can extract the most relevant fragments from a semantic text field by using
 POST test-index/_search
 POST test-index/_search
 {
 {
     "query": {
     "query": {
-        "semantic": {
-            "field": "my_semantic_field"
+        "match": {
+            "my_semantic_field": "Which country is Paris in?"
         }
         }
     },
     },
     "highlight": {
     "highlight": {
         "fields": {
         "fields": {
             "my_semantic_field": {
             "my_semantic_field": {
-                "type": "semantic",
                 "number_of_fragments": 2,  <1>
                 "number_of_fragments": 2,  <1>
                 "order": "score"           <2>
                 "order": "score"           <2>
             }
             }
@@ -152,6 +151,33 @@ POST test-index/_search
 <1> Specifies the maximum number of fragments to return.
 <1> Specifies the maximum number of fragments to return.
 <2> Sorts highlighted fragments by score when set to `score`. By default, fragments will be output in the order they appear in the field (order: none).
 <2> Sorts highlighted fragments by score when set to `score`. By default, fragments will be output in the order they appear in the field (order: none).
 
 
+Highlighting is supported on fields other than `semantic_text`.
+However, if you want to restrict highlighting to the semantic highlighter and return no fragments when the field is not of type `semantic_text`,
+you can explicitly enforce the `semantic` highlighter in the query:
+
+[source,console]
+------------------------------------------------------------
+POST test-index/_search
+{
+    "query": {
+        "match": {
+            "my_field": "Which country is Paris in?"
+        }
+    },
+    "highlight": {
+        "fields": {
+            "my_field": {
+                "type": "semantic",         <1>
+                "number_of_fragments": 2,
+                "order": "score"
+            }
+        }
+    }
+}
+------------------------------------------------------------
+// TEST[skip:Requires inference endpoint]
+<1> Ensures that highlighting is applied exclusively to `semantic_text` fields.
+
 [discrete]
 [discrete]
 [[custom-indexing]]
 [[custom-indexing]]
 ==== Customizing `semantic_text` indexing
 ==== Customizing `semantic_text` indexing

+ 15 - 3
docs/reference/search/search-your-data/highlighting.asciidoc

@@ -37,8 +37,8 @@ GET /_search
 // TEST[setup:my_index]
 // TEST[setup:my_index]
 
 
 {es} supports three highlighters: `unified`, `plain`, and `fvh` (fast vector
 {es} supports three highlighters: `unified`, `plain`, and `fvh` (fast vector
-highlighter). You can specify the highlighter `type` you want to use
-for each field.
+highlighter) for `text` and `keyword` fields and the `semantic` highlighter for `semantic_text` fields.
+You can specify the highlighter `type` you want to use for each field or rely on the field type's default highlighter.
 
 
 [discrete]
 [discrete]
 [[unified-highlighter]]
 [[unified-highlighter]]
@@ -48,7 +48,19 @@ highlighter breaks the text into sentences and uses the BM25 algorithm to score
 individual sentences as if they were documents in the corpus. It also supports
 individual sentences as if they were documents in the corpus. It also supports
 accurate phrase and multi-term (fuzzy, prefix, regex) highlighting. The `unified`
 accurate phrase and multi-term (fuzzy, prefix, regex) highlighting. The `unified`
 highlighter can combine matches from multiple fields into one result (see
 highlighter can combine matches from multiple fields into one result (see
-`matched_fields`). This is the default highlighter.
+`matched_fields`).
+
+This is the default highlighter for all `text` and `keyword` fields.
+
+[discrete]
+[[semantic-highlighter]]
+==== Semantic Highlighter
+
+The `semantic` highlighter is specifically designed for use with the <<semantic-text, `semantic_text`>> field.
+It identifies and extracts the most relevant fragments from the field based on semantic
+similarity between the query and each fragment.
+
+By default, <<semantic-text, `semantic_text`>> fields use the semantic highlighter.
 
 
 [discrete]
 [discrete]
 [[plain-highlighter]]
 [[plain-highlighter]]

+ 8 - 0
server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java

@@ -45,6 +45,7 @@ import org.elasticsearch.index.query.QueryShardException;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.fetch.subphase.FetchFieldsPhase;
 import org.elasticsearch.search.fetch.subphase.FetchFieldsPhase;
+import org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter;
 import org.elasticsearch.search.lookup.SearchLookup;
 import org.elasticsearch.search.lookup.SearchLookup;
 
 
 import java.io.IOException;
 import java.io.IOException;
@@ -221,6 +222,13 @@ public abstract class MappedFieldType {
         return null;
         return null;
     }
     }
 
 
+    /**
+     * Returns the default highlighter type to use when highlighting the field.
+     */
+    public String getDefaultHighlighter() {
+        return DefaultHighlighter.NAME;
+    }
+
     /** Generates a query that will only match documents that contain the given value.
     /** Generates a query that will only match documents that contain the given value.
      *  The default implementation returns a {@link TermQuery} over the value bytes
      *  The default implementation returns a {@link TermQuery} over the value bytes
      *  @throws IllegalArgumentException if {@code value} cannot be converted to the expected data type or if the field is not searchable
      *  @throws IllegalArgumentException if {@code value} cannot be converted to the expected data type or if the field is not searchable

+ 1 - 1
server/src/main/java/org/elasticsearch/search/SearchModule.java

@@ -936,7 +936,7 @@ public class SearchModule {
         NamedRegistry<Highlighter> highlighters = new NamedRegistry<>("highlighter");
         NamedRegistry<Highlighter> highlighters = new NamedRegistry<>("highlighter");
         highlighters.register("fvh", new FastVectorHighlighter(settings));
         highlighters.register("fvh", new FastVectorHighlighter(settings));
         highlighters.register("plain", new PlainHighlighter());
         highlighters.register("plain", new PlainHighlighter());
-        highlighters.register("unified", new DefaultHighlighter());
+        highlighters.register(DefaultHighlighter.NAME, new DefaultHighlighter());
         highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);
         highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);
 
 
         return unmodifiableMap(highlighters.getRegistry());
         return unmodifiableMap(highlighters.getRegistry());

+ 1 - 1
server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java

@@ -49,7 +49,7 @@ import java.util.function.Predicate;
 import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 
 
 public class DefaultHighlighter implements Highlighter {
 public class DefaultHighlighter implements Highlighter {
-
+    public static final String NAME = "unified";
     public static final NodeFeature UNIFIED_HIGHLIGHTER_MATCHED_FIELDS = new NodeFeature("unified_highlighter_matched_fields", true);
     public static final NodeFeature UNIFIED_HIGHLIGHTER_MATCHED_FIELDS = new NodeFeature("unified_highlighter_matched_fields", true);
 
 
     @Override
     @Override

+ 4 - 5
server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java

@@ -66,7 +66,7 @@ public class HighlightPhase implements FetchSubPhase {
                 Map<String, Function<HitContext, FieldHighlightContext>> contextBuilders = fieldContext.builders;
                 Map<String, Function<HitContext, FieldHighlightContext>> contextBuilders = fieldContext.builders;
                 for (String field : contextBuilders.keySet()) {
                 for (String field : contextBuilders.keySet()) {
                     FieldHighlightContext fieldContext = contextBuilders.get(field).apply(hitContext);
                     FieldHighlightContext fieldContext = contextBuilders.get(field).apply(hitContext);
-                    Highlighter highlighter = getHighlighter(fieldContext.field);
+                    Highlighter highlighter = getHighlighter(fieldContext.field, fieldContext.fieldType);
                     HighlightField highlightField = highlighter.highlight(fieldContext);
                     HighlightField highlightField = highlighter.highlight(fieldContext);
                     if (highlightField != null) {
                     if (highlightField != null) {
                         // Note that we make sure to use the original field name in the response. This is because the
                         // Note that we make sure to use the original field name in the response. This is because the
@@ -80,10 +80,10 @@ public class HighlightPhase implements FetchSubPhase {
         };
         };
     }
     }
 
 
-    private Highlighter getHighlighter(SearchHighlightContext.Field field) {
+    private Highlighter getHighlighter(SearchHighlightContext.Field field, MappedFieldType fieldType) {
         String highlighterType = field.fieldOptions().highlighterType();
         String highlighterType = field.fieldOptions().highlighterType();
         if (highlighterType == null) {
         if (highlighterType == null) {
-            highlighterType = "unified";
+            highlighterType = fieldType.getDefaultHighlighter();
         }
         }
         Highlighter highlighter = highlighters.get(highlighterType);
         Highlighter highlighter = highlighters.get(highlighterType);
         if (highlighter == null) {
         if (highlighter == null) {
@@ -103,8 +103,6 @@ public class HighlightPhase implements FetchSubPhase {
         Map<String, Function<HitContext, FieldHighlightContext>> builders = new LinkedHashMap<>();
         Map<String, Function<HitContext, FieldHighlightContext>> builders = new LinkedHashMap<>();
         StoredFieldsSpec storedFieldsSpec = StoredFieldsSpec.NO_REQUIREMENTS;
         StoredFieldsSpec storedFieldsSpec = StoredFieldsSpec.NO_REQUIREMENTS;
         for (SearchHighlightContext.Field field : highlightContext.fields()) {
         for (SearchHighlightContext.Field field : highlightContext.fields()) {
-            Highlighter highlighter = getHighlighter(field);
-
             Collection<String> fieldNamesToHighlight = context.getSearchExecutionContext().getMatchingFieldNames(field.field());
             Collection<String> fieldNamesToHighlight = context.getSearchExecutionContext().getMatchingFieldNames(field.field());
 
 
             boolean fieldNameContainsWildcards = field.field().contains("*");
             boolean fieldNameContainsWildcards = field.field().contains("*");
@@ -112,6 +110,7 @@ public class HighlightPhase implements FetchSubPhase {
             boolean sourceRequired = false;
             boolean sourceRequired = false;
             for (String fieldName : fieldNamesToHighlight) {
             for (String fieldName : fieldNamesToHighlight) {
                 MappedFieldType fieldType = context.getSearchExecutionContext().getFieldType(fieldName);
                 MappedFieldType fieldType = context.getSearchExecutionContext().getFieldType(fieldName);
+                Highlighter highlighter = getHighlighter(field, fieldType);
 
 
                 // We should prevent highlighting if a field is anything but a text, match_only_text,
                 // We should prevent highlighting if a field is anything but a text, match_only_text,
                 // or keyword field.
                 // or keyword field.

+ 3 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

@@ -37,6 +37,7 @@ public class InferenceFeatures implements FeatureSpecification {
     }
     }
 
 
     private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
     private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
+    private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT = new NodeFeature("semantic_text.highlighter.default");
 
 
     @Override
     @Override
     public Set<NodeFeature> getTestFeatures() {
     public Set<NodeFeature> getTestFeatures() {
@@ -52,7 +53,8 @@ public class InferenceFeatures implements FeatureSpecification {
             SemanticInferenceMetadataFieldsMapper.EXPLICIT_NULL_FIXES,
             SemanticInferenceMetadataFieldsMapper.EXPLICIT_NULL_FIXES,
             SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
             SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
             TextSimilarityRankRetrieverBuilder.TEXT_SIMILARITY_RERANKER_ALIAS_HANDLING_FIX,
             TextSimilarityRankRetrieverBuilder.TEXT_SIMILARITY_RERANKER_ALIAS_HANDLING_FIX,
-            SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT
+            SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT,
+            SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT
         );
         );
     }
     }
 }
 }

+ 6 - 0
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

@@ -73,6 +73,7 @@ import org.elasticsearch.xcontent.XContentType;
 import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults;
 import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults;
 import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults;
 import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults;
 import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder;
 import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder;
+import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter;
 
 
 import java.io.IOException;
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.io.UncheckedIOException;
@@ -582,6 +583,11 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
             return TextFieldMapper.CONTENT_TYPE;
             return TextFieldMapper.CONTENT_TYPE;
         }
         }
 
 
+        @Override
+        public String getDefaultHighlighter() {
+            return SemanticTextHighlighter.NAME;
+        }
+
         public String getInferenceId() {
         public String getInferenceId() {
             return inferenceId;
             return inferenceId;
         }
         }

+ 60 - 13
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

@@ -55,22 +55,32 @@ setup:
             index.mapping.semantic_text.use_legacy_format: false
             index.mapping.semantic_text.use_legacy_format: false
           mappings:
           mappings:
             properties:
             properties:
+              title:
+                type: text
               body:
               body:
                 type: semantic_text
                 type: semantic_text
                 inference_id: dense-inference-id
                 inference_id: dense-inference-id
 
 
----
-"Highlighting using a sparse embedding model":
   - do:
   - do:
       index:
       index:
         index: test-sparse-index
         index: test-sparse-index
         id: doc_1
         id: doc_1
         body:
         body:
+          title: "Elasticsearch"
           body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
           body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
         refresh: true
         refresh: true
 
 
-  - match: { result: created }
+  - do:
+      index:
+        index: test-dense-index
+        id: doc_1
+        body:
+          title: "Elasticsearch"
+          body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
+        refresh: true
 
 
+---
+"Highlighting using a sparse embedding model":
   - do:
   - do:
       search:
       search:
         index: test-sparse-index
         index: test-sparse-index
@@ -153,16 +163,6 @@ setup:
 
 
 ---
 ---
 "Highlighting using a dense embedding model":
 "Highlighting using a dense embedding model":
-  - do:
-      index:
-        index: test-dense-index
-        id: doc_1
-        body:
-          body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
-        refresh: true
-
-  - match: { result: created }
-
   - do:
   - do:
       search:
       search:
         index: test-dense-index
         index: test-dense-index
@@ -243,4 +243,51 @@ setup:
   - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
   - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
   - match:  { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
   - match:  { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
 
 
+---
+"Default highlighter for fields":
+  - requires:
+        cluster_features: "semantic_text.highlighter.default"
+        reason: semantic text field defaults to the semantic highlighter
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            match:
+              body: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                order: "score"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
+  - match:  { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+
+---
+"semantic highlighter ignores non-inference fields":
+  - requires:
+      cluster_features: "semantic_text.highlighter.default"
+      reason: semantic text field defaults to the semantic highlighter
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            match:
+              title: "Elasticsearch"
+          highlight:
+            fields:
+              title:
+                type: semantic
+                number_of_fragments: 2
+
+  - match:      { hits.total.value: 1 }
+  - match:      { hits.hits.0._id: "doc_1" }
+  - not_exists: hits.hits.0.highlight.title