Browse Source

Enable Mapped Field Types to Override Default Highlighter (#121176) (#121237)

This commit introduces the `MappedFieldType#getDefaultHighlighter`, allowing a specific highlighter to be enforced for a field.
The semantic field mapper utilizes this new functionality to set the `semantic` highlighter as the default.
All other fields will continue to use the `unified` highlighter by default.
Jim Ferenczi 8 months ago
parent
commit
0db2f0a027

+ 29 - 3
docs/reference/mapping/types/semantic-text.asciidoc

@@ -133,14 +133,13 @@ You can extract the most relevant fragments from a semantic text field by using
 POST test-index/_search
 {
     "query": {
-        "semantic": {
-            "field": "my_semantic_field"
+        "match": {
+            "my_semantic_field": "Which country is Paris in?"
         }
     },
     "highlight": {
         "fields": {
             "my_semantic_field": {
-                "type": "semantic",
                 "number_of_fragments": 2,  <1>
                 "order": "score"           <2>
             }
@@ -152,6 +151,33 @@ POST test-index/_search
 <1> Specifies the maximum number of fragments to return.
 <2> Sorts highlighted fragments by score when set to `score`. By default, fragments will be output in the order they appear in the field (order: none).
 
+Highlighting is supported on fields other than `semantic_text`.
+However, if you want to restrict highlighting to the semantic highlighter and return no fragments when the field is not of type `semantic_text`,
+you can explicitly enforce the `semantic` highlighter in the query:
+
+[source,console]
+------------------------------------------------------------
+POST test-index/_search
+{
+    "query": {
+        "match": {
+            "my_field": "Which country is Paris in?"
+        }
+    },
+    "highlight": {
+        "fields": {
+            "my_field": {
+                "type": "semantic",         <1>
+                "number_of_fragments": 2,
+                "order": "score"
+            }
+        }
+    }
+}
+------------------------------------------------------------
+// TEST[skip:Requires inference endpoint]
+<1> Ensures that highlighting is applied exclusively to `semantic_text` fields.
+
 [discrete]
 [[custom-indexing]]
 ==== Customizing `semantic_text` indexing

+ 15 - 3
docs/reference/search/search-your-data/highlighting.asciidoc

@@ -37,8 +37,8 @@ GET /_search
 // TEST[setup:my_index]
 
 {es} supports three highlighters: `unified`, `plain`, and `fvh` (fast vector
-highlighter). You can specify the highlighter `type` you want to use
-for each field.
+highlighter) for `text` and `keyword` fields and the `semantic` highlighter for `semantic_text` fields.
+You can specify the highlighter `type` you want to use for each field or rely on the field type's default highlighter.
 
 [discrete]
 [[unified-highlighter]]
@@ -48,7 +48,19 @@ highlighter breaks the text into sentences and uses the BM25 algorithm to score
 individual sentences as if they were documents in the corpus. It also supports
 accurate phrase and multi-term (fuzzy, prefix, regex) highlighting. The `unified`
 highlighter can combine matches from multiple fields into one result (see
-`matched_fields`). This is the default highlighter.
+`matched_fields`).
+
+This is the default highlighter for all `text` and `keyword` fields.
+
+[discrete]
+[[semantic-highlighter]]
+==== Semantic Highlighter
+
+The `semantic` highlighter is specifically designed for use with the <<semantic-text, `semantic_text`>> field.
+It identifies and extracts the most relevant fragments from the field based on semantic
+similarity between the query and each fragment.
+
+By default, <<semantic-text, `semantic_text`>> fields use the semantic highlighter.
 
 [discrete]
 [[plain-highlighter]]

+ 8 - 0
server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java

@@ -45,6 +45,7 @@ import org.elasticsearch.index.query.QueryShardException;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.fetch.subphase.FetchFieldsPhase;
+import org.elasticsearch.search.fetch.subphase.highlight.DefaultHighlighter;
 import org.elasticsearch.search.lookup.SearchLookup;
 
 import java.io.IOException;
@@ -221,6 +222,13 @@ public abstract class MappedFieldType {
         return null;
     }
 
+    /**
+     * Returns the default highlighter type to use when highlighting the field.
+     */
+    public String getDefaultHighlighter() {
+        return DefaultHighlighter.NAME;
+    }
+
     /** Generates a query that will only match documents that contain the given value.
      *  The default implementation returns a {@link TermQuery} over the value bytes
      *  @throws IllegalArgumentException if {@code value} cannot be converted to the expected data type or if the field is not searchable

+ 1 - 1
server/src/main/java/org/elasticsearch/search/SearchModule.java

@@ -936,7 +936,7 @@ public class SearchModule {
         NamedRegistry<Highlighter> highlighters = new NamedRegistry<>("highlighter");
         highlighters.register("fvh", new FastVectorHighlighter(settings));
         highlighters.register("plain", new PlainHighlighter());
-        highlighters.register("unified", new DefaultHighlighter());
+        highlighters.register(DefaultHighlighter.NAME, new DefaultHighlighter());
         highlighters.extractAndRegister(plugins, SearchPlugin::getHighlighters);
 
         return unmodifiableMap(highlighters.getRegistry());

+ 1 - 1
server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/DefaultHighlighter.java

@@ -49,7 +49,7 @@ import java.util.function.Predicate;
 import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 
 public class DefaultHighlighter implements Highlighter {
-
+    public static final String NAME = "unified";
     public static final NodeFeature UNIFIED_HIGHLIGHTER_MATCHED_FIELDS = new NodeFeature("unified_highlighter_matched_fields", true);
 
     @Override

+ 4 - 5
server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/HighlightPhase.java

@@ -66,7 +66,7 @@ public class HighlightPhase implements FetchSubPhase {
                 Map<String, Function<HitContext, FieldHighlightContext>> contextBuilders = fieldContext.builders;
                 for (String field : contextBuilders.keySet()) {
                     FieldHighlightContext fieldContext = contextBuilders.get(field).apply(hitContext);
-                    Highlighter highlighter = getHighlighter(fieldContext.field);
+                    Highlighter highlighter = getHighlighter(fieldContext.field, fieldContext.fieldType);
                     HighlightField highlightField = highlighter.highlight(fieldContext);
                     if (highlightField != null) {
                         // Note that we make sure to use the original field name in the response. This is because the
@@ -80,10 +80,10 @@ public class HighlightPhase implements FetchSubPhase {
         };
     }
 
-    private Highlighter getHighlighter(SearchHighlightContext.Field field) {
+    private Highlighter getHighlighter(SearchHighlightContext.Field field, MappedFieldType fieldType) {
         String highlighterType = field.fieldOptions().highlighterType();
         if (highlighterType == null) {
-            highlighterType = "unified";
+            highlighterType = fieldType.getDefaultHighlighter();
         }
         Highlighter highlighter = highlighters.get(highlighterType);
         if (highlighter == null) {
@@ -103,8 +103,6 @@ public class HighlightPhase implements FetchSubPhase {
         Map<String, Function<HitContext, FieldHighlightContext>> builders = new LinkedHashMap<>();
         StoredFieldsSpec storedFieldsSpec = StoredFieldsSpec.NO_REQUIREMENTS;
         for (SearchHighlightContext.Field field : highlightContext.fields()) {
-            Highlighter highlighter = getHighlighter(field);
-
             Collection<String> fieldNamesToHighlight = context.getSearchExecutionContext().getMatchingFieldNames(field.field());
 
             boolean fieldNameContainsWildcards = field.field().contains("*");
@@ -112,6 +110,7 @@ public class HighlightPhase implements FetchSubPhase {
             boolean sourceRequired = false;
             for (String fieldName : fieldNamesToHighlight) {
                 MappedFieldType fieldType = context.getSearchExecutionContext().getFieldType(fieldName);
+                Highlighter highlighter = getHighlighter(field, fieldType);
 
                 // We should prevent highlighting if a field is anything but a text, match_only_text,
                 // or keyword field.

+ 3 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

@@ -37,6 +37,7 @@ public class InferenceFeatures implements FeatureSpecification {
     }
 
     private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
+    private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT = new NodeFeature("semantic_text.highlighter.default");
 
     @Override
     public Set<NodeFeature> getTestFeatures() {
@@ -52,7 +53,8 @@ public class InferenceFeatures implements FeatureSpecification {
             SemanticInferenceMetadataFieldsMapper.EXPLICIT_NULL_FIXES,
             SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED,
             TextSimilarityRankRetrieverBuilder.TEXT_SIMILARITY_RERANKER_ALIAS_HANDLING_FIX,
-            SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT
+            SemanticInferenceMetadataFieldsMapper.INFERENCE_METADATA_FIELDS_ENABLED_BY_DEFAULT,
+            SEMANTIC_TEXT_HIGHLIGHTER_DEFAULT
         );
     }
 }

+ 6 - 0
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

@@ -73,6 +73,7 @@ import org.elasticsearch.xcontent.XContentType;
 import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults;
 import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults;
 import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder;
+import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter;
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
@@ -582,6 +583,11 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
             return TextFieldMapper.CONTENT_TYPE;
         }
 
+        @Override
+        public String getDefaultHighlighter() {
+            return SemanticTextHighlighter.NAME;
+        }
+
         public String getInferenceId() {
             return inferenceId;
         }

+ 60 - 13
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

@@ -55,22 +55,32 @@ setup:
             index.mapping.semantic_text.use_legacy_format: false
           mappings:
             properties:
+              title:
+                type: text
               body:
                 type: semantic_text
                 inference_id: dense-inference-id
 
----
-"Highlighting using a sparse embedding model":
   - do:
       index:
         index: test-sparse-index
         id: doc_1
         body:
+          title: "Elasticsearch"
           body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
         refresh: true
 
-  - match: { result: created }
+  - do:
+      index:
+        index: test-dense-index
+        id: doc_1
+        body:
+          title: "Elasticsearch"
+          body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
+        refresh: true
 
+---
+"Highlighting using a sparse embedding model":
   - do:
       search:
         index: test-sparse-index
@@ -153,16 +163,6 @@ setup:
 
 ---
 "Highlighting using a dense embedding model":
-  - do:
-      index:
-        index: test-dense-index
-        id: doc_1
-        body:
-          body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
-        refresh: true
-
-  - match: { result: created }
-
   - do:
       search:
         index: test-dense-index
@@ -243,4 +243,51 @@ setup:
   - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
   - match:  { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
 
+---
+"Default highlighter for fields":
+  - requires:
+        cluster_features: "semantic_text.highlighter.default"
+        reason: semantic text field defaults to the semantic highlighter
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            match:
+              body: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                order: "score"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
+  - match:  { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+
+---
+"semantic highlighter ignores non-inference fields":
+  - requires:
+      cluster_features: "semantic_text.highlighter.default"
+      reason: semantic text field defaults to the semantic highlighter
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            match:
+              title: "Elasticsearch"
+          highlight:
+            fields:
+              title:
+                type: semantic
+                number_of_fragments: 2
+
+  - match:      { hits.total.value: 1 }
+  - match:      { hits.hits.0._id: "doc_1" }
+  - not_exists: hits.hits.0.highlight.title