Browse code

Add support for retrieving semantic_text's indexed chunks via fields API (#132410)

Introduces the "format": "chunks" option for the fields parameter in _search requests.
Allows users to retrieve the original text chunks generated by a semantic field’s chunking strategy.

Example usage:
```
POST test-index/_search
{
  "query": {
    "ids" : {
      "values" : ["1"]
    }
  },
  "fields": [
    {
      "field": "semantic_text_field",
      "format": "chunks"      <1>
    }
  ]
}
```
Jim Ferenczi 2 months ago
Parent
Current commit
347198787e

+ 5 - 0
docs/changelog/132410.yaml

@@ -0,0 +1,5 @@
+pr: 132410
+summary: Add support for retrieving semantic_text's indexed chunks via fields API
+area: Vector Search
+type: feature
+issues: []

+ 53 - 21
docs/reference/elasticsearch/mapping-reference/semantic-text.md

@@ -282,6 +282,34 @@ PUT test-index/_doc/1
     * Others (such as `elastic` and `elasticsearch`) will automatically truncate
       the input.
 
+## Retrieving indexed chunks
+```{applies_to}
+stack: ga 9.2
+serverless: ga
+```
+
+You can retrieve the individual chunks generated by your semantic field’s chunking
+strategy using the [fields parameter](/reference/elasticsearch/rest-apis/retrieve-selected-fields.md#search-fields-param):
+
+```console
+POST test-index/_search
+{
+  "query": {
+    "ids" : {
+      "values" : ["1"]
+    }
+  },
+  "fields": [
+    {
+      "field": "semantic_text_field",
+      "format": "chunks"      <1>
+    }
+  ]
+}
+```
+
+1. Use `"format": "chunks"` to return the field’s text as the original text chunks that were indexed.
+
 ## Extracting relevant fragments from semantic text [semantic-text-highlighting]
 
 You can extract the most relevant fragments from a semantic text field by using
@@ -311,27 +339,6 @@ POST test-index/_search
 2. Sorts the most relevant highlighted fragments by score when set to `score`. By default,
    fragments will be output in the order they appear in the field (order: none).
 
-To use the `semantic` highlighter to view chunks in the order which they were indexed with no scoring,
-use the `match_all` query to retrieve them in the order they appear in the document:
-
-```console
-POST test-index/_search
-{
-    "query": {
-        "match_all": {}
-    },
-    "highlight": {
-        "fields": {
-            "my_semantic_field": {
-                "number_of_fragments": 5  <1>
-            }
-        }
-    }
-}
-```
-
-1. This will return the first 5 chunks, set this number higher to retrieve more chunks.
-
 Highlighting is supported on fields other than semantic_text. However, if you
 want to restrict highlighting to the semantic highlighter and return no
 fragments when the field is not of type semantic_text, you can explicitly
@@ -359,6 +366,31 @@ PUT test-index
 
 1. Ensures that highlighting is applied exclusively to semantic_text fields.
 
+To retrieve all fragments from the `semantic` highlighter in their original indexing order
+without scoring, use a `match_all` query as the `highlight_query`.
+This ensures fragments are returned in the order they appear in the document:
+
+```console
+POST test-index/_search
+{
+  "query": {
+    "ids": {
+      "values": ["1"]
+    }
+  },
+  "highlight": {
+    "fields": {
+      "my_semantic_field": {
+        "number_of_fragments": 5,        <1>
+        "highlight_query": { "match_all": {} }
+      }
+    }
+  }
+}
+```
+
+1. Returns the first 5 fragments. Increase this value to retrieve additional fragments.
+
 ## Updates and partial updates for `semantic_text` fields [semantic-text-updates]
 
 When updating documents that contain `semantic_text` fields, it’s important to understand how inference is triggered:

+ 3 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

@@ -47,6 +47,7 @@ public class InferenceFeatures implements FeatureSpecification {
     private static final NodeFeature SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER = new NodeFeature("semantic_text.match_all_highlighter");
     private static final NodeFeature COHERE_V2_API = new NodeFeature("inference.cohere.v2");
     public static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTING_FLAT = new NodeFeature("semantic_text.highlighter.flat_index_options");
+    private static final NodeFeature SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT = new NodeFeature("semantic_text.fields_chunks_format");
 
     @Override
     public Set<NodeFeature> getTestFeatures() {
@@ -80,7 +81,8 @@ public class InferenceFeatures implements FeatureSpecification {
                 SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS,
                 SEMANTIC_QUERY_REWRITE_INTERCEPTORS_PROPAGATE_BOOST_AND_QUERY_NAME_FIX,
                 SEMANTIC_TEXT_HIGHLIGHTING_FLAT,
-                SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS
+                SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS,
+                SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT
             )
         );
         if (RERANK_SNIPPETS.isEnabled()) {

+ 1 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticInferenceMetadataFieldsMapper.java

@@ -66,7 +66,7 @@ public class SemanticInferenceMetadataFieldsMapper extends InferenceMetadataFiel
             for (var inferenceField : mappingLookup.inferenceFields().keySet()) {
                 MappedFieldType ft = mappingLookup.getFieldType(inferenceField);
                 if (ft instanceof SemanticTextFieldMapper.SemanticTextFieldType semanticTextFieldType) {
-                    fieldFetchers.put(inferenceField, semanticTextFieldType.valueFetcherWithInferenceResults(bitSetCache, searcher));
+                    fieldFetchers.put(inferenceField, semanticTextFieldType.valueFetcherWithInferenceResults(bitSetCache, searcher, false));
                 } else {
                     throw new IllegalArgumentException(
                         "Invalid inference field [" + ft.name() + "]. Expected field type [semantic_text] but got [" + ft.typeName() + "]"

+ 84 - 22
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

@@ -89,6 +89,7 @@ import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -104,6 +105,7 @@ import static org.elasticsearch.index.IndexVersions.SEMANTIC_TEXT_DEFAULTS_TO_BB
 import static org.elasticsearch.index.IndexVersions.SEMANTIC_TEXT_DEFAULTS_TO_BBQ_BACKPORT_8_X;
 import static org.elasticsearch.inference.TaskType.SPARSE_EMBEDDING;
 import static org.elasticsearch.inference.TaskType.TEXT_EMBEDDING;
+import static org.elasticsearch.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
 import static org.elasticsearch.search.SearchService.DEFAULT_SIZE;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_OFFSET_FIELD;
@@ -864,6 +866,14 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
 
         @Override
         public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
+            if (format != null && "chunks".equals(format) == false) {
+                throw new IllegalArgumentException(
+                    "Unknown format [" + format + "] for field [" + name() + "], only [chunks] is supported."
+                );
+            }
+            if (format != null) {
+                return valueFetcherWithInferenceResults(getChunksField().bitsetProducer(), context.searcher(), true);
+            }
             if (useLegacyFormat) {
                 // Redirect the fetcher to load the original values of the field
                 return SourceValueFetcher.toString(getOriginalTextFieldName(name()), context, format);
@@ -871,7 +881,11 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
             return SourceValueFetcher.toString(name(), context, null);
         }
 
-        ValueFetcher valueFetcherWithInferenceResults(Function<Query, BitSetProducer> bitSetCache, IndexSearcher searcher) {
+        ValueFetcher valueFetcherWithInferenceResults(
+            Function<Query, BitSetProducer> bitSetCache,
+            IndexSearcher searcher,
+            boolean onlyTextChunks
+        ) {
             var embeddingsField = getEmbeddingsField();
             if (embeddingsField == null) {
                 return ValueFetcher.EMPTY;
@@ -884,7 +898,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                     org.apache.lucene.search.ScoreMode.COMPLETE_NO_SCORES,
                     1
                 );
-                return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader);
+                return new SemanticTextFieldValueFetcher(bitSetFilter, childWeight, embeddingsLoader, onlyTextChunks);
             } catch (IOException exc) {
                 throw new UncheckedIOException(exc);
             }
@@ -1022,6 +1036,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
             private final BitSetProducer parentBitSetProducer;
             private final Weight childWeight;
             private final SourceLoader.SyntheticFieldLoader fieldLoader;
+            private final boolean onlyTextChunks;
 
             private BitSet bitSet;
             private Scorer childScorer;
@@ -1031,11 +1046,13 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
             private SemanticTextFieldValueFetcher(
                 BitSetProducer bitSetProducer,
                 Weight childWeight,
-                SourceLoader.SyntheticFieldLoader fieldLoader
+                SourceLoader.SyntheticFieldLoader fieldLoader,
+                boolean onlyTextChunks
             ) {
                 this.parentBitSetProducer = bitSetProducer;
                 this.childWeight = childWeight;
                 this.fieldLoader = fieldLoader;
+                this.onlyTextChunks = onlyTextChunks;
             }
 
             @Override
@@ -1046,7 +1063,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                     if (childScorer != null) {
                         childScorer.iterator().nextDoc();
                     }
-                    dvLoader = fieldLoader.docValuesLoader(context.reader(), null);
+                    if (onlyTextChunks == false) {
+                        dvLoader = fieldLoader.docValuesLoader(context.reader(), null);
+                    }
                     var terms = context.reader().terms(getOffsetsFieldName(name()));
                     offsetsLoader = terms != null ? OffsetSourceField.loader(terms) : null;
                 } catch (IOException exc) {
@@ -1064,21 +1083,33 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                 if (it.docID() < previousParent) {
                     it.advance(previousParent);
                 }
+
+                return onlyTextChunks ? fetchTextChunks(source, doc, it) : fetchFullField(source, doc, it);
+            }
+
+            private List<Object> fetchTextChunks(Source source, int doc, DocIdSetIterator it) throws IOException {
+                Map<String, String> originalValueMap = new HashMap<>();
+                List<Object> chunks = new ArrayList<>();
+
+                iterateChildDocs(doc, it, offset -> {
+                    var rawValue = originalValueMap.computeIfAbsent(offset.field(), k -> {
+                        var valueObj = XContentMapValues.extractValue(offset.field(), source.source(), null);
+                        var values = SemanticTextUtils.nodeStringValues(offset.field(), valueObj).stream().toList();
+                        return Strings.collectionToDelimitedString(values, String.valueOf(MULTIVAL_SEP_CHAR));
+                    });
+
+                    chunks.add(rawValue.substring(offset.start(), offset.end()));
+                });
+
+                return chunks;
+            }
+
+            private List<Object> fetchFullField(Source source, int doc, DocIdSetIterator it) throws IOException {
                 Map<String, List<SemanticTextField.Chunk>> chunkMap = new LinkedHashMap<>();
-                while (it.docID() < doc) {
-                    if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) {
-                        throw new IllegalStateException(
-                            "Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]"
-                        );
-                    }
-                    var offset = offsetsLoader.advanceTo(it.docID());
-                    if (offset == null) {
-                        throw new IllegalStateException(
-                            "Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]"
-                        );
-                    }
-                    var chunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>());
-                    chunks.add(
+
+                iterateChildDocs(doc, it, offset -> {
+                    var fullChunks = chunkMap.computeIfAbsent(offset.field(), k -> new ArrayList<>());
+                    fullChunks.add(
                         new SemanticTextField.Chunk(
                             null,
                             offset.start(),
@@ -1086,13 +1117,12 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                             rawEmbeddings(fieldLoader::write, source.sourceContentType())
                         )
                     );
-                    if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
-                        break;
-                    }
-                }
+                });
+
                 if (chunkMap.isEmpty()) {
                     return List.of();
                 }
+
                 return List.of(
                     new SemanticTextField(
                         useLegacyFormat,
@@ -1104,6 +1134,38 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                 );
             }
 
+            /**
+             * Iterates over all child documents for the given doc and applies the provided action for each valid offset.
+             */
+            private void iterateChildDocs(
+                int doc,
+                DocIdSetIterator it,
+                CheckedConsumer<OffsetSourceFieldMapper.OffsetSource, IOException> action
+            ) throws IOException {
+                while (it.docID() < doc) {
+                    if (onlyTextChunks == false) {
+                        if (dvLoader == null || dvLoader.advanceToDoc(it.docID()) == false) {
+                            throw new IllegalStateException(
+                                "Cannot fetch values for field [" + name() + "], missing embeddings for doc [" + doc + "]"
+                            );
+                        }
+                    }
+
+                    var offset = offsetsLoader.advanceTo(it.docID());
+                    if (offset == null) {
+                        throw new IllegalStateException(
+                            "Cannot fetch values for field [" + name() + "], missing offsets for doc [" + doc + "]"
+                        );
+                    }
+
+                    action.accept(offset);
+
+                    if (it.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
+                        break;
+                    }
+                }
+            }
+
             private BytesReference rawEmbeddings(CheckedConsumer<XContentBuilder, IOException> writer, XContentType xContentType)
                 throws IOException {
                 try (var result = XContentFactory.contentBuilder(xContentType)) {

+ 73 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml

@@ -671,3 +671,76 @@ setup:
   - length: { hits.hits.0.highlight.bbq_hnsw_field: 1 }
   - match: { hits.hits.0.highlight.bbq_hnsw_field.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
 
+---
+"Retrieve chunks with the fields api":
+  - requires:
+      cluster_features: "semantic_text.fields_chunks_format"
+      reason: semantic text field supports retrieving chunks through fields API in 9.2.0.
+
+  - do:
+      indices.create:
+        index: test-index-sparse
+        body:
+          settings:
+            index.mapping.semantic_text.use_legacy_format: false
+          mappings:
+            properties:
+              semantic_text_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+              text_field:
+                type: text
+                copy_to: ["semantic_text_field"]
+
+  - do:
+      index:
+        index: test-index-sparse
+        id: doc_1
+        body:
+          semantic_text_field: [ "some test data", "    ", "now with chunks" ]
+          text_field: "text field data"
+        refresh: true
+
+  - do:
+      search:
+        index: test-index-sparse
+        body:
+          query:
+            match_all: { }
+          fields: [{"field": "semantic_text_field", "format": "chunks"}]
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.fields.semantic_text_field: 3 }
+  - match: { hits.hits.0.fields.semantic_text_field.0: "some test data" }
+  - match: { hits.hits.0.fields.semantic_text_field.1: "now with chunks" }
+  - match: { hits.hits.0.fields.semantic_text_field.2: "text field data" }
+
+---
+"Highlighting with match_all in a highlight_query":
+  - requires:
+      cluster_features: "semantic_text.match_all_highlighter"
+      reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            ids: {
+              values: ["doc_1"]
+            }
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+                highlight_query: {
+                  match_all: {}
+                }
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }