Browse Source

Add Sparse Vector Index Options Settings to Semantic Text Field (#131058)

* add sparse vector index options to semantic text

* [CI] Auto commit changes from spotless

* current tests - not 100% working yet

* sparse_vector index options/createEmbeddingsField

* set default index options if we don't have any

* [CI] Auto commit changes from spotless

* remove redundant code; set defaults

* fix tests

* add validation test

* [CI] Auto commit changes from spotless

* add additional tests

* [CI] Auto commit changes from spotless

* fix tests

* [CI] Auto commit changes from spotless

* ... and fix tests...

* [CI] Auto commit changes from spotless

* fill in test specific sparse vector index options

* remove unused node feature

* [CI] Auto commit changes from spotless

* Update docs/changelog/131058.yaml

* update changelog

* some cleanups; still needs a few more tests

* [CI] Auto commit changes from spotless

* fix additional tests

* [CI] Auto commit changes from spotless

* and fix more tests

* ... annnnd... fix more tests

* [CI] Auto commit changes from spotless

* clean tests; add YAML Rest tests

* [CI] Auto commit changes from spotless

* fix failing tests

* [CI] Auto commit changes from spotless

* fix tests due to multiple random index versioning

* [CI] Auto commit changes from spotless

* fix tests; fix yaml tests;

* [CI] Auto commit changes from spotless

* fix more tests due to random index versioning

* [CI] Auto commit changes from spotless

* ... and more test cleaning

* [CI] Auto commit changes from spotless

* add link to sparse_vector index_options for docs

* fix docs

* fix tests; remove old code

* correct tests; simplify mocking/spy ModelRegistry

* add test for dense vector w/ sparse index options

* [CI] Auto commit changes from spotless

* collapse multiple "@before" methods

---------

Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
Mark J. Hoy 2 months ago
parent
commit
b50cefe139

+ 5 - 0
docs/changelog/131058.yaml

@@ -0,0 +1,5 @@
+pr: 131058
+summary: Adds sparse vector index options settings to semantic_text field
+area: Search
+type: enhancement
+issues: []

+ 6 - 4
docs/reference/elasticsearch/mapping-reference/semantic-text.md

@@ -156,9 +156,11 @@ to create the endpoint. If not specified, the {{infer}} endpoint defined by
 
 `index_options` {applies_to}`stack: ga 9.1`
 :   (Optional, object) Specifies the index options to override default values
-for the field. Currently, `dense_vector` index options are supported.
-For text embeddings, `index_options` may match any allowed
-[dense_vector index options](/reference/elasticsearch/mapping-reference/dense-vector.md#dense-vector-index-options).
+for the field. Currently, `dense_vector` and `sparse_vector` index options are supported.
+Depending on the embedding type, `index_options` may match any of the following allowed index options:
+
+* [dense_vector index options](/reference/elasticsearch/mapping-reference/dense-vector.md#dense-vector-index-options).
+* [sparse_vector index options](/reference/elasticsearch/mapping-reference/sparse-vector.md#sparse-vectors-params). {applies_to}`stack: ga 9.2`
 
 `chunking_settings` {applies_to}`stack: ga 9.1`
 :   (Optional, object) Settings for chunking text into smaller passages.
@@ -410,7 +412,7 @@ stack: ga 9.0
 In case you want to customize data indexing, use the
 [`sparse_vector`](/reference/elasticsearch/mapping-reference/sparse-vector.md)
 or [`dense_vector`](/reference/elasticsearch/mapping-reference/dense-vector.md)
-field types and create an ingest pipeline with an 
+field types and create an ingest pipeline with an
 [{{infer}} processor](/reference/enrich-processor/inference-processor.md) to
 generate the embeddings.
 [This tutorial](docs-content://solutions/search/semantic-search/semantic-search-inference.md)

+ 52 - 26
server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java

@@ -48,7 +48,6 @@ import org.elasticsearch.xcontent.ConstructingObjectParser;
 import org.elasticsearch.xcontent.DeprecationHandler;
 import org.elasticsearch.xcontent.NamedXContentRegistry;
 import org.elasticsearch.xcontent.ParseField;
-import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentParser;
 import org.elasticsearch.xcontent.XContentParser.Token;
@@ -98,7 +97,7 @@ public class SparseVectorFieldMapper extends FieldMapper {
 
         private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
         private final Parameter<Map<String, String>> meta = Parameter.metaParam();
-        private final Parameter<IndexOptions> indexOptions = new Parameter<>(
+        private final Parameter<SparseVectorIndexOptions> indexOptions = new Parameter<>(
             SPARSE_VECTOR_INDEX_OPTIONS,
             true,
             () -> null,
@@ -128,9 +127,9 @@ public class SparseVectorFieldMapper extends FieldMapper {
 
         @Override
         public SparseVectorFieldMapper build(MapperBuilderContext context) {
-            IndexOptions builderIndexOptions = indexOptions.getValue();
+            SparseVectorIndexOptions builderIndexOptions = indexOptions.getValue();
             if (builderIndexOptions == null) {
-                builderIndexOptions = getDefaultIndexOptions(indexVersionCreated);
+                builderIndexOptions = SparseVectorIndexOptions.getDefaultIndexOptions(indexVersionCreated);
             }
 
             final boolean syntheticVectorFinal = context.isSourceSynthetic() == false && isSyntheticVector;
@@ -149,33 +148,34 @@ public class SparseVectorFieldMapper extends FieldMapper {
             );
         }
 
-        private IndexOptions getDefaultIndexOptions(IndexVersion indexVersion) {
-            return (indexVersion.onOrAfter(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)
-                || indexVersion.between(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION_8_X, IndexVersions.UPGRADE_TO_LUCENE_10_0_0))
-                    ? IndexOptions.DEFAULT_PRUNING_INDEX_OPTIONS
-                    : null;
+        private boolean indexOptionsSerializerCheck(boolean includeDefaults, boolean isConfigured, SparseVectorIndexOptions value) {
+            return includeDefaults || (SparseVectorIndexOptions.isDefaultOptions(value, indexVersionCreated) == false);
         }
 
-        private boolean indexOptionsSerializerCheck(boolean includeDefaults, boolean isConfigured, IndexOptions value) {
-            return includeDefaults || (IndexOptions.isDefaultOptions(value, indexVersionCreated) == false);
+        public void setIndexOptions(SparseVectorIndexOptions sparseVectorIndexOptions) {
+            indexOptions.setValue(sparseVectorIndexOptions);
         }
     }
 
-    public IndexOptions getIndexOptions() {
+    public SparseVectorIndexOptions getIndexOptions() {
         return fieldType().getIndexOptions();
     }
 
-    private static final ConstructingObjectParser<IndexOptions, Void> INDEX_OPTIONS_PARSER = new ConstructingObjectParser<>(
+    private static final ConstructingObjectParser<SparseVectorIndexOptions, Void> INDEX_OPTIONS_PARSER = new ConstructingObjectParser<>(
         SPARSE_VECTOR_INDEX_OPTIONS,
-        args -> new IndexOptions((Boolean) args[0], (TokenPruningConfig) args[1])
+        args -> new SparseVectorIndexOptions((Boolean) args[0], (TokenPruningConfig) args[1])
     );
 
     static {
-        INDEX_OPTIONS_PARSER.declareBoolean(optionalConstructorArg(), IndexOptions.PRUNE_FIELD_NAME);
-        INDEX_OPTIONS_PARSER.declareObject(optionalConstructorArg(), TokenPruningConfig.PARSER, IndexOptions.PRUNING_CONFIG_FIELD_NAME);
+        INDEX_OPTIONS_PARSER.declareBoolean(optionalConstructorArg(), SparseVectorIndexOptions.PRUNE_FIELD_NAME);
+        INDEX_OPTIONS_PARSER.declareObject(
+            optionalConstructorArg(),
+            TokenPruningConfig.PARSER,
+            SparseVectorIndexOptions.PRUNING_CONFIG_FIELD_NAME
+        );
     }
 
-    private static SparseVectorFieldMapper.IndexOptions parseIndexOptions(MappingParserContext context, Object propNode) {
+    private static SparseVectorIndexOptions parseIndexOptions(MappingParserContext context, Object propNode) {
         if (propNode == null) {
             return null;
         }
@@ -212,7 +212,7 @@ public class SparseVectorFieldMapper extends FieldMapper {
 
     public static final class SparseVectorFieldType extends MappedFieldType {
         private final IndexVersion indexVersionCreated;
-        private final IndexOptions indexOptions;
+        private final SparseVectorIndexOptions indexOptions;
 
         public SparseVectorFieldType(IndexVersion indexVersionCreated, String name, boolean isStored, Map<String, String> meta) {
             this(indexVersionCreated, name, isStored, meta, null);
@@ -223,14 +223,14 @@ public class SparseVectorFieldMapper extends FieldMapper {
             String name,
             boolean isStored,
             Map<String, String> meta,
-            @Nullable SparseVectorFieldMapper.IndexOptions indexOptions
+            @Nullable SparseVectorIndexOptions indexOptions
         ) {
             super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
             this.indexVersionCreated = indexVersionCreated;
             this.indexOptions = indexOptions;
         }
 
-        public IndexOptions getIndexOptions() {
+        public SparseVectorIndexOptions getIndexOptions() {
             return indexOptions;
         }
 
@@ -560,15 +560,18 @@ public class SparseVectorFieldMapper extends FieldMapper {
         }
     }
 
-    public static class IndexOptions implements ToXContent {
+    public static class SparseVectorIndexOptions implements IndexOptions {
         public static final ParseField PRUNE_FIELD_NAME = new ParseField("prune");
         public static final ParseField PRUNING_CONFIG_FIELD_NAME = new ParseField("pruning_config");
-        public static final IndexOptions DEFAULT_PRUNING_INDEX_OPTIONS = new IndexOptions(true, new TokenPruningConfig());
+        public static final SparseVectorIndexOptions DEFAULT_PRUNING_INDEX_OPTIONS = new SparseVectorIndexOptions(
+            true,
+            new TokenPruningConfig()
+        );
 
         final Boolean prune;
         final TokenPruningConfig pruningConfig;
 
-        IndexOptions(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) {
+        public SparseVectorIndexOptions(@Nullable Boolean prune, @Nullable TokenPruningConfig pruningConfig) {
             if (pruningConfig != null && (prune == null || prune == false)) {
                 throw new IllegalArgumentException(
                     "["
@@ -585,14 +588,37 @@ public class SparseVectorFieldMapper extends FieldMapper {
             this.pruningConfig = pruningConfig;
         }
 
-        public static boolean isDefaultOptions(IndexOptions indexOptions, IndexVersion indexVersion) {
-            IndexOptions defaultIndexOptions = indexVersionSupportsDefaultPruningConfig(indexVersion)
+        public static boolean isDefaultOptions(SparseVectorIndexOptions indexOptions, IndexVersion indexVersion) {
+            SparseVectorIndexOptions defaultIndexOptions = indexVersionSupportsDefaultPruningConfig(indexVersion)
                 ? DEFAULT_PRUNING_INDEX_OPTIONS
                 : null;
 
             return Objects.equals(indexOptions, defaultIndexOptions);
         }
 
+        /**
+         * Resolves the index options used when none are configured explicitly.
+         *
+         * @param indexVersion the version the index was created with
+         * @return {@link #DEFAULT_PRUNING_INDEX_OPTIONS} when the index version supports the default
+         *         pruning configuration, otherwise {@code null}
+         */
+        public static SparseVectorIndexOptions getDefaultIndexOptions(IndexVersion indexVersion) {
+            if (indexVersionSupportsDefaultPruningConfig(indexVersion)) {
+                return DEFAULT_PRUNING_INDEX_OPTIONS;
+            }
+            return null;
+        }
+
+        public static SparseVectorIndexOptions parseFromMap(Map<String, Object> map) {
+            if (map == null) {
+                return null;
+            }
+
+            try {
+                XContentParser parser = new MapXContentParser(
+                    NamedXContentRegistry.EMPTY,
+                    DeprecationHandler.IGNORE_DEPRECATIONS,
+                    map,
+                    XContentType.JSON
+                );
+
+                return INDEX_OPTIONS_PARSER.parse(parser, null);
+            } catch (IOException ioEx) {
+                throw new UncheckedIOException(ioEx);
+            }
+        }
+
         public Boolean getPrune() {
             return prune;
         }
@@ -626,7 +652,7 @@ public class SparseVectorFieldMapper extends FieldMapper {
                 return false;
             }
 
-            IndexOptions otherAsIndexOptions = (IndexOptions) other;
+            SparseVectorIndexOptions otherAsIndexOptions = (SparseVectorIndexOptions) other;
             return Objects.equals(prune, otherAsIndexOptions.prune) && Objects.equals(pruningConfig, otherAsIndexOptions.pruningConfig);
         }
 

+ 4 - 0
server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java

@@ -906,4 +906,8 @@ public class SparseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase
         }
         return result;
     }
+
+    public static IndexVersion getIndexOptionsCompatibleIndexVersion() {
+        return IndexVersionUtils.randomVersionBetween(random(), SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, IndexVersion.current());
+    }
 }

+ 24 - 0
server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldTypeTests.java

@@ -40,4 +40,28 @@ public class SparseVectorFieldTypeTests extends FieldTypeTestCase {
         MappedFieldType fieldType = new SparseVectorFieldMapper.SparseVectorFieldType(indexVersion, "field", false, Collections.emptyMap());
         assertFalse(fieldType.isAggregatable());
     }
+
+    /**
+     * Convenience overload that delegates to {@link #randomSparseVectorIndexOptions(boolean)}
+     * with {@code includeNull} set to {@code true}.
+     */
+    public static SparseVectorFieldMapper.SparseVectorIndexOptions randomSparseVectorIndexOptions() {
+        final boolean includeNull = true;
+        return randomSparseVectorIndexOptions(includeNull);
+    }
+
+    /**
+     * Builds random sparse vector index options for tests.
+     * <p>
+     * The result covers three shapes: no pruning setting at all, pruning explicitly disabled,
+     * and pruning enabled with a random {@link TokenPruningConfig}.
+     *
+     * @param includeNull when {@code true}, the method may also return {@code null}
+     * @return random index options, or {@code null} only if {@code includeNull} is {@code true}
+     */
+    public static SparseVectorFieldMapper.SparseVectorIndexOptions randomSparseVectorIndexOptions(boolean includeNull) {
+        if (includeNull && randomBoolean()) {
+            return null;
+        }
+
+        Boolean prune = randomBoolean() ? null : randomBoolean();
+        if (prune == null) {
+            // neither prune nor a pruning config is specified
+            return new SparseVectorFieldMapper.SparseVectorIndexOptions(null, null);
+        }
+
+        if (prune == false) {
+            // a pruning config is only accepted by the constructor when prune is true
+            return new SparseVectorFieldMapper.SparseVectorIndexOptions(false, null);
+        }
+
+        return new SparseVectorFieldMapper.SparseVectorIndexOptions(
+            true,
+            new TokenPruningConfig(randomFloatBetween(1.0f, 100.0f, true), randomFloatBetween(0.0f, 1.0f, true), randomBoolean())
+        );
+    }
 }

+ 3 - 1
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java

@@ -19,6 +19,7 @@ import java.util.Set;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_EXCLUDE_SUB_FIELDS_FROM_FIELD_CAPS;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS;
+import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper.SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG;
 import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_FILTER_FIX;
 import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED;
@@ -78,7 +79,8 @@ public class InferenceFeatures implements FeatureSpecification {
                 COHERE_V2_API,
                 SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS,
                 SEMANTIC_QUERY_REWRITE_INTERCEPTORS_PROPAGATE_BOOST_AND_QUERY_NAME_FIX,
-                SEMANTIC_TEXT_HIGHLIGHTING_FLAT
+                SEMANTIC_TEXT_HIGHLIGHTING_FLAT,
+                SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS
             )
         );
         if (RERANK_SNIPPETS.isEnabled()) {

+ 95 - 48
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java

@@ -143,6 +143,9 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
     public static final NodeFeature SEMANTIC_TEXT_INDEX_OPTIONS_WITH_DEFAULTS = new NodeFeature(
         "semantic_text.index_options_with_defaults"
     );
+    public static final NodeFeature SEMANTIC_TEXT_SPARSE_VECTOR_INDEX_OPTIONS = new NodeFeature(
+        "semantic_text.sparse_vector_index_options"
+    );
 
     public static final String CONTENT_TYPE = "semantic_text";
     public static final String DEFAULT_ELSER_2_INFERENCE_ID = DEFAULT_ELSER_ID;
@@ -458,8 +461,20 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                 );
             }
 
-            if (indexOptions.type() == SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR) {
+            if (indexOptions.type() == SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR) {
+                if (modelSettings.taskType() != SPARSE_EMBEDDING) {
+                    throw new IllegalArgumentException(
+                        "Invalid task type for index options, required ["
+                            + SPARSE_EMBEDDING
+                            + "] but was ["
+                            + modelSettings.taskType()
+                            + "]"
+                    );
+                }
+                return;
+            }
 
+            if (indexOptions.type() == SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR) {
                 if (modelSettings.taskType() != TEXT_EMBEDDING) {
                     throw new IllegalArgumentException(
                         "Invalid task type for index options, required [" + TEXT_EMBEDDING + "] but was [" + modelSettings.taskType() + "]"
@@ -471,7 +486,6 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                     (DenseVectorFieldMapper.DenseVectorIndexOptions) indexOptions.indexOptions();
                 denseVectorIndexOptions.validate(modelSettings.elementType(), dims, true);
             }
-
         }
 
         /**
@@ -1169,9 +1183,17 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
         boolean useLegacyFormat
     ) {
         return switch (modelSettings.taskType()) {
-            case SPARSE_EMBEDDING -> new SparseVectorFieldMapper.Builder(CHUNKED_EMBEDDINGS_FIELD, indexVersionCreated, false).setStored(
-                useLegacyFormat == false
-            );
+            case SPARSE_EMBEDDING -> {
+                SparseVectorFieldMapper.Builder sparseVectorMapperBuilder = new SparseVectorFieldMapper.Builder(
+                    CHUNKED_EMBEDDINGS_FIELD,
+                    indexVersionCreated,
+                    false
+                ).setStored(useLegacyFormat == false);
+
+                configureSparseVectorMapperBuilder(indexVersionCreated, sparseVectorMapperBuilder, indexOptions);
+
+                yield sparseVectorMapperBuilder;
+            }
             case TEXT_EMBEDDING -> {
                 DenseVectorFieldMapper.Builder denseVectorMapperBuilder = new DenseVectorFieldMapper.Builder(
                     CHUNKED_EMBEDDINGS_FIELD,
@@ -1179,45 +1201,7 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
                     false
                 );
 
-                SimilarityMeasure similarity = modelSettings.similarity();
-                if (similarity != null) {
-                    switch (similarity) {
-                        case COSINE -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.COSINE);
-                        case DOT_PRODUCT -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.DOT_PRODUCT);
-                        case L2_NORM -> denseVectorMapperBuilder.similarity(DenseVectorFieldMapper.VectorSimilarity.L2_NORM);
-                        default -> throw new IllegalArgumentException(
-                            "Unknown similarity measure in model_settings [" + similarity.name() + "]"
-                        );
-                    }
-                }
-                denseVectorMapperBuilder.dimensions(modelSettings.dimensions());
-                denseVectorMapperBuilder.elementType(modelSettings.elementType());
-                // Here is where we persist index_options. If they are specified by the user, we will use those index_options,
-                // otherwise we will determine if we can set default index options. If we can't, we won't persist any index_options
-                // and the field will use the defaults for the dense_vector field.
-                if (indexOptions != null) {
-                    DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions =
-                        (DenseVectorFieldMapper.DenseVectorIndexOptions) indexOptions.indexOptions();
-                    denseVectorMapperBuilder.indexOptions(denseVectorIndexOptions);
-                    denseVectorIndexOptions.validate(modelSettings.elementType(), modelSettings.dimensions(), true);
-                } else {
-                    DenseVectorFieldMapper.DenseVectorIndexOptions defaultIndexOptions = defaultDenseVectorIndexOptions(
-                        indexVersionCreated,
-                        modelSettings
-                    );
-                    if (defaultIndexOptions != null) {
-                        denseVectorMapperBuilder.indexOptions(defaultIndexOptions);
-                    }
-                }
-
-                boolean hasUserSpecifiedIndexOptions = indexOptions != null;
-                DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions = hasUserSpecifiedIndexOptions
-                    ? (DenseVectorFieldMapper.DenseVectorIndexOptions) indexOptions.indexOptions()
-                    : defaultDenseVectorIndexOptions(indexVersionCreated, modelSettings);
-
-                if (denseVectorIndexOptions != null) {
-                    denseVectorMapperBuilder.indexOptions(denseVectorIndexOptions);
-                }
+                configureDenseVectorMapperBuilder(indexVersionCreated, denseVectorMapperBuilder, modelSettings, indexOptions);
 
                 yield denseVectorMapperBuilder;
             }
@@ -1225,6 +1209,62 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
         };
     }
 
+    /**
+     * Sets the index options on the sparse vector builder used for the embeddings field.
+     * <p>
+     * Explicit {@code indexOptions} are always passed to the builder. Without them, the defaults
+     * for the index version are applied, and nothing is set when no defaults are available.
+     *
+     * @param indexVersionCreated the version the index was created with
+     * @param sparseVectorMapperBuilder the builder to configure
+     * @param indexOptions the semantic text index options, may be {@code null}
+     */
+    private static void configureSparseVectorMapperBuilder(
+        IndexVersion indexVersionCreated,
+        SparseVectorFieldMapper.Builder sparseVectorMapperBuilder,
+        SemanticTextIndexOptions indexOptions
+    ) {
+        if (indexOptions == null) {
+            SparseVectorFieldMapper.SparseVectorIndexOptions fallbackOptions = SparseVectorFieldMapper.SparseVectorIndexOptions
+                .getDefaultIndexOptions(indexVersionCreated);
+            if (fallbackOptions != null) {
+                sparseVectorMapperBuilder.setIndexOptions(fallbackOptions);
+            }
+            return;
+        }
+
+        SparseVectorFieldMapper.SparseVectorIndexOptions explicitOptions = (SparseVectorFieldMapper.SparseVectorIndexOptions) indexOptions
+            .indexOptions();
+        sparseVectorMapperBuilder.setIndexOptions(explicitOptions);
+    }
+
+    /**
+     * Applies the model settings and index options to the dense vector builder used for the embeddings field.
+     *
+     * @param indexVersionCreated the version the index was created with
+     * @param denseVectorMapperBuilder the builder to configure
+     * @param modelSettings supplies the similarity, dimensions and element type
+     * @param indexOptions the semantic text index options, may be {@code null}
+     * @throws IllegalArgumentException if the similarity measure of the model settings is not recognized
+     */
+    private static void configureDenseVectorMapperBuilder(
+        IndexVersion indexVersionCreated,
+        DenseVectorFieldMapper.Builder denseVectorMapperBuilder,
+        MinimalServiceSettings modelSettings,
+        SemanticTextIndexOptions indexOptions
+    ) {
+        SimilarityMeasure similarity = modelSettings.similarity();
+        if (similarity != null) {
+            DenseVectorFieldMapper.VectorSimilarity mappedSimilarity = switch (similarity) {
+                case COSINE -> DenseVectorFieldMapper.VectorSimilarity.COSINE;
+                case DOT_PRODUCT -> DenseVectorFieldMapper.VectorSimilarity.DOT_PRODUCT;
+                case L2_NORM -> DenseVectorFieldMapper.VectorSimilarity.L2_NORM;
+                default -> throw new IllegalArgumentException(
+                    "Unknown similarity measure in model_settings [" + similarity.name() + "]"
+                );
+            };
+            denseVectorMapperBuilder.similarity(mappedSimilarity);
+        }
+
+        denseVectorMapperBuilder.dimensions(modelSettings.dimensions());
+        denseVectorMapperBuilder.elementType(modelSettings.elementType());
+
+        // This is where index_options get persisted. Without user-specified options we fall back to the
+        // defaults when they can be determined; otherwise nothing is persisted and the dense_vector field
+        // applies its own defaults.
+        if (indexOptions == null) {
+            DenseVectorFieldMapper.DenseVectorIndexOptions fallbackOptions = defaultDenseVectorIndexOptions(
+                indexVersionCreated,
+                modelSettings
+            );
+            if (fallbackOptions != null) {
+                denseVectorMapperBuilder.indexOptions(fallbackOptions);
+            }
+            return;
+        }
+
+        // User-specified options are set on the builder first and then validated against the model settings.
+        DenseVectorFieldMapper.DenseVectorIndexOptions explicitOptions = (DenseVectorFieldMapper.DenseVectorIndexOptions) indexOptions
+            .indexOptions();
+        denseVectorMapperBuilder.indexOptions(explicitOptions);
+        explicitOptions.validate(modelSettings.elementType(), modelSettings.dimensions(), true);
+    }
+
     static DenseVectorFieldMapper.DenseVectorIndexOptions defaultDenseVectorIndexOptions(
         IndexVersion indexVersionCreated,
         MinimalServiceSettings modelSettings
@@ -1259,23 +1299,30 @@ public class SemanticTextFieldMapper extends FieldMapper implements InferenceFie
     }
 
     static SemanticTextIndexOptions defaultIndexOptions(IndexVersion indexVersionCreated, MinimalServiceSettings modelSettings) {
-
         if (modelSettings == null) {
             return null;
         }
 
-        SemanticTextIndexOptions defaultIndexOptions = null;
         if (modelSettings.taskType() == TaskType.TEXT_EMBEDDING) {
             DenseVectorFieldMapper.DenseVectorIndexOptions denseVectorIndexOptions = defaultDenseVectorIndexOptions(
                 indexVersionCreated,
                 modelSettings
             );
-            defaultIndexOptions = denseVectorIndexOptions == null
+            return denseVectorIndexOptions == null
                 ? null
                 : new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR, denseVectorIndexOptions);
         }
 
-        return defaultIndexOptions;
+        if (modelSettings.taskType() == SPARSE_EMBEDDING) {
+            SparseVectorFieldMapper.SparseVectorIndexOptions sparseVectorIndexOptions = SparseVectorFieldMapper.SparseVectorIndexOptions
+                .getDefaultIndexOptions(indexVersionCreated);
+
+            return sparseVectorIndexOptions == null
+                ? null
+                : new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR, sparseVectorIndexOptions);
+        }
+
+        return null;
     }
 
     private static boolean canMergeModelSettings(MinimalServiceSettings previous, MinimalServiceSettings current, Conflicts conflicts) {

+ 15 - 0
x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextIndexOptions.java

@@ -13,6 +13,7 @@ import org.elasticsearch.common.xcontent.support.XContentMapValues;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.index.mapper.vectors.IndexOptions;
+import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper;
 import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.XContentBuilder;
 
@@ -76,6 +77,12 @@ public class SemanticTextIndexOptions implements ToXContent {
             public IndexOptions parseIndexOptions(String fieldName, Map<String, Object> map, IndexVersion indexVersion) {
                 return parseDenseVectorIndexOptionsFromMap(fieldName, map, indexVersion);
             }
+        },
+        SPARSE_VECTOR("sparse_vector") {
+            @Override
+            public IndexOptions parseIndexOptions(String fieldName, Map<String, Object> map, IndexVersion indexVersion) {
+                return parseSparseVectorIndexOptionsFromMap(map);
+            }
         };
 
         public final String value;
@@ -127,4 +134,12 @@ public class SemanticTextIndexOptions implements ToXContent {
             throw new ElasticsearchException(exc);
         }
     }
+
+    private static SparseVectorFieldMapper.SparseVectorIndexOptions parseSparseVectorIndexOptionsFromMap(Map<String, Object> map) {
+        try {
+            return SparseVectorFieldMapper.SparseVectorIndexOptions.parseFromMap(map);
+        } catch (Exception exc) {
+            throw new ElasticsearchException(exc);
+        }
+    }
 }

+ 310 - 61
x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java

@@ -56,6 +56,9 @@ import org.elasticsearch.index.mapper.SourceToParse;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldTypeTests;
 import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper;
+import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapperTests;
+import org.elasticsearch.index.mapper.vectors.SparseVectorFieldTypeTests;
+import org.elasticsearch.index.mapper.vectors.TokenPruningConfig;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.index.search.ESToParentBlockJoinQuery;
 import org.elasticsearch.inference.ChunkingSettings;
@@ -95,6 +98,7 @@ import java.util.function.BiConsumer;
 import java.util.function.Supplier;
 
 import static org.elasticsearch.index.mapper.vectors.DenseVectorFieldTypeTests.randomIndexOptionsAll;
+import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldTypeTests.randomSparseVectorIndexOptions;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKED_EMBEDDINGS_FIELD;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.CHUNKS_FIELD;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextField.INFERENCE_FIELD;
@@ -113,6 +117,9 @@ import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.ra
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
 
 public class SemanticTextFieldMapperTests extends MapperTestCase {
     private final boolean useLegacyFormat;
@@ -123,9 +130,20 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         this.useLegacyFormat = useLegacyFormat;
     }
 
+    ModelRegistry globalModelRegistry;
+
     @Before
     private void startThreadPool() {
         threadPool = createThreadPool();
+        var clusterService = ClusterServiceUtils.createClusterService(threadPool);
+        var modelRegistry = new ModelRegistry(clusterService, new NoOpClient(threadPool));
+        globalModelRegistry = spy(modelRegistry);
+        globalModelRegistry.clusterChanged(new ClusterChangedEvent("init", clusterService.state(), clusterService.state()) {
+            @Override
+            public boolean localNodeMaster() {
+                return false;
+            }
+        });
     }
 
     @After
@@ -140,18 +158,10 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
 
     @Override
     protected Collection<? extends Plugin> getPlugins() {
-        var clusterService = ClusterServiceUtils.createClusterService(threadPool);
-        var modelRegistry = new ModelRegistry(clusterService, new NoOpClient(threadPool));
-        modelRegistry.clusterChanged(new ClusterChangedEvent("init", clusterService.state(), clusterService.state()) {
-            @Override
-            public boolean localNodeMaster() {
-                return false;
-            }
-        });
         return List.of(new InferencePlugin(Settings.EMPTY) {
             @Override
             protected Supplier<ModelRegistry> getModelRegistry() {
-                return () -> modelRegistry;
+                // Hand the plugin the shared spy built in startThreadPool() so tests can stub registry lookups.
+                return () -> globalModelRegistry;
             }
         }, new XPackClientPlugin());
     }
@@ -174,6 +184,11 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
     ) throws IOException {
         validateIndexVersion(minIndexVersion, useLegacyFormat);
         IndexVersion indexVersion = IndexVersionUtils.randomVersionBetween(random(), minIndexVersion, maxIndexVersion);
+        return createMapperServiceWithIndexVersion(mappings, useLegacyFormat, indexVersion);
+    }
+
+    private MapperService createMapperServiceWithIndexVersion(XContentBuilder mappings, boolean useLegacyFormat, IndexVersion indexVersion)
+        throws IOException {
         var settings = Settings.builder()
             .put(IndexMetadata.SETTING_INDEX_VERSION_CREATED.getKey(), indexVersion)
             .put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), useLegacyFormat)
@@ -189,17 +204,6 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         }
     }
 
-    private MapperService createMapperService(String mappings, boolean useLegacyFormat) throws IOException {
-        var settings = Settings.builder()
-            .put(
-                IndexMetadata.SETTING_INDEX_VERSION_CREATED.getKey(),
-                SemanticInferenceMetadataFieldsMapperTests.getRandomCompatibleIndexVersion(useLegacyFormat)
-            )
-            .put(InferenceMetadataFieldsMapper.USE_LEGACY_SEMANTIC_TEXT_FORMAT.getKey(), useLegacyFormat)
-            .build();
-        return createMapperService(settings, mappings);
-    }
-
     @Override
     protected Settings getIndexSettings() {
         return Settings.builder()
@@ -380,6 +384,14 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         }
     }
 
+    /**
+     * Returns the sparse vector index options a field is expected to carry when none were configured explicitly.
+     * The defaults are looked up for the index version the given mapper service was created with.
+     *
+     * @param mapperService mapper service whose index-created version selects the defaults
+     * @return the defaults wrapped as SPARSE_VECTOR semantic text index options, or {@code null} when the
+     *         lookup yields no defaults for that version
+     */
+    private SemanticTextIndexOptions getDefaultSparseVectorIndexOptionsForMapper(MapperService mapperService) {
+        var createdVersion = mapperService.getIndexSettings().getIndexVersionCreated();
+        var versionDefaults = SparseVectorFieldMapper.SparseVectorIndexOptions.getDefaultIndexOptions(createdVersion);
+        if (versionDefaults == null) {
+            return null;
+        }
+        return new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR, versionDefaults);
+    }
+
     public void testInvalidTaskTypes() {
         for (var taskType : TaskType.values()) {
             if (taskType == TaskType.TEXT_EMBEDDING || taskType == TaskType.SPARSE_EMBEDDING) {
@@ -415,7 +427,13 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
             }), useLegacyFormat));
             assertThat(e.getMessage(), containsString("Field [semantic] of type [semantic_text] can't be used in multifields"));
         } else {
-            var mapperService = createMapperService(fieldMapping(b -> {
+            IndexVersion indexVersion = SparseVectorFieldMapperTests.getIndexOptionsCompatibleIndexVersion();
+            SparseVectorFieldMapper.SparseVectorIndexOptions expectedIndexOptions = SparseVectorFieldMapper.SparseVectorIndexOptions
+                .getDefaultIndexOptions(indexVersion);
+            SemanticTextIndexOptions semanticTextIndexOptions = expectedIndexOptions == null
+                ? null
+                : new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR, expectedIndexOptions);
+            var mapperService = createMapperServiceWithIndexVersion(fieldMapping(b -> {
                 b.field("type", "text");
                 b.startObject("fields");
                 b.startObject("semantic");
@@ -426,10 +444,10 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 b.endObject();
                 b.endObject();
                 b.endObject();
-            }), useLegacyFormat);
-            assertSemanticTextField(mapperService, "field.semantic", true, null, null);
+            }), useLegacyFormat, indexVersion);
+            assertSemanticTextField(mapperService, "field.semantic", true, null, semanticTextIndexOptions);
 
-            mapperService = createMapperService(fieldMapping(b -> {
+            mapperService = createMapperServiceWithIndexVersion(fieldMapping(b -> {
                 b.field("type", "semantic_text");
                 b.field("inference_id", "my_inference_id");
                 b.startObject("model_settings");
@@ -440,10 +458,10 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 b.field("type", "text");
                 b.endObject();
                 b.endObject();
-            }), useLegacyFormat);
-            assertSemanticTextField(mapperService, "field", true, null, null);
+            }), useLegacyFormat, indexVersion);
+            assertSemanticTextField(mapperService, "field", true, null, semanticTextIndexOptions);
 
-            mapperService = createMapperService(fieldMapping(b -> {
+            mapperService = createMapperServiceWithIndexVersion(fieldMapping(b -> {
                 b.field("type", "semantic_text");
                 b.field("inference_id", "my_inference_id");
                 b.startObject("model_settings");
@@ -458,9 +476,9 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 b.endObject();
                 b.endObject();
                 b.endObject();
-            }), useLegacyFormat);
-            assertSemanticTextField(mapperService, "field", true, null, null);
-            assertSemanticTextField(mapperService, "field.semantic", true, null, null);
+            }), useLegacyFormat, indexVersion);
+            assertSemanticTextField(mapperService, "field", true, null, semanticTextIndexOptions);
+            assertSemanticTextField(mapperService, "field.semantic", true, null, semanticTextIndexOptions);
 
             Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> {
                 b.field("type", "semantic_text");
@@ -472,7 +490,6 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 b.endObject();
             }), useLegacyFormat));
             assertThat(e.getMessage(), containsString("is already used by another field"));
-
         }
     }
 
@@ -504,7 +521,8 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 inferenceId,
                 new MinimalServiceSettings("service", TaskType.SPARSE_EMBEDDING, null, null, null)
             );
-            assertSemanticTextField(mapperService, fieldName, true, null, null);
+            var expectedIndexOptions = getDefaultSparseVectorIndexOptionsForMapper(mapperService);
+            assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             assertInferenceEndpoints(mapperService, fieldName, inferenceId, inferenceId);
         }
 
@@ -515,7 +533,8 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 searchInferenceId,
                 new MinimalServiceSettings("service", TaskType.SPARSE_EMBEDDING, null, null, null)
             );
-            assertSemanticTextField(mapperService, fieldName, true, null, null);
+            var expectedIndexOptions = getDefaultSparseVectorIndexOptionsForMapper(mapperService);
+            assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             assertInferenceEndpoints(mapperService, fieldName, inferenceId, searchInferenceId);
         }
     }
@@ -559,14 +578,16 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                             .endObject()
                     )
                 );
-                assertSemanticTextField(mapperService, fieldName, true, null, null);
+                var expectedIndexOptions = getDefaultSparseVectorIndexOptionsForMapper(mapperService);
+                assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             }
             {
                 merge(
                     mapperService,
                     mapping(b -> b.startObject(fieldName).field("type", "semantic_text").field("inference_id", "test_model").endObject())
                 );
-                assertSemanticTextField(mapperService, fieldName, true, null, null);
+                var expectedIndexOptions = getDefaultSparseVectorIndexOptionsForMapper(mapperService);
+                assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             }
             {
                 Exception exc = expectThrows(
@@ -614,6 +635,87 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         }
     }
 
+    /**
+     * Appends a {@code model_settings} object whose {@code task_type} is {@link TaskType#SPARSE_EMBEDDING}
+     * to the mapping being built.
+     *
+     * @param b builder positioned inside the field definition
+     * @throws IOException if writing to the builder fails
+     */
+    private void addSparseVectorModelSettingsToBuilder(XContentBuilder b) throws IOException {
+        b.startObject("model_settings").field("task_type", TaskType.SPARSE_EMBEDDING).endObject();
+    }
+
+    /**
+     * Maps sparse-embedding {@code semantic_text} fields (names from {@code randomFieldName} for depths 1 to 4),
+     * optionally with random sparse vector index options, and checks that the mapping is accepted and that
+     * the field carries the expected options.
+     */
+    public void testSparseVectorIndexOptionsValidationAndMapping() throws IOException {
+        final String inferenceId = "test_model";
+        for (int depth = 1; depth < 5; depth++) {
+            String fieldName = randomFieldName(depth);
+            IndexVersion indexVersion = SparseVectorFieldMapperTests.getIndexOptionsCompatibleIndexVersion();
+            var randomOptions = SparseVectorFieldTypeTests.randomSparseVectorIndexOptions();
+
+            // Building the mapper service must succeed, with or without explicit index options.
+            MapperService mapperService = createMapperServiceWithIndexVersion(mapping(b -> {
+                b.startObject(fieldName);
+                b.field("type", SemanticTextFieldMapper.CONTENT_TYPE).field(INFERENCE_ID_FIELD, inferenceId);
+                addSparseVectorModelSettingsToBuilder(b);
+                if (randomOptions != null) {
+                    b.startObject(INDEX_OPTIONS_FIELD).field(SparseVectorFieldMapper.CONTENT_TYPE);
+                    randomOptions.toXContent(b, null);
+                    b.endObject();
+                }
+                b.endObject();
+            }), useLegacyFormat, indexVersion);
+
+            // With no explicit options the expectation stays null.
+            SemanticTextIndexOptions expected = null;
+            if (randomOptions != null) {
+                expected = new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR, randomOptions);
+            }
+            assertSemanticTextField(mapperService, fieldName, true, null, expected);
+        }
+    }
+
+    /**
+     * Checks that a sparse-embedding {@code semantic_text} field can be created and then merged with different
+     * chunking settings and index options, and that the mapper carries the expected index options after each step.
+     * Whenever index options are omitted, the defaults for the index version are expected instead.
+     */
+    public void testSparseVectorMappingUpdate() throws IOException {
+        final String fieldName = "field";
+        for (int i = 0; i < 5; i++) {
+            Model model = TestModel.createRandomInstance(TaskType.SPARSE_EMBEDDING);
+            // Every inference id resolves to this iteration's model on the spied registry.
+            when(globalModelRegistry.getMinimalServiceSettings(anyString())).thenAnswer(
+                invocation -> new MinimalServiceSettings(model)
+            );
+
+            final ChunkingSettings chunkingSettings = generateRandomChunkingSettings(false);
+            final IndexVersion indexVersion = SparseVectorFieldMapperTests.getIndexOptionsCompatibleIndexVersion();
+            final SemanticTextIndexOptions indexOptions = randomSemanticTextIndexOptions(TaskType.SPARSE_EMBEDDING);
+
+            MapperService mapperService = createMapperServiceWithIndexVersion(
+                mapping(b -> addSemanticTextMapping(b, fieldName, model.getInferenceEntityId(), null, chunkingSettings, indexOptions)),
+                useLegacyFormat,
+                indexVersion
+            );
+            // Reuse the shared helper rather than rebuilding the default wrapper inline.
+            final SemanticTextIndexOptions expectedInitialOptions = indexOptions == null
+                ? defaultSparseVectorIndexOptions(indexVersion)
+                : indexOptions;
+            assertSemanticTextField(mapperService, fieldName, false, chunkingSettings, expectedInitialOptions);
+
+            // Draw order (index options first, then chunking settings) is kept so seeded runs stay reproducible.
+            final SemanticTextIndexOptions newIndexOptions = randomSemanticTextIndexOptions(TaskType.SPARSE_EMBEDDING);
+            final ChunkingSettings newChunkingSettings = generateRandomChunkingSettingsOtherThan(chunkingSettings);
+            merge(
+                mapperService,
+                mapping(b -> addSemanticTextMapping(b, fieldName, model.getInferenceEntityId(), null, newChunkingSettings, newIndexOptions))
+            );
+            final SemanticTextIndexOptions expectedMergedOptions = newIndexOptions == null
+                ? defaultSparseVectorIndexOptions(indexVersion)
+                : newIndexOptions;
+            assertSemanticTextField(mapperService, fieldName, false, newChunkingSettings, expectedMergedOptions);
+        }
+    }
+
     public void testUpdateSearchInferenceId() throws IOException {
         final String inferenceId = "test_inference_id";
         final String searchInferenceId1 = "test_search_inference_id_1";
@@ -650,27 +752,24 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 inferenceId,
                 new MinimalServiceSettings("my-service", TaskType.SPARSE_EMBEDDING, null, null, null)
             );
-            assertSemanticTextField(mapperService, fieldName, true, null, null);
+            var expectedIndexOptions = getDefaultSparseVectorIndexOptionsForMapper(mapperService);
+            assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             assertInferenceEndpoints(mapperService, fieldName, inferenceId, inferenceId);
 
             merge(mapperService, buildMapping.apply(fieldName, searchInferenceId1));
-            assertSemanticTextField(mapperService, fieldName, true, null, null);
+            assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             assertInferenceEndpoints(mapperService, fieldName, inferenceId, searchInferenceId1);
 
             merge(mapperService, buildMapping.apply(fieldName, searchInferenceId2));
-            assertSemanticTextField(mapperService, fieldName, true, null, null);
+            assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             assertInferenceEndpoints(mapperService, fieldName, inferenceId, searchInferenceId2);
 
             merge(mapperService, buildMapping.apply(fieldName, null));
-            assertSemanticTextField(mapperService, fieldName, true, null, null);
+            assertSemanticTextField(mapperService, fieldName, true, null, expectedIndexOptions);
             assertInferenceEndpoints(mapperService, fieldName, inferenceId, inferenceId);
         }
     }
 
-    private static void assertSemanticTextField(MapperService mapperService, String fieldName, boolean expectedModelSettings) {
-        assertSemanticTextField(mapperService, fieldName, expectedModelSettings, null, null);
-    }
-
     private static void assertSemanticTextField(
         MapperService mapperService,
         String fieldName,
@@ -720,9 +819,20 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
             switch (semanticFieldMapper.fieldType().getModelSettings().taskType()) {
                 case SPARSE_EMBEDDING -> {
                     assertThat(embeddingsMapper, instanceOf(SparseVectorFieldMapper.class));
-                    SparseVectorFieldMapper sparseMapper = (SparseVectorFieldMapper) embeddingsMapper;
-                    assertEquals(sparseMapper.fieldType().isStored(), semanticTextFieldType.useLegacyFormat() == false);
-                    assertNull(expectedIndexOptions);
+                    SparseVectorFieldMapper sparseVectorFieldMapper = (SparseVectorFieldMapper) embeddingsMapper;
+                    assertEquals(sparseVectorFieldMapper.fieldType().isStored(), semanticTextFieldType.useLegacyFormat() == false);
+
+                    SparseVectorFieldMapper.SparseVectorIndexOptions applied = sparseVectorFieldMapper.fieldType().getIndexOptions();
+                    SparseVectorFieldMapper.SparseVectorIndexOptions expected = expectedIndexOptions == null
+                        ? null
+                        : (SparseVectorFieldMapper.SparseVectorIndexOptions) expectedIndexOptions.indexOptions();
+                    if (expected == null && applied != null) {
+                        var indexVersionCreated = mapperService.getIndexSettings().getIndexVersionCreated();
+                        if (SparseVectorFieldMapper.SparseVectorIndexOptions.isDefaultOptions(applied, indexVersionCreated)) {
+                            expected = SparseVectorFieldMapper.SparseVectorIndexOptions.getDefaultIndexOptions(indexVersionCreated);
+                        }
+                    }
+                    assertEquals(expected, applied);
                 }
                 case TEXT_EMBEDDING -> {
                     assertThat(embeddingsMapper, instanceOf(DenseVectorFieldMapper.class));
@@ -763,6 +873,8 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
 
     public void testSuccessfulParse() throws IOException {
         for (int depth = 1; depth < 4; depth++) {
+            final IndexVersion indexVersion = SemanticInferenceMetadataFieldsMapperTests.getRandomCompatibleIndexVersion(useLegacyFormat);
+
             final String fieldName1 = randomFieldName(depth);
             final String fieldName2 = randomFieldName(depth + 1);
             final String searchInferenceId = randomAlphaOfLength(8);
@@ -771,6 +883,18 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
             TaskType taskType = TaskType.SPARSE_EMBEDDING;
             Model model1 = TestModel.createRandomInstance(taskType);
             Model model2 = TestModel.createRandomInstance(taskType);
+
+            when(globalModelRegistry.getMinimalServiceSettings(anyString())).thenAnswer(invocation -> {
+                var modelId = (String) invocation.getArguments()[0];
+                if (modelId.equals(model1.getInferenceEntityId())) {
+                    return new MinimalServiceSettings(model1);
+                }
+                if (modelId.equals(model2.getInferenceEntityId())) {
+                    return new MinimalServiceSettings(model2);
+                }
+                return null;
+            });
+
             ChunkingSettings chunkingSettings = null; // Some chunking settings configs can produce different Lucene docs counts
             SemanticTextIndexOptions indexOptions = randomSemanticTextIndexOptions(taskType);
             XContentBuilder mapping = mapping(b -> {
@@ -792,15 +916,22 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
                 );
             });
 
-            MapperService mapperService = createMapperService(mapping, useLegacyFormat);
-            assertSemanticTextField(mapperService, fieldName1, false, null, null);
+            var expectedIndexOptions = (indexOptions == null)
+                ? new SemanticTextIndexOptions(
+                    SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR,
+                    SparseVectorFieldMapper.SparseVectorIndexOptions.getDefaultIndexOptions(indexVersion)
+                )
+                : indexOptions;
+
+            MapperService mapperService = createMapperServiceWithIndexVersion(mapping, useLegacyFormat, indexVersion);
+            assertSemanticTextField(mapperService, fieldName1, false, null, expectedIndexOptions);
             assertInferenceEndpoints(
                 mapperService,
                 fieldName1,
                 model1.getInferenceEntityId(),
                 setSearchInferenceId ? searchInferenceId : model1.getInferenceEntityId()
             );
-            assertSemanticTextField(mapperService, fieldName2, false, null, null);
+            assertSemanticTextField(mapperService, fieldName2, false, null, expectedIndexOptions);
             assertInferenceEndpoints(
                 mapperService,
                 fieldName2,
@@ -1015,24 +1146,19 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
 
     public void testSettingAndUpdatingChunkingSettings() throws IOException {
         Model model = TestModel.createRandomInstance(TaskType.SPARSE_EMBEDDING);
+        when(globalModelRegistry.getMinimalServiceSettings(anyString())).thenAnswer(
+            invocation -> { return new MinimalServiceSettings(model); }
+        );
+
         final ChunkingSettings chunkingSettings = generateRandomChunkingSettings(false);
-        final SemanticTextIndexOptions indexOptions = null;
+        final SemanticTextIndexOptions indexOptions = randomSemanticTextIndexOptions(TaskType.SPARSE_EMBEDDING);
         String fieldName = "field";
 
-        SemanticTextField randomSemanticText = randomSemanticText(
-            useLegacyFormat,
-            fieldName,
-            model,
-            chunkingSettings,
-            List.of("a"),
-            XContentType.JSON
-        );
-
         MapperService mapperService = createMapperService(
             mapping(b -> addSemanticTextMapping(b, fieldName, model.getInferenceEntityId(), null, chunkingSettings, indexOptions)),
             useLegacyFormat
         );
-        assertSemanticTextField(mapperService, fieldName, false, chunkingSettings, null);
+        assertSemanticTextField(mapperService, fieldName, false, chunkingSettings, indexOptions);
 
         ChunkingSettings newChunkingSettings = generateRandomChunkingSettingsOtherThan(chunkingSettings);
         merge(
@@ -1046,6 +1172,11 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         // Create inference results where model settings are set to null and chunks are provided
         TaskType taskType = TaskType.SPARSE_EMBEDDING;
         Model model = TestModel.createRandomInstance(taskType);
+
+        when(globalModelRegistry.getMinimalServiceSettings(anyString())).thenAnswer(
+            invocation -> { return new MinimalServiceSettings(model); }
+        );
+
         ChunkingSettings chunkingSettings = generateRandomChunkingSettings(false);
         SemanticTextIndexOptions indexOptions = randomSemanticTextIndexOptions(taskType);
         SemanticTextField randomSemanticText = randomSemanticText(
@@ -1196,6 +1327,13 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
         );
     }
 
+    /**
+     * Wraps the default sparse vector index options for the given index version as SPARSE_VECTOR
+     * semantic text index options. The looked-up defaults are wrapped as-is, without a null check.
+     *
+     * @param indexVersion index version used to look up the defaults
+     * @return the wrapped defaults
+     */
+    private static SemanticTextIndexOptions defaultSparseVectorIndexOptions(IndexVersion indexVersion) {
+        var versionDefaults = SparseVectorFieldMapper.SparseVectorIndexOptions.getDefaultIndexOptions(indexVersion);
+        return new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR, versionDefaults);
+    }
+
     public void testDefaultIndexOptions() throws IOException {
 
         // We default to BBQ for eligible dense vectors
@@ -1318,6 +1456,42 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
             IndexVersionUtils.getPreviousVersion(IndexVersions.SEMANTIC_TEXT_DEFAULTS_TO_BBQ_BACKPORT_8_X)
         );
         assertSemanticTextField(mapperService, "field", true, null, defaultDenseVectorSemanticIndexOptions());
+
+        mapperService = createMapperService(fieldMapping(b -> {
+            b.field("type", "semantic_text");
+            b.field("inference_id", "another_inference_id");
+            b.startObject("model_settings");
+            b.field("task_type", "sparse_embedding");
+            b.endObject();
+        }),
+            useLegacyFormat,
+            IndexVersionUtils.getPreviousVersion(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT),
+            IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT
+        );
+
+        assertSemanticTextField(
+            mapperService,
+            "field",
+            true,
+            null,
+            defaultSparseVectorIndexOptions(mapperService.getIndexSettings().getIndexVersionCreated())
+        );
+    }
+
+    /**
+     * For an index version between INFERENCE_METADATA_FIELDS and the version just before
+     * SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, a sparse-embedding field is asserted with no expected index options.
+     */
+    public void testSparseVectorIndexOptionsDefaultsBeforeSupport() throws IOException {
+        IndexVersion minVersion = IndexVersions.INFERENCE_METADATA_FIELDS;
+        IndexVersion maxVersion = IndexVersionUtils.getPreviousVersion(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT);
+
+        var mapperService = createMapperService(
+            fieldMapping(
+                b -> b.field("type", "semantic_text")
+                    .field("inference_id", "another_inference_id")
+                    .startObject("model_settings")
+                    .field("task_type", "sparse_embedding")
+                    .endObject()
+            ),
+            useLegacyFormat,
+            minVersion,
+            maxVersion
+        );
+
+        assertSemanticTextField(mapperService, "field", true, null, null);
     }
 
     public void testSpecifiedDenseVectorIndexOptions() throws IOException {
@@ -1428,7 +1602,74 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
             b.endObject();
         }), useLegacyFormat, IndexVersions.INFERENCE_METADATA_FIELDS_BACKPORT));
         assertThat(e.getMessage(), containsString("Unsupported index options type invalid"));
+    }
+
+    /**
+     * Maps a sparse-embedding {@code semantic_text} field with explicitly configured (random) sparse vector
+     * index options and checks that exactly those options are applied to the field.
+     */
+    public void testSpecificSparseVectorIndexOptions() throws IOException {
+        for (int i = 0; i < 10; i++) {
+            SparseVectorFieldMapper.SparseVectorIndexOptions configuredOptions = randomSparseVectorIndexOptions(false);
+            var mapperService = createMapperService(fieldMapping(b -> {
+                b.field("type", SemanticTextFieldMapper.CONTENT_TYPE).field(INFERENCE_ID_FIELD, "test_inference_id");
+                addSparseVectorModelSettingsToBuilder(b);
+                b.startObject(INDEX_OPTIONS_FIELD).field(SparseVectorFieldMapper.CONTENT_TYPE);
+                configuredOptions.toXContent(b, null);
+                b.endObject();
+            }), useLegacyFormat, IndexVersions.INFERENCE_METADATA_FIELDS_BACKPORT);
 
+            var expected = new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR, configuredOptions);
+            assertSemanticTextField(mapperService, "field", true, null, expected);
+        }
+    }
+
+    /**
+     * Invalid sparse vector index options must fail mapping creation with a {@link MapperParsingException}:
+     * first a {@code pruning_config} supplied together with {@code prune=false}, then a
+     * {@code tokens_freq_ratio_threshold} of 1000, which the nested error message reports as out of range.
+     */
+    public void testSparseVectorIndexOptionsValidations() throws IOException {
+        final String ratioThresholdField = TokenPruningConfig.TOKENS_FREQ_RATIO_THRESHOLD.getPreferredName();
+
+        Exception pruningDisabledFailure = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> {
+            b.field("type", SemanticTextFieldMapper.CONTENT_TYPE).field(INFERENCE_ID_FIELD, "test_inference_id");
+            b.startObject(INDEX_OPTIONS_FIELD).startObject(SparseVectorFieldMapper.CONTENT_TYPE);
+            b.field("prune", false);
+            b.startObject("pruning_config").field(ratioThresholdField, 5.0f).endObject();
+            b.endObject().endObject();
+        }), useLegacyFormat, IndexVersions.INFERENCE_METADATA_FIELDS_BACKPORT));
+        assertThat(pruningDisabledFailure.getMessage(), containsString("failed to parse field [pruning_config]"));
+
+        Exception outOfRangeFailure = expectThrows(MapperParsingException.class, () -> createMapperService(fieldMapping(b -> {
+            b.field("type", SemanticTextFieldMapper.CONTENT_TYPE).field(INFERENCE_ID_FIELD, "test_inference_id");
+            b.startObject(INDEX_OPTIONS_FIELD).startObject(SparseVectorFieldMapper.CONTENT_TYPE);
+            b.field("prune", true);
+            b.startObject("pruning_config").field(ratioThresholdField, 1000.0f).endObject();
+            b.endObject().endObject();
+        }), useLegacyFormat, IndexVersions.INFERENCE_METADATA_FIELDS_BACKPORT));
+        // The range message sits four causes below the top-level parsing exception.
+        Throwable nestedCause = outOfRangeFailure.getCause().getCause().getCause().getCause();
+        assertThat(nestedCause.getMessage(), containsString("[tokens_freq_ratio_threshold] must be between [1] and [100], got 1000.0"));
     }
 
     public static SemanticTextIndexOptions randomSemanticTextIndexOptions() {
@@ -1437,13 +1678,21 @@ public class SemanticTextFieldMapperTests extends MapperTestCase {
     }
 
     public static SemanticTextIndexOptions randomSemanticTextIndexOptions(TaskType taskType) {
-
+        // Text embeddings: randomly either no options (null) or random dense vector index options.
         if (taskType == TaskType.TEXT_EMBEDDING) {
             return randomBoolean()
                 ? null
                 : new SemanticTextIndexOptions(SemanticTextIndexOptions.SupportedIndexOptions.DENSE_VECTOR, randomIndexOptionsAll());
         }
 
+        // Sparse embeddings: randomly either no options (null) or random sparse vector index options.
+        if (taskType == TaskType.SPARSE_EMBEDDING) {
+            return randomBoolean()
+                ? null
+                : new SemanticTextIndexOptions(
+                    SemanticTextIndexOptions.SupportedIndexOptions.SPARSE_VECTOR,
+                    randomSparseVectorIndexOptions(false)
+                );
+        }
+
+        // Any other task type gets no index options.
         return null;
     }
 

+ 25 - 15
x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/model/TestModel.java

@@ -52,13 +52,10 @@ public class TestModel extends Model {
     }
 
     public static TestModel createRandomInstance(TaskType taskType, List<SimilarityMeasure> excludedSimilarities, int maxDimensions) {
-        var elementType = taskType == TaskType.TEXT_EMBEDDING ? randomFrom(DenseVectorFieldMapper.ElementType.values()) : null;
-        var dimensions = taskType == TaskType.TEXT_EMBEDDING
-            ? DenseVectorFieldMapperTestUtils.randomCompatibleDimensions(elementType, maxDimensions)
-            : null;
-
-        SimilarityMeasure similarity = null;
         if (taskType == TaskType.TEXT_EMBEDDING) {
+            var elementType = randomFrom(DenseVectorFieldMapper.ElementType.values());
+            var dimensions = DenseVectorFieldMapperTestUtils.randomCompatibleDimensions(elementType, maxDimensions);
+
             List<SimilarityMeasure> supportedSimilarities = new ArrayList<>(
                 DenseVectorFieldMapperTestUtils.getSupportedSimilarities(elementType)
             );
@@ -75,17 +72,30 @@ public class TestModel extends Model {
                 );
             }
 
-            similarity = randomFrom(supportedSimilarities);
+            SimilarityMeasure similarity = randomFrom(supportedSimilarities);
+
+            return new TestModel(
+                randomAlphaOfLength(4),
+                TaskType.TEXT_EMBEDDING,
+                randomAlphaOfLength(10),
+                new TestModel.TestServiceSettings(randomAlphaOfLength(4), dimensions, similarity, elementType),
+                new TestModel.TestTaskSettings(randomInt(3)),
+                new TestModel.TestSecretSettings(randomAlphaOfLength(4))
+            );
+        }
+
+        if (taskType == TaskType.SPARSE_EMBEDDING) {
+            return new TestModel(
+                randomAlphaOfLength(4),
+                TaskType.SPARSE_EMBEDDING,
+                randomAlphaOfLength(10),
+                new TestModel.TestServiceSettings(randomAlphaOfLength(4), null, null, null),
+                new TestModel.TestTaskSettings(randomInt(3)),
+                new TestModel.TestSecretSettings(randomAlphaOfLength(4))
+            );
         }
 
-        return new TestModel(
-            randomAlphaOfLength(4),
-            taskType,
-            randomAlphaOfLength(10),
-            new TestModel.TestServiceSettings(randomAlphaOfLength(4), dimensions, similarity, elementType),
-            new TestModel.TestTaskSettings(randomInt(3)),
-            new TestModel.TestSecretSettings(randomAlphaOfLength(4))
-        );
+        throw new IllegalArgumentException("Unsupported task type [" + taskType + "]");
     }
 
     public TestModel(

+ 475 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping.yml

@@ -977,3 +977,478 @@ setup:
 
   - not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options
 
+---
+"Users can set sparse vector index options and index documents using those options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      indices.create:
+        index: test-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 18.0
+                      tokens_weight_threshold: 0.6
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 18.0 }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.6 }
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_1
+        body:
+          semantic_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings:
+                      dr: 1.6103356
+                      these: 1.1396849
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings:
+                      free: 1.693662
+                      around: 1.4376559
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 18.0 }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.6 }
+
+---
+"Specifying invalid sparse vector index options will fail":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /\[index_options\] unknown field \[ef_construction\]/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    ef_construction: 100
+
+  - match: { status: 400 }
+
+  - do:
+      catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: false
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 18.0
+                      tokens_weight_threshold: 0.6
+
+  - match: { status: 400 }
+
+  - do:
+      catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 101
+                      tokens_weight_threshold: 0.6
+
+  - match: { status: 400 }
+
+  - do:
+      catch: /unknown field \[some_other_param\]/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 18.0
+                      tokens_weight_threshold: 0.6
+                      some_other_param: true
+
+  - match: { status: 400 }
+
+---
+"Specifying sparse vector index options should fail using dense index options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /Invalid task type for index options/
+      indices.create:
+        index: my-custom-semantic-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  dense_vector:
+                    type: bbq_hnsw
+                    m: 16
+                    ef_construction: 100
+
+  - match: { status: 400 }
+
+---
+"Specifying dense vector index options should fail using sparse index options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /Invalid task type for index options/
+      indices.create:
+        index: my-custom-semantic-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: false
+
+  - match: { status: 400 }
+
+---
+"Specifying sparse vector index options requires sparse vector model":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /Model settings must be set to validate index options/
+      indices.create:
+        index: should-be-invalid-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: nonexistent-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: false
+
+  - match: { status: 400 }
+
+---
+"Updating sparse vector index options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      indices.create:
+        index: test-index-options
+        body:
+          settings:
+            number_of_shards: 1
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 1.0
+                      tokens_weight_threshold: 1.0
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 1.0 }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 1.0 }
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_1
+        refresh: true
+        body:
+          semantic_field: "cheese is comet"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 67
+                    embeddings:
+                      feature_0: 2.671405
+                      feature_1: 0.11809908
+                      feature_2: 0.26088917
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_2
+        refresh: true
+        body:
+          semantic_field: "planet is astronomy moon"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 67
+                    embeddings:
+                      feature_0: 2.3438394
+                      feature_1: 0.54600334
+                      feature_2: 0.36015007
+                      feature_3: 0.20022368
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_3
+        refresh: true
+        body:
+          semantic_field: "is globe ocean underground"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 67
+                    embeddings:
+                      feature_0: 0.6891394
+                      feature_1: 0.484035
+                      feature_2: 0.080102935
+                      feature_3: 0.053516876
+
+  - do:
+      search:
+        index: test-index-options
+        body:
+          query:
+            semantic:
+              field: "semantic_field"
+              query: "test query"
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._id: "doc_2" }
+  - match: { hits.hits.1._id: "doc_3" }
+
+  - do:
+      indices.put_mapping:
+        index: test-index-options
+        body:
+          properties:
+            semantic_field:
+              type: semantic_text
+              inference_id: sparse-inference-id
+              index_options:
+                sparse_vector:
+                  prune: false
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": false }
+  - not_exists: "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold"
+  - not_exists: "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold"
+
+  - do:
+      search:
+        index: test-index-options
+        body:
+          query:
+            semantic:
+              field: "semantic_field"
+              query: "test query"
+
+  - match: { hits.total.value: 3 }
+  - match: { hits.hits.0._id: "doc_2" }
+  - match: { hits.hits.1._id: "doc_1" }
+  - match: { hits.hits.2._id: "doc_3" }
+
+
+---
+"Displaying default sparse vector index_options with and without include_defaults":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      indices.create:
+        index: test-index-options-sparse
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options-sparse
+
+  - not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options
+
+  - do:
+      indices.get_field_mapping:
+        index: test-index-options-sparse
+        fields: semantic_field
+        include_defaults: true
+
+  - match: { "test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 5.0 }
+  - match: { "test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.4 }
+
+  # Validate that actually specifying the same values as our defaults will still serialize the user provided index_options
+  - do:
+      indices.create:
+        index: test-index-options-sparse2
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 5.0
+                      tokens_weight_threshold: 0.4
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options-sparse2
+
+  - match: { "test-index-options-sparse2.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options-sparse2.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 5.0 }
+  - match: { "test-index-options-sparse2.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.4 }
+
+  - do:
+      indices.get_field_mapping:
+        index: test-index-options-sparse2
+        fields: semantic_field
+        include_defaults: true
+
+  - match: { "test-index-options-sparse2.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options-sparse2.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 5.0 }
+  - match: { "test-index-options-sparse2.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.4 }

+ 475 - 0
x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/10_semantic_text_field_mapping_bwc.yml

@@ -879,3 +879,478 @@ setup:
 
   - not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options
 
+---
+"Users can set sparse vector index options and index documents using those options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      indices.create:
+        index: test-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 18.0
+                      tokens_weight_threshold: 0.6
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 18.0 }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.6 }
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_1
+        body:
+          semantic_field: "these are not the droids you're looking for. He's free to go around"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 44
+                    embeddings:
+                      dr: 1.6103356
+                      these: 1.1396849
+                  - start_offset: 44
+                    end_offset: 67
+                    embeddings:
+                      free: 1.693662
+                      around: 1.4376559
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 18.0 }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.6 }
+
+---
+"Specifying invalid sparse vector index options will fail":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /\[index_options\] unknown field \[ef_construction\]/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    ef_construction: 100
+
+  - match: { status: 400 }
+
+  - do:
+      catch: /\[index_options\] field \[pruning_config\] should only be set if \[prune\] is set to true/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: false
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 18.0
+                      tokens_weight_threshold: 0.6
+
+  - match: { status: 400 }
+
+  - do:
+      catch: /\[tokens_freq_ratio_threshold\] must be between \[1\] and \[100\]/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 101
+                      tokens_weight_threshold: 0.6
+
+  - match: { status: 400 }
+
+  - do:
+      catch: /unknown field \[some_other_param\]/
+      indices.create:
+        index: test-incompatible-index-options
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 18.0
+                      tokens_weight_threshold: 0.6
+                      some_other_param: true
+
+  - match: { status: 400 }
+
+---
+"Specifying sparse vector index options should fail using dense index options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /Invalid task type for index options/
+      indices.create:
+        index: my-custom-semantic-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  dense_vector:
+                    type: bbq_hnsw
+                    m: 16
+                    ef_construction: 100
+
+  - match: { status: 400 }
+
+---
+"Specifying dense vector index options should fail using sparse index options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /Invalid task type for index options/
+      indices.create:
+        index: my-custom-semantic-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: false
+
+  - match: { status: 400 }
+
+---
+"Specifying sparse vector index options requires sparse vector model":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      catch: /Model settings must be set to validate index options/
+      indices.create:
+        index: should-be-invalid-index
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: nonexistent-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: false
+
+  - match: { status: 400 }
+
+---
+"Updating sparse vector index options":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      indices.create:
+        index: test-index-options
+        body:
+          settings:
+            number_of_shards: 1
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 1.0
+                      tokens_weight_threshold: 1.0
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 1.0 }
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 1.0 }
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_1
+        refresh: true
+        body:
+          semantic_field: "cheese is comet"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 67
+                    embeddings:
+                      feature_0: 2.671405
+                      feature_1: 0.11809908
+                      feature_2: 0.26088917
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_2
+        refresh: true
+        body:
+          semantic_field: "planet is astronomy moon"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 67
+                    embeddings:
+                      feature_0: 2.3438394
+                      feature_1: 0.54600334
+                      feature_2: 0.36015007
+                      feature_3: 0.20022368
+
+  - do:
+      index:
+        index: test-index-options
+        id: doc_3
+        refresh: true
+        body:
+          semantic_field: "is globe ocean underground"
+          _inference_fields.semantic_field:
+            inference:
+              inference_id: sparse-inference-id
+              model_settings:
+                task_type: sparse_embedding
+              chunks:
+                semantic_field:
+                  - start_offset: 0
+                    end_offset: 67
+                    embeddings:
+                      feature_0: 0.6891394
+                      feature_1: 0.484035
+                      feature_2: 0.080102935
+                      feature_3: 0.053516876
+
+  - do:
+      search:
+        index: test-index-options
+        body:
+          query:
+            semantic:
+              field: "semantic_field"
+              query: "test query"
+
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0._id: "doc_2" }
+  - match: { hits.hits.1._id: "doc_3" }
+
+  - do:
+      indices.put_mapping:
+        index: test-index-options
+        body:
+          properties:
+            semantic_field:
+              type: semantic_text
+              inference_id: sparse-inference-id
+              index_options:
+                sparse_vector:
+                  prune: false
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options
+
+  - match: { "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.prune": false }
+  - not_exists: "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold"
+  - not_exists: "test-index-options.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold"
+
+  - do:
+      search:
+        index: test-index-options
+        body:
+          query:
+            semantic:
+              field: "semantic_field"
+              query: "test query"
+
+  - match: { hits.total.value: 3 }
+  - match: { hits.hits.0._id: "doc_2" }
+  - match: { hits.hits.1._id: "doc_1" }
+  - match: { hits.hits.2._id: "doc_3" }
+
+
+---
+"Displaying default sparse vector index_options with and without include_defaults":
+  - requires:
+      cluster_features: "semantic_text.sparse_vector_index_options"
+      reason: Index options for sparse vector introduced in 9.2.0
+
+  - do:
+      indices.create:
+        index: test-index-options-sparse
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options-sparse
+
+  # The get_mapping response nests fields under `<index>.mappings.properties.<field>`.
+  # Without include_defaults, the default sparse_vector index_options must not be serialized.
+  - not_exists: test-index-options-sparse.mappings.properties.semantic_field.index_options
+
+  - do:
+      indices.get_field_mapping:
+        index: test-index-options-sparse
+        fields: semantic_field
+        include_defaults: true
+
+  - match: { "test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 5.0 }
+  - match: { "test-index-options-sparse.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.4 }
+
+  # Validate that actually specifying the same values as our defaults will still serialize the user provided index_options
+  - do:
+      indices.create:
+        index: test-index-options-sparse2
+        body:
+          settings:
+            index:
+              mapping:
+                semantic_text:
+                  use_legacy_format: false
+          mappings:
+            properties:
+              semantic_field:
+                type: semantic_text
+                inference_id: sparse-inference-id
+                index_options:
+                  sparse_vector:
+                    prune: true
+                    pruning_config:
+                      tokens_freq_ratio_threshold: 5.0
+                      tokens_weight_threshold: 0.4
+
+  - do:
+      indices.get_mapping:
+        index: test-index-options-sparse2
+
+  - match: { "test-index-options-sparse2.mappings.properties.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options-sparse2.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 5.0 }
+  - match: { "test-index-options-sparse2.mappings.properties.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.4 }
+
+  - do:
+      indices.get_field_mapping:
+        index: test-index-options-sparse2
+        fields: semantic_field
+        include_defaults: true
+
+  - match: { "test-index-options-sparse2.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.prune": true }
+  - match: { "test-index-options-sparse2.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_freq_ratio_threshold": 5.0 }
+  - match: { "test-index-options-sparse2.mappings.semantic_field.mapping.semantic_field.index_options.sparse_vector.pruning_config.tokens_weight_threshold": 0.4 }