ソースを参照

[9.1] refactor(test): add sparse vector pruning tests (#132264) (#132799)

Michail Romaios 2 ヶ月 前
コミット
fc95457f55

+ 187 - 179
server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java

@@ -16,6 +16,7 @@ import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
@@ -38,6 +39,7 @@ import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.inference.WeightedToken;
 import org.elasticsearch.search.lookup.Source;
 import org.elasticsearch.search.vectors.SparseVectorQueryWrapper;
+import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.index.IndexVersionUtils;
 import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.XContentBuilder;
@@ -53,12 +55,17 @@ import java.util.Collection;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.NavigableSet;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
+import static org.elasticsearch.index.IndexVersions.NEW_SPARSE_VECTOR;
 import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT;
+import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X;
 import static org.elasticsearch.index.IndexVersions.UPGRADE_TO_LUCENE_10_0_0;
-import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.NEW_SPARSE_VECTOR_INDEX_VERSION;
-import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.PREVIOUS_SPARSE_VECTOR_INDEX_VERSION;
-import static org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION;
+import static org.elasticsearch.index.IndexVersions.V_8_0_0;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertToXContentEquivalent;
 import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder;
 import static org.hamcrest.Matchers.containsString;
@@ -67,6 +74,22 @@ import static org.hamcrest.Matchers.instanceOf;
 
 public class SparseVectorFieldMapperTests extends MapperTestCase {
 
+    public static final float STRICT_TOKENS_WEIGHT_THRESHOLD = 0.5f;
+    public static final float STRICT_TOKENS_FREQ_RATIO_THRESHOLD = 1;
+
+    private static final Map<String, Float> COMMON_TOKENS = Map.of(
+        "common1_drop_default",
+        0.1f,
+        "common2_drop_default",
+        0.1f,
+        "common3_drop_default",
+        0.1f
+    );
+
+    private static final Map<String, Float> MEDIUM_TOKENS = Map.of("medium1_keep_strict", 0.5f, "medium2_keep_default", 0.25f);
+
+    private static final Map<String, Float> RARE_TOKENS = Map.of("rare1_keep_strict", 0.9f, "rare2_keep_strict", 0.85f);
+
     @Override
     protected Object getSampleValueForDocument() {
         Map<String, Float> map = new LinkedHashMap<>();
@@ -122,7 +145,7 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
             b.field("prune", true);
             b.startObject("pruning_config");
             {
-                b.field("tokens_freq_ratio_threshold", 3.0f);
+                b.field("tokens_freq_ratio_threshold", 1.0f);
                 b.field("tokens_weight_threshold", 0.5f);
             }
             b.endObject();
@@ -177,6 +200,13 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
         b.endObject();
     }
 
+    private void mapping(XContentBuilder b, @Nullable Boolean prune, PruningConfig pruningConfig) throws IOException {
+        b.field("type", "sparse_vector");
+        if (prune != null) {
+            b.field("index_options", new SparseVectorFieldMapper.IndexOptions(prune, pruningConfig.tokenPruningConfig));
+        }
+    }
+
     @Override
     protected boolean supportsStoredFields() {
         return false;
@@ -262,7 +292,7 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
         IndexVersion indexVersion = IndexVersionUtils.randomVersionBetween(
             random(),
             UPGRADE_TO_LUCENE_10_0_0,
-            IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_VERSION)
+            IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)
         );
 
         XContentBuilder orig = JsonXContent.contentBuilder().startObject();
@@ -483,7 +513,7 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
     @Override
     protected String[] getParseMinimalWarnings(IndexVersion indexVersion) {
         String[] additionalWarnings = null;
-        if (indexVersion.before(PREVIOUS_SPARSE_VECTOR_INDEX_VERSION)) {
+        if (indexVersion.before(V_8_0_0)) {
             additionalWarnings = new String[] { SparseVectorFieldMapper.ERROR_MESSAGE_7X };
         }
         return Strings.concatStringArrays(super.getParseMinimalWarnings(indexVersion), additionalWarnings);
@@ -491,15 +521,11 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
 
     @Override
     protected IndexVersion boostNotAllowedIndexVersion() {
-        return NEW_SPARSE_VECTOR_INDEX_VERSION;
+        return NEW_SPARSE_VECTOR;
     }
 
     public void testSparseVectorUnsupportedIndex() {
-        IndexVersion version = IndexVersionUtils.randomVersionBetween(
-            random(),
-            PREVIOUS_SPARSE_VECTOR_INDEX_VERSION,
-            IndexVersions.FIRST_DETACHED_INDEX_VERSION
-        );
+        IndexVersion version = IndexVersionUtils.randomVersionBetween(random(), V_8_0_0, IndexVersions.FIRST_DETACHED_INDEX_VERSION);
         Exception e = expectThrows(MapperParsingException.class, () -> createMapperService(version, fieldMapping(b -> {
             b.field("type", "sparse_vector");
         })));
@@ -678,14 +704,60 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
         );
     }
 
+    private enum PruningScenario {
+        NO_PRUNING,      // No pruning applied - all tokens preserved
+        DEFAULT_PRUNING, // Default pruning configuration
+        STRICT_PRUNING   // Stricter pruning with higher thresholds
+    }
+
+    private enum PruningConfig {
+        NULL(null),
+        EXPLICIT_DEFAULT(new TokenPruningConfig()),
+        STRICT(new TokenPruningConfig(STRICT_TOKENS_FREQ_RATIO_THRESHOLD, STRICT_TOKENS_WEIGHT_THRESHOLD, false));
+
+        public final @Nullable TokenPruningConfig tokenPruningConfig;
+
+        PruningConfig(@Nullable TokenPruningConfig tokenPruningConfig) {
+            this.tokenPruningConfig = tokenPruningConfig;
+        }
+    }
+
+    private final Set<PruningOptions> validIndexPruningScenarios = Set.of(
+        new PruningOptions(false, PruningConfig.NULL),
+        new PruningOptions(true, PruningConfig.NULL),
+        new PruningOptions(true, PruningConfig.EXPLICIT_DEFAULT),
+        new PruningOptions(true, PruningConfig.STRICT),
+        new PruningOptions(null, PruningConfig.NULL)
+    );
+
+    private record PruningOptions(@Nullable Boolean prune, PruningConfig pruningConfig) {}
+
     private void withSearchExecutionContext(MapperService mapperService, CheckedConsumer<SearchExecutionContext, IOException> consumer)
         throws IOException {
         var mapper = mapperService.documentMapper();
         try (Directory directory = newDirectory()) {
             RandomIndexWriter iw = new RandomIndexWriter(random(), directory);
-            var sourceToParse = source(this::writeField);
-            ParsedDocument doc1 = mapper.parse(sourceToParse);
-            iw.addDocument(doc1.rootDoc());
+
+            int commonDocs = 20;
+            for (int i = 0; i < commonDocs; i++) {
+                iw.addDocument(mapper.parse(source(b -> b.field("field", COMMON_TOKENS))).rootDoc());
+            }
+
+            int mediumDocs = 5;
+            for (int i = 0; i < mediumDocs; i++) {
+                iw.addDocument(mapper.parse(source(b -> b.field("field", MEDIUM_TOKENS))).rootDoc());
+            }
+
+            iw.addDocument(mapper.parse(source(b -> b.field("field", RARE_TOKENS))).rootDoc());
+
+            // This will lower the averageTokenFreqRatio so that common tokens get pruned with default settings.
+            // Depending on how the index is created, we will have 30-37 numUniqueTokens
+            // this will result in an averageTokenFreqRatio of 0.1021 - 0.1259
+            Map<String, Float> uniqueDoc = new TreeMap<>();
+            for (int i = 0; i < 30; i++) {
+                uniqueDoc.put("unique" + i, 0.5f);
+            }
+            iw.addDocument(mapper.parse(source(b -> b.field("field", uniqueDoc))).rootDoc());
             iw.close();
 
             try (DirectoryReader reader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) {
@@ -695,203 +767,139 @@ public class SparseVectorFieldMapperTests extends MapperTestCase {
         }
     }
 
-    public void testTypeQueryFinalizationWithRandomOptions() throws Exception {
-        for (int i = 0; i < 20; i++) {
-            runTestTypeQueryFinalization(
-                randomBoolean(), // useIndexVersionBeforeIndexOptions
-                randomBoolean(), // useMapperDefaultIndexOptions
-                randomBoolean(), // setMapperIndexOptionsPruneToFalse
-                randomBoolean(), // queryOverridesPruningConfig
-                randomBoolean()  // queryOverridesPruneToBeFalse
+    public void testPruningScenarios() throws Exception {
+        for (int i = 0; i < 200; i++) {
+            assertPruningScenario(
+                randomFrom(validIndexPruningScenarios),
+                new PruningOptions(randomFrom(true, false, null), randomFrom(PruningConfig.values()))
             );
         }
     }
 
-    public void testTypeQueryFinalizationDefaultsCurrentVersion() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));
-
-        // query should be pruned by default on newer index versions
-        performTypeQueryFinalizationTest(mapperService, null, null, true);
+    private XContentBuilder getIndexMapping(PruningOptions pruningOptions) throws IOException {
+        return fieldMapping(b -> mapping(b, pruningOptions.prune, pruningOptions.pruningConfig));
     }
 
-    public void testTypeQueryFinalizationDefaultsPreviousVersion() throws Exception {
-        IndexVersion version = IndexVersionUtils.randomVersionBetween(
-            random(),
-            UPGRADE_TO_LUCENE_10_0_0,
-            IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)
-        );
-        MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));
-
-        // query should _not_ be pruned by default on older index versions
-        performTypeQueryFinalizationTest(mapperService, null, null, false);
-    }
-
-    public void testTypeQueryFinalizationWithIndexExplicit() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));
+    private void assertQueryContains(List<Query> expectedClauses, Query query) {
+        SparseVectorQueryWrapper queryWrapper = (SparseVectorQueryWrapper) query;
+        var termsQuery = queryWrapper.getTermsQuery();
+        assertNotNull(termsQuery);
+        var booleanQuery = (BooleanQuery) termsQuery;
 
-        // query should be pruned via explicit index options
-        performTypeQueryFinalizationTest(mapperService, null, null, true);
+        Collection<Query> shouldClauses = booleanQuery.getClauses(BooleanClause.Occur.SHOULD);
+        assertThat(shouldClauses, Matchers.containsInAnyOrder(expectedClauses.toArray()));
     }
 
-    public void testTypeQueryFinalizationWithIndexExplicitDoNotPrune() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
+    private PruningScenario getEffectivePruningScenario(
+        PruningOptions indexPruningOptions,
+        PruningOptions queryPruningOptions,
+        IndexVersion indexVersion
+    ) {
+        Boolean shouldPrune = queryPruningOptions.prune;
+        if (shouldPrune == null) {
+            shouldPrune = indexPruningOptions.prune;
+        }
 
-        // query should be pruned via explicit index options
-        performTypeQueryFinalizationTest(mapperService, null, null, false);
-    }
+        if (shouldPrune == null) {
+            shouldPrune = indexVersion.between(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, UPGRADE_TO_LUCENE_10_0_0)
+                || indexVersion.onOrAfter(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT);
+        }
 
-    public void testTypeQueryFinalizationQueryOverridesPruning() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
+        PruningScenario pruningScenario = PruningScenario.NO_PRUNING;
+        if (shouldPrune) {
+            PruningConfig pruningConfig = queryPruningOptions.pruningConfig;
+            if (pruningConfig == PruningConfig.NULL) {
+                pruningConfig = indexPruningOptions.pruningConfig;
+            }
+            pruningScenario = switch (pruningConfig) {
+                case STRICT -> PruningScenario.STRICT_PRUNING;
+                case EXPLICIT_DEFAULT, NULL -> PruningScenario.DEFAULT_PRUNING;
+            };
+        }
 
-        // query should still be pruned due to query builder setting it
-        performTypeQueryFinalizationTest(mapperService, true, new TokenPruningConfig(), true);
+        return pruningScenario;
     }
 
-    public void testTypeQueryFinalizationQueryOverridesPruningOff() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
+    private List<Query> getExpectedQueryClauses(
+        SparseVectorFieldMapper.SparseVectorFieldType ft,
+        PruningScenario pruningScenario,
+        SearchExecutionContext searchExecutionContext
+    ) {
+        List<WeightedToken> tokens = switch (pruningScenario) {
+            case NO_PRUNING -> QUERY_VECTORS;
+            case DEFAULT_PRUNING -> QUERY_VECTORS.stream()
+                .filter(t -> t.token().startsWith("rare") || t.token().startsWith("medium"))
+                .toList();
+            case STRICT_PRUNING -> QUERY_VECTORS.stream().filter(t -> t.token().endsWith("keep_strict")).toList();
+        };
 
-        // query should not pruned due to query builder setting it
-        performTypeQueryFinalizationTest(mapperService, false, null, false);
+        return tokens.stream().map(t -> {
+            Query termQuery = ft.termQuery(t.token(), searchExecutionContext);
+            return new BoostQuery(termQuery, t.weight());
+        }).collect(Collectors.toUnmodifiableList());
     }
 
-    private void performTypeQueryFinalizationTest(
-        MapperService mapperService,
-        @Nullable Boolean queryPrune,
-        @Nullable TokenPruningConfig queryTokenPruningConfig,
-        boolean queryShouldBePruned
-    ) throws IOException {
+    private void assertPruningScenario(PruningOptions indexPruningOptions, PruningOptions queryPruningOptions) throws IOException {
+        IndexVersion indexVersion = getIndexVersion();
+        MapperService mapperService = createMapperService(indexVersion, getIndexMapping(indexPruningOptions));
+        PruningScenario effectivePruningScenario = getEffectivePruningScenario(indexPruningOptions, queryPruningOptions, indexVersion);
         withSearchExecutionContext(mapperService, (context) -> {
             SparseVectorFieldMapper.SparseVectorFieldType ft = (SparseVectorFieldMapper.SparseVectorFieldType) mapperService.fieldType(
                 "field"
             );
-            Query finalizedQuery = ft.finalizeSparseVectorQuery(context, "field", QUERY_VECTORS, queryPrune, queryTokenPruningConfig);
-
-            if (queryShouldBePruned) {
-                assertQueryWasPruned(finalizedQuery);
-            } else {
-                assertQueryWasNotPruned(finalizedQuery);
-            }
+            List<Query> expectedQueryClauses = getExpectedQueryClauses(ft, effectivePruningScenario, context);
+            Query finalizedQuery = ft.finalizeSparseVectorQuery(
+                context,
+                "field",
+                QUERY_VECTORS,
+                queryPruningOptions.prune,
+                queryPruningOptions.pruningConfig.tokenPruningConfig
+            );
+            assertQueryContains(expectedQueryClauses, finalizedQuery);
         });
     }
 
-    private void assertQueryWasPruned(Query query) {
-        assertQueryHasClauseCount(query, 0);
+    private static IndexVersion getIndexVersion() {
+        VersionRange versionRange = randomFrom(VersionRange.values());
+        return versionRange.getRandomVersion();
     }
 
-    private void assertQueryWasNotPruned(Query query) {
-        assertQueryHasClauseCount(query, QUERY_VECTORS.size());
-    }
-
-    private void assertQueryHasClauseCount(Query query, int clauseCount) {
-        SparseVectorQueryWrapper queryWrapper = (SparseVectorQueryWrapper) query;
-        var termsQuery = queryWrapper.getTermsQuery();
-        assertNotNull(termsQuery);
-        var booleanQuery = (BooleanQuery) termsQuery;
-        Collection<Query> clauses = booleanQuery.getClauses(BooleanClause.Occur.SHOULD);
-        assertThat(clauses.size(), equalTo(clauseCount));
-    }
-
-    /**
-     * Runs a test of the query finalization based on various parameters
-     * that provides
-     * @param useIndexVersionBeforeIndexOptions set to true to use a previous index version before mapper index_options
-     * @param useMapperDefaultIndexOptions set to false to use an explicit, non-default mapper index_options
-     * @param setMapperIndexOptionsPruneToFalse set to true to use prune:false in the mapper index_options
-     * @param queryOverridesPruningConfig set to true to designate the query will provide a pruning_config
-     * @param queryOverridesPruneToBeFalse if true and queryOverridesPruningConfig is true, the query will provide prune:false
-     * @throws IOException
-     */
-    private void runTestTypeQueryFinalization(
-        boolean useIndexVersionBeforeIndexOptions,
-        boolean useMapperDefaultIndexOptions,
-        boolean setMapperIndexOptionsPruneToFalse,
-        boolean queryOverridesPruningConfig,
-        boolean queryOverridesPruneToBeFalse
-    ) throws IOException {
-        MapperService mapperService = getMapperServiceForTest(
-            useIndexVersionBeforeIndexOptions,
-            useMapperDefaultIndexOptions,
-            setMapperIndexOptionsPruneToFalse
-        );
+    private enum VersionRange {
+        ES_V8X_WITHOUT_INDEX_OPTIONS_SUPPORT(
+            NEW_SPARSE_VECTOR,
+            IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X)
+        ),
+        ES_V8X_WITH_INDEX_OPTIONS_SUPPORT(
+            SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X,
+            IndexVersionUtils.getPreviousVersion(UPGRADE_TO_LUCENE_10_0_0)
+        ),
+        ES_V9X_WITHOUT_INDEX_OPTIONS_SUPPORT(
+            UPGRADE_TO_LUCENE_10_0_0,
+            IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)
+        ),
+        ES_V9X_WITH_INDEX_OPTIONS_SUPPORT(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, IndexVersion.current());
 
-        // check and see if the query should explicitly override the index_options
-        Boolean shouldQueryPrune = queryOverridesPruningConfig ? (queryOverridesPruneToBeFalse == false) : null;
-
-        // get the pruning configuration for the query if it's overriding
-        TokenPruningConfig queryPruningConfig = Boolean.TRUE.equals(shouldQueryPrune) ? new TokenPruningConfig() : null;
-
-        // our logic if the results should be pruned or not
-        // we should _not_ prune if any of the following:
-        // - the query explicitly overrides the options and `prune` is set to false
-        // - the query does not override the pruning options and:
-        // - either we are using a previous index version
-        // - or the index_options explicitly sets `prune` to false
-        boolean resultShouldNotBePruned = ((queryOverridesPruningConfig && queryOverridesPruneToBeFalse)
-            || (queryOverridesPruningConfig == false && (useIndexVersionBeforeIndexOptions || setMapperIndexOptionsPruneToFalse)));
-
-        try {
-            performTypeQueryFinalizationTest(mapperService, shouldQueryPrune, queryPruningConfig, resultShouldNotBePruned == false);
-        } catch (AssertionError e) {
-            String message = "performTypeQueryFinalizationTest failed using parameters: "
-                + "useIndexVersionBeforeIndexOptions: "
-                + useIndexVersionBeforeIndexOptions
-                + ", useMapperDefaultIndexOptions: "
-                + useMapperDefaultIndexOptions
-                + ", setMapperIndexOptionsPruneToFalse: "
-                + setMapperIndexOptionsPruneToFalse
-                + ", queryOverridesPruningConfig: "
-                + queryOverridesPruningConfig
-                + ", queryOverridesPruneToBeFalse: "
-                + queryOverridesPruneToBeFalse;
-            throw new AssertionError(message, e);
-        }
-    }
+        private final IndexVersion fromVersion;
+        private final IndexVersion toVersion;
 
-    private IndexVersion getIndexVersionForTest(boolean usePreviousIndex) {
-        return usePreviousIndex
-            ? IndexVersionUtils.randomVersionBetween(
-                random(),
-                UPGRADE_TO_LUCENE_10_0_0,
-                IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)
-            )
-            : IndexVersionUtils.randomVersionBetween(random(), SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, IndexVersion.current());
-    }
-
-    private MapperService getMapperServiceForTest(
-        boolean usePreviousIndex,
-        boolean useIndexOptionsDefaults,
-        boolean explicitIndexOptionsDoNotPrune
-    ) throws IOException {
-        // get the index version of the test to use
-        // either a current version that supports index options, or a previous version that does not
-        IndexVersion indexVersion = getIndexVersionForTest(usePreviousIndex);
-
-        // if it's using the old index, we always use the minimal mapping without index_options
-        if (usePreviousIndex) {
-            return createMapperService(indexVersion, fieldMapping(this::minimalMapping));
+        VersionRange(IndexVersion fromVersion, IndexVersion toVersion) {
+            this.fromVersion = fromVersion;
+            this.toVersion = toVersion;
         }
 
-        // if we set explicitIndexOptionsDoNotPrune, the index_options (if present) will explicitly include "prune: false"
-        if (explicitIndexOptionsDoNotPrune) {
-            return createMapperService(indexVersion, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
+        IndexVersion getRandomVersion() {
+            // TODO: replace implementation with `IndexVersionUtils::randomVersionBetween` once support is added
+            // for handling unbalanced version distributions.
+            NavigableSet<IndexVersion> allReleaseVersions = IndexVersionUtils.allReleasedVersions();
+            Set<IndexVersion> candidateVersions = allReleaseVersions.subSet(fromVersion, toVersion);
+            return ESTestCase.randomFrom(candidateVersions);
         }
-
-        // either return the default (minimal) mapping or one with an explicit pruning_config
-        return useIndexOptionsDefaults
-            ? createMapperService(indexVersion, fieldMapping(this::minimalMapping))
-            : createMapperService(indexVersion, fieldMapping(this::minimalMappingWithExplicitIndexOptions));
     }
 
-    private static List<WeightedToken> QUERY_VECTORS = List.of(
-        new WeightedToken("pugs", 0.5f),
-        new WeightedToken("cats", 0.4f),
-        new WeightedToken("is", 0.1f)
-    );
+    private static final List<WeightedToken> QUERY_VECTORS = Stream.of(RARE_TOKENS, MEDIUM_TOKENS, COMMON_TOKENS)
+        .flatMap(map -> map.entrySet().stream())
+        .map(entry -> new WeightedToken(entry.getKey(), entry.getValue()))
+        .collect(Collectors.toList());
 
     /**
      * Handles float/double conversion when reading/writing with xcontent by converting all numbers to floats.

+ 15 - 3
x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/multi_cluster/50_sparse_vector.yml

@@ -622,7 +622,7 @@ teardown:
           {"index": { "_id": "11" }}
           {"content_embedding":{"is": 0.6, "pugs": 0.6 }}
           {"index": { "_id": "12" }}
-          {"content_embedding":{"is": 0.1891394, "pugs": 0.1 }}
+          {"content_embedding":{"cats": 0.1 }}
 
   - do:
       search:
@@ -633,13 +633,25 @@ teardown:
               field: content_embedding
               query_vector:
                 pugs: 0.5
-                cats: 0.5
-                is: 0.04600334
+                cats: 0.18
+                is: 0.20
 
   - match: { hits.total.value: 2 }
   - match: { hits.hits.0._id: "11" }
   - match: { hits.hits.1._id: "12" }
 
+  - do:
+      search:
+        index: test-sparse-vector-pruning-default
+        body:
+          query:
+            sparse_vector:
+              field: content_embedding
+              query_vector:
+                is: 0.21 # 0.2 is the weight threshold for the default pruning config
+
+  - match: { hits.total.value: 11 }
+
   - do:
       search:
         index: test-sparse-vector-pruning-default

+ 16 - 3
x-pack/plugin/ml/qa/multi-cluster-tests-with-security/src/test/resources/rest-api-spec/test/remote_cluster/50_sparse_vector.yml

@@ -620,7 +620,7 @@ teardown:
           {"index": { "_id": "11" }}
           {"content_embedding":{"is": 0.6, "pugs": 0.6 }}
           {"index": { "_id": "12" }}
-          {"content_embedding":{"is": 0.1891394, "pugs": 0.1 }}
+          {"content_embedding":{"cats": 0.1 }}
 
   - do:
       search:
@@ -631,13 +631,26 @@ teardown:
               field: content_embedding
               query_vector:
                 pugs: 0.5
-                cats: 0.5
-                is: 0.04600334
+                cats: 0.18
+                is: 0.20
 
   - match: { hits.total.value: 2 }
   - match: { hits.hits.0._id: "11" }
   - match: { hits.hits.1._id: "12" }
 
+  - do:
+      search:
+        index: test-sparse-vector-pruning-default
+        body:
+          query:
+            sparse_vector:
+              field: content_embedding
+              query_vector:
+                is: 0.21 # 0.2 is the weight threshold for the default pruning config
+
+  - match: { hits.total.value: 11 }
+
+
   - do:
       search:
         index: test-sparse-vector-pruning-default

+ 15 - 3
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/sparse_vector_search.yml

@@ -944,7 +944,7 @@ teardown:
           {"index": { "_id": "11" }}
           {"content_embedding":{"is": 0.6, "pugs": 0.6 }}
           {"index": { "_id": "12" }}
-          {"content_embedding":{"is": 0.1891394, "pugs": 0.1 }}
+          {"content_embedding":{"cats": 0.1 }}
 
   - do:
       search:
@@ -955,13 +955,25 @@ teardown:
               field: content_embedding
               query_vector:
                 pugs: 0.5
-                cats: 0.5
-                is: 0.04600334
+                cats: 0.18
+                is: 0.2
 
   - match: { hits.total.value: 2 }
   - match: { hits.hits.0._id: "11" }
   - match: { hits.hits.1._id: "12" }
 
+  - do:
+      search:
+        index: test-sparse-vector-pruning-default
+        body:
+          query:
+            sparse_vector:
+              field: content_embedding
+              query_vector:
+                is: 0.21 # 0.2 is the weight threshold for the default pruning config
+
+  - match: { hits.total.value: 11 }
+
   - do:
       search:
         index: test-sparse-vector-pruning-default