1
0
Эх сурвалжийг харах

Fix bug for kNN with filtered aliases (#89621)

This change adds the filter query for a filtered alias to the kNN query during the DFS phase on the
shard. This ensures that the expected k results are returned, instead of results being removed
afterwards by a post filter.

Fixes: #89561
Jack Conradson 3 жил өмнө
parent
commit
8c30b86fe2

+ 5 - 0
docs/changelog/89621.yaml

@@ -0,0 +1,5 @@
+pr: 89621
+summary: Fix bug for kNN with filtered aliases
+area: Vector Search
+type: bug
+issues: [89561]

+ 0 - 4
docs/reference/search/knn-search.asciidoc

@@ -87,10 +87,6 @@ will return the top `k` documents that also match the filter query.
 to search. Supports wildcards (`*`). To search all data streams and indices,
 use `*` or `_all`.
 
-WARNING: kNN search does not yet work with <<filter-alias,filtered aliases>>.
-Running a kNN search against a filtered alias may incorrectly result in fewer
-than _k_ hits.
-
 [role="child_attributes"]
 [[knn-search-api-query-params]]
 ==== {api-query-parms-title}

+ 278 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/60_knn_search_filter_alias.yml

@@ -0,0 +1,278 @@
+setup:
+  - skip:
+      version: ' - 8.4.99'
+      reason: 'filtered alias for kNN search added in 8.5'
+
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+          mappings:
+            dynamic: false
+            properties:
+              test_vector:
+                type: dense_vector
+                dims: 4
+                index : true
+                similarity : l2_norm
+              name:
+                type: keyword
+                store: true
+          aliases:
+            test-alias:
+              filter:
+                term:
+                  name: v1
+
+  - do:
+      index:
+        index: test
+        id: "1"
+        body:
+          name: v1
+          test_vector: [230.0, 300.33, -34.8988, 15.555]
+
+  - do:
+      index:
+        index: test
+        id: "2"
+        body:
+          name: v1
+          test_vector: [0.5, 0.5, 0.5, -1]
+
+  - do:
+      index:
+        index: test
+        id: "3"
+        body:
+          name: v2
+          test_vector: [0.5, 0.5, 0.5, 0.5]
+
+  - do:
+      index:
+        index: test
+        id: "4"
+        body:
+          name: v2
+          test_vector: [0.5, 0.5, 0.5, 0]
+
+  - do:
+      index:
+        index: test
+        id: "5"
+        body:
+          name: v2
+          test_vector: [0.5, 0.5, 0.5, 0.4]
+
+  - do:
+     indices.refresh: {}
+
+---
+"kNN filter alias":
+
+  # test knn search w/ no filter
+  - do:
+      search:
+        index: test
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, -1.0 ]
+            k: 2
+            num_candidates: 100
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1.fields.name.0: v1 }
+
+  # test knn search w/ a prefilter of term v1
+  - do:
+      search:
+        index: test
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, -1.0 ]
+            k: 2
+            num_candidates: 100
+            filter:
+              term:
+                name: v1
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1.fields.name.0: v1 }
+
+  # test knn search w/ a filtered alias of term v1
+  - do:
+      search:
+        index: test-alias
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, -1.0 ]
+            k: 2
+            num_candidates: 100
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1.fields.name.0: v1 }
+
+  # test knn search w/ no filter to show
+  # the nearest vectors are 1 and 3
+  - do:
+      search:
+        index: test
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, 0.5 ]
+            k: 2
+            num_candidates: 100
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "3" }
+  - match: { hits.hits.1.fields.name.0: v2 }
+
+  # test knn search w/ a prefilter of term v1
+  # the nearest vectors w/ the prefilter are 1 and 2
+  # instead of 1 and 3
+  - do:
+      search:
+        index: test
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, 0.5 ]
+            k: 2
+            num_candidates: 100
+            filter:
+              term:
+                name: v1
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1.fields.name.0: v1 }
+
+  # test knn search w/ a filtered alias
+  # the nearest vectors w/ the filtered alias are 1 and 2
+  # instead of 1 and 3
+  - do:
+      search:
+        index: test-alias
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, 0.5 ]
+            k: 2
+            num_candidates: 100
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1.fields.name.0: v1 }
+
+  # test knn search w/ no filter to show
+  # the nearest vectors are 1, 3, and 5
+  - do:
+      search:
+        index: test
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, 0.5 ]
+            k: 3
+            num_candidates: 100
+
+  - match: { hits.total.value: 3 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "3" }
+  - match: { hits.hits.1.fields.name.0: v2 }
+
+  - match: { hits.hits.2._id: "5" }
+  - match: { hits.hits.2.fields.name.0: v2 }
+
+  # test knn search w/ a prefilter of term v1
+  # the nearest vectors w/ the prefilter are 1 and 2
+  # instead of 1, 3, and 5
+  # note there are only 2 vectors found w/ the prefilter
+  - do:
+      search:
+        index: test
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, 0.5 ]
+            k: 3
+            num_candidates: 100
+            filter:
+              term:
+                name: v1
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1.fields.name.0: v1 }
+
+  # test knn search w/ a filtered alias
+  # the nearest vectors w/ the filtered alias are 1 and 2
+  # instead of 1, 3, and 5
+  # note there are only 2 vectors found w/ the filtered alias
+  - do:
+      search:
+        index: test-alias
+        body:
+          fields: [ name ]
+          knn:
+            field: test_vector
+            query_vector: [ 230.0, 300.33, -34.8988, 0.5 ]
+            k: 3
+            num_candidates: 100
+
+  - match: { hits.total.value: 2 }
+
+  - match: { hits.hits.0._id: "1" }
+  - match: { hits.hits.0.fields.name.0: v1 }
+
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.1.fields.name.0: v1 }

+ 6 - 3
server/src/main/java/org/elasticsearch/search/dfs/DfsPhase.java

@@ -16,12 +16,12 @@ import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
 import org.elasticsearch.index.query.ParsedQuery;
-import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.search.builder.SearchSourceBuilder;
 import org.elasticsearch.search.internal.SearchContext;
 import org.elasticsearch.search.rescore.RescoreContext;
 import org.elasticsearch.search.vectors.KnnSearchBuilder;
+import org.elasticsearch.search.vectors.KnnVectorQueryBuilder;
 import org.elasticsearch.tasks.TaskCancelledException;
 
 import java.io.IOException;
@@ -91,8 +91,11 @@ public class DfsPhase {
                 SearchExecutionContext searchExecutionContext = context.getSearchExecutionContext();
                 KnnSearchBuilder knnSearch = source.knnSearch();
 
-                QueryBuilder queryBuilder = knnSearch.toQueryBuilder();
-                ParsedQuery query = searchExecutionContext.toQuery(queryBuilder);
+                KnnVectorQueryBuilder knnVectorQueryBuilder = knnSearch.toQueryBuilder();
+                if (context.request().getAliasFilter().getQueryBuilder() != null) {
+                    knnVectorQueryBuilder.addFilterQuery(context.request().getAliasFilter().getQueryBuilder());
+                }
+                ParsedQuery query = searchExecutionContext.toQuery(knnVectorQueryBuilder);
 
                 TopDocs topDocs = searcher.search(query.query(), knnSearch.k());
                 DfsKnnResults knnResults = new DfsKnnResults(topDocs.scoreDocs);

+ 40 - 0
server/src/test/java/org/elasticsearch/action/search/KnnSearchSingleNodeTests.java

@@ -150,6 +150,46 @@ public class KnnSearchSingleNodeTests extends ESSingleNodeTestCase {
         assertEquals(5, response.getHits().getHits().length);
     }
 
+    public void testKnnFilteredAlias() throws IOException { // kNN search via a filtered alias must honor the alias filter
+        int numShards = 1 + randomInt(3); // randomized shard count
+        Settings indexSettings = Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, numShards).build();
+
+        XContentBuilder builder = XContentFactory.jsonBuilder() // mapping: indexed dense_vector "vector" + keyword "field"
+            .startObject()
+            .startObject("properties")
+            .startObject("vector")
+            .field("type", "dense_vector")
+            .field("dims", VECTOR_DIMENSION)
+            .field("index", true)
+            .field("similarity", "l2_norm")
+            .endObject()
+            .startObject("field")
+            .field("type", "keyword")
+            .endObject()
+            .endObject()
+            .endObject();
+        createIndex("index", indexSettings, builder);
+        client().admin().indices().prepareAliases().addAlias("index", "test-alias", QueryBuilders.termQuery("field", "hit")).get();
+
+        int expectedHits = 0; // number of docs indexed with field = "hit", i.e. docs the alias filter matches
+        for (int doc = 0; doc < 10; doc++) { // index 10 docs with ids "0".."9"
+            if (randomBoolean()) { // randomly decide whether this doc matches the alias filter
+                client().prepareIndex("index").setId(String.valueOf(doc)).setSource("vector", randomVector(), "field", "hit").get();
+                ++expectedHits;
+            } else {
+                client().prepareIndex("index").setId(String.valueOf(doc)).setSource("vector", randomVector(), "field", "not hit").get();
+            }
+        }
+        client().admin().indices().prepareRefresh("index").get(); // refresh before searching
+
+        float[] queryVector = randomVector();
+        KnnSearchBuilder knnSearch = new KnnSearchBuilder("vector", queryVector, 10, 50); // presumably k=10, num_candidates=50 — confirm
+        SearchResponse response = client().prepareSearch("test-alias").setKnnSearch(knnSearch).setSize(10).get();
+
+        assertHitCount(response, expectedHits); // total hits must equal the number of "hit" docs
+        assertEquals(expectedHits, response.getHits().getHits().length); // and exactly that many hits are returned
+    }
+
     public void testKnnSearchAction() throws IOException {
         Settings indexSettings = Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).build();
         XContentBuilder builder = XContentFactory.jsonBuilder()