Browse Source

Fix bug when nested knn pre-filter might match nested docs (#105994)

When using a pre-filter with nested kNN vectors, its treated like a
top-level filter. Meaning, it is applied over parent document fields. 

However, there are times when a query filter is applied that may or may
not match internal nested or non-nested docs. We failed to handle this
case correctly and users would receive an error.

closes: https://github.com/elastic/elasticsearch/issues/105901
Benjamin Trent 1 year ago
parent
commit
263a017ca9

+ 5 - 0
docs/changelog/105994.yaml

@@ -0,0 +1,5 @@
+pr: 105994
+summary: Fix bug when nested knn pre-filter might match nested docs
+area: Vector Search
+type: bug
+issues: []

+ 84 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/100_knn_nested_search.yml

@@ -323,3 +323,87 @@ setup:
 
   - match: {hits.total.value: 3}
   - is_true : profile
+---
+"nested kNN search with filter that might match nested docs":
+  - skip:
+      version: ' - 8.13.99'
+      reason: 'bugfix for matching non-nested docs in 8.14'
+
+  - do:
+      indices.create:
+        index: nested_text
+        body:
+          mappings:
+            properties:
+              range:
+                type: long
+              other_nested_thing:
+                type: nested
+                properties:
+                  text:
+                    type: text
+              paragraphs:
+                type: nested
+                properties:
+                  other_nested_thing:
+                    type: nested
+                    properties:
+                      text:
+                        type: text
+                      vector:
+                        type: dense_vector
+                        dims: 2
+                        index: true
+                        similarity: cosine
+                  vector:
+                    type: dense_vector
+                    dims: 2
+                    index: true
+                    similarity: cosine
+  - do:
+      index:
+        index: nested_text
+        id: "1"
+        body:
+          publish_date: "1"
+          paragraphs:
+            - vector: [1, 1]
+              text: "some text"
+            - vector: [1, 2]
+              text: "some text"
+              other_nested_thing:
+                - text: "some text"
+                  vector: [1, 2]
+  - do:
+      index:
+        index: nested_text
+        id: "2"
+        body:
+          paragraphs:
+            - vector: [2, 1]
+              text: "some text"
+            - vector: [2, 2]
+              text: "some text"
+              other_nested_thing:
+                - text: "some text"
+                  vector: [ 1, 2 ]
+  - do:
+      indices.refresh: {}
+
+  - do:
+      search:
+          index: nested_text
+          body:
+            knn:
+              field: paragraphs.vector
+              query_vector: [1, 2]
+              num_candidates: 10
+              k: 10
+              filter:
+                bool:
+                  must_not:
+                    exists:
+                      field: publish_date
+
+  - match: {hits.total.value: 1}
+  - match: {hits.hits.0._id: "2"}

+ 87 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/130_knn_query_nested_search.yml

@@ -319,3 +319,90 @@ setup:
   # Rabbit only has one passage vector
   - match: {hits.hits.4.fields.name.0: "rabbit.jpg"}
   - length: { hits.hits.4.inner_hits.nested.hits.hits: 1 }
+---
+"nested kNN query search with filter that might match nested docs":
+  - skip:
+      version: ' - 8.13.99'
+      reason: 'bugfix for matching non-nested docs in 8.14'
+
+  - do:
+      indices.create:
+        index: nested_text
+        body:
+          mappings:
+            properties:
+              range:
+                type: long
+              other_nested_thing:
+                type: nested
+                properties:
+                  text:
+                    type: text
+              paragraphs:
+                type: nested
+                properties:
+                  other_nested_thing:
+                    type: nested
+                    properties:
+                      text:
+                        type: text
+                      vector:
+                        type: dense_vector
+                        dims: 2
+                        index: true
+                        similarity: cosine
+                  vector:
+                    type: dense_vector
+                    dims: 2
+                    index: true
+                    similarity: cosine
+  - do:
+      index:
+        index: nested_text
+        id: "1"
+        body:
+          publish_date: "1"
+          paragraphs:
+            - vector: [1, 1]
+              text: "some text"
+            - vector: [1, 2]
+              text: "some text"
+              other_nested_thing:
+                - text: "some text"
+                  vector: [1, 2]
+  - do:
+      index:
+        index: nested_text
+        id: "2"
+        body:
+          paragraphs:
+            - vector: [2, 1]
+              text: "some text"
+            - vector: [2, 2]
+              text: "some text"
+              other_nested_thing:
+                - text: "some text"
+                  vector: [ 1, 2 ]
+  - do:
+      indices.refresh: {}
+
+  - do:
+      search:
+        index: nested_text
+        body:
+          query:
+            nested:
+              path: paragraphs
+              query:
+                knn:
+                  field: paragraphs.vector
+                  query_vector: [1, 2]
+                  num_candidates: 10
+                  filter:
+                    bool:
+                      must_not:
+                        exists:
+                          field: publish_date
+
+  - match: {hits.total.value: 1}
+  - match: {hits.hits.0._id: "2"}

+ 17 - 6
server/src/main/java/org/elasticsearch/search/vectors/KnnVectorQueryBuilder.java

@@ -28,6 +28,7 @@ import org.elasticsearch.index.query.MatchNoneQueryBuilder;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryRewriteContext;
 import org.elasticsearch.index.query.SearchExecutionContext;
+import org.elasticsearch.index.search.NestedHelper;
 import org.elasticsearch.xcontent.ConstructingObjectParser;
 import org.elasticsearch.xcontent.ObjectParser;
 import org.elasticsearch.xcontent.ParseField;
@@ -274,7 +275,6 @@ public class KnnVectorQueryBuilder extends AbstractQueryBuilder<KnnVectorQueryBu
             );
         }
 
-        final BitSetProducer parentFilter;
         BooleanQuery.Builder builder = new BooleanQuery.Builder();
         for (QueryBuilder query : this.filterQueries) {
             builder.add(query.toQuery(context), BooleanClause.Occur.FILTER);
@@ -289,6 +289,8 @@ public class KnnVectorQueryBuilder extends AbstractQueryBuilder<KnnVectorQueryBu
         String parentPath = context.nestedLookup().getNestedParent(fieldName);
 
         if (parentPath != null) {
+            final BitSetProducer parentBitSet;
+            final Query parentFilter;
             NestedObjectMapper originalObjectMapper = context.nestedScope().getObjectMapper();
             if (originalObjectMapper != null) {
                 try {
@@ -296,19 +298,28 @@ public class KnnVectorQueryBuilder extends AbstractQueryBuilder<KnnVectorQueryBu
                     context.nestedScope().previousLevel();
                     NestedObjectMapper objectMapper = context.nestedScope().getObjectMapper();
                     parentFilter = objectMapper == null
-                        ? context.bitsetFilter(Queries.newNonNestedFilter(context.indexVersionCreated()))
-                        : context.bitsetFilter(objectMapper.nestedTypeFilter());
+                        ? Queries.newNonNestedFilter(context.indexVersionCreated())
+                        : objectMapper.nestedTypeFilter();
                 } finally {
                     context.nestedScope().nextLevel(originalObjectMapper);
                 }
             } else {
                 // we are NOT in a nested context, coming from the top level knn search
-                parentFilter = context.bitsetFilter(Queries.newNonNestedFilter(context.indexVersionCreated()));
+                parentFilter = Queries.newNonNestedFilter(context.indexVersionCreated());
             }
+            parentBitSet = context.bitsetFilter(parentFilter);
             if (filterQuery != null) {
-                filterQuery = new ToChildBlockJoinQuery(filterQuery, parentFilter);
+                NestedHelper nestedHelper = new NestedHelper(context.nestedLookup(), context::isFieldMapped);
+                // We treat the provided filter as a filter over PARENT documents, so if it might match nested documents
+                // we need to adjust it.
+                if (nestedHelper.mightMatchNestedDocs(filterQuery)) {
+                    // Ensure that the query only returns parent documents matching `filterQuery`
+                    filterQuery = Queries.filtered(filterQuery, parentFilter);
+                }
+                // Now join the filterQuery & parentFilter to provide the matching blocks of children
+                filterQuery = new ToChildBlockJoinQuery(filterQuery, parentBitSet);
             }
-            return vectorFieldType.createKnnQuery(queryVector, adjustedNumCands, filterQuery, vectorSimilarity, parentFilter);
+            return vectorFieldType.createKnnQuery(queryVector, adjustedNumCands, filterQuery, vectorSimilarity, parentBitSet);
         }
         return vectorFieldType.createKnnQuery(queryVector, adjustedNumCands, filterQuery, vectorSimilarity, null);
     }