Kaynağa Gözat

Forbid empty doc values on vector functions (#43944)

Currently, when a document is missing a vector value, a vector function
returns 0 as the score for this document. We think this is incorrect
behaviour.
With this change, an error will be thrown if vector functions are
used with docs that are missing vector doc values.
Also, VectorScriptDocValues is modified to support the size() function,
which can be used to check whether a document has a value for the
vector field.
Mayya Sharipova 6 yıl önce
ebeveyn
işleme
5255eb3c77

+ 10 - 2
docs/reference/query-dsl/script-score-query.asciidoc

@@ -195,8 +195,16 @@ between a given query vector and document vectors.
 // NOTCONSOLE
 
 NOTE: If a document doesn't have a value for a vector field on which
-a vector function is executed, 0 is returned as a result
-for this document.
+a vector function is executed, an error will be thrown.
+
+You can check if a document has a value for the field `my_vector` by
+`doc['my_vector'].size() == 0`. Your overall script can look like this:
+
+[source,js]
+--------------------------------------------------
+"source": "doc['my_vector'].size() == 0 ? 0 : cosineSimilarity(params.queryVector, doc['my_vector'])"
+--------------------------------------------------
+// NOTCONSOLE
 
 NOTE: If a document's dense vector field has a number of dimensions
 different from the query's vector, an error will be thrown.

+ 19 - 1
x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/20_dense_vector_special_cases.yml

@@ -131,7 +131,7 @@ setup:
   - match: { error.root_cause.0.type: "script_exception" }
 
 ---
-"Distance functions for documents missing vector field should return 0":
+"Documents missing a vector field":
 - do:
     index:
       index: test-index
@@ -149,7 +149,9 @@ setup:
 - do:
     indices.refresh: {}
 
+# expect an error when documents miss a vector field
 - do:
+    catch: bad_request
     headers:
       Content-Type: application/json
     search:
@@ -162,6 +164,22 @@ setup:
               source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
               params:
                 query_vector: [10.0, 10.0, 10.0]
+- match: { error.root_cause.0.type: "script_exception" }
+
+# guard against missing values by checking size()
+- do:
+    headers:
+      Content-Type: application/json
+    search:
+      rest_total_hits_as_int: true
+      body:
+        query:
+          script_score:
+            query: {match_all: {} }
+            script:
+              source: "doc['my_dense_vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
+              params:
+                query_vector: [10.0, 10.0, 10.0]
 
 - match: {hits.total: 2}
 - match: {hits.hits.0._id: "1"}

+ 19 - 1
x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/40_sparse_vector_special_cases.yml

@@ -87,7 +87,7 @@ setup:
   - match: {hits.hits.2._id: "3"}
 
 ---
-"Distance functions for documents missing vector field should return 0":
+"Documents missing a vector field":
 - do:
     index:
       index: test-index
@@ -105,7 +105,9 @@ setup:
 - do:
     indices.refresh: {}
 
+# expect an error when documents miss a vector field
 - do:
+    catch: bad_request
     headers:
       Content-Type: application/json
     search:
@@ -118,6 +120,22 @@ setup:
               source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
               params:
                 query_vector: {"1": 10.0}
+- match: { error.root_cause.0.type: "script_exception" }
+
+# guard against missing values by checking size()
+- do:
+    headers:
+      Content-Type: application/json
+    search:
+      rest_total_hits_as_int: true
+      body:
+        query:
+          script_score:
+            query: {match_all: {} }
+            script:
+              source: "doc['my_sparse_vector'].size() == 0 ? 0 : cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
+              params:
+                query_vector: {"1": 10.0}
 
 - match: {hits.total: 2}
 - match: {hits.hits.0._id: "1"}

+ 0 - 4
x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtils.java

@@ -28,7 +28,6 @@ public class ScoreScriptUtils {
      */
     public static double dotProduct(List<Number> queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){
         BytesRef value = dvs.getEncodedValue();
-        if (value == null) return 0;
         float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
         if (queryVector.size() != docVector.length) {
             throw new IllegalArgumentException("Can't calculate dotProduct! The number of dimensions of the query vector [" +
@@ -63,7 +62,6 @@ public class ScoreScriptUtils {
 
         public double cosineSimilarity(VectorScriptDocValues.DenseVectorScriptDocValues dvs) {
             BytesRef value = dvs.getEncodedValue();
-            if (value == null) return 0;
             float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
             if (queryVector.size() != docVector.length) {
                 throw new IllegalArgumentException("Can't calculate cosineSimilarity! The number of dimensions of the query vector [" +
@@ -129,7 +127,6 @@ public class ScoreScriptUtils {
 
         public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
             BytesRef value = dvs.getEncodedValue();
-            if (value == null) return 0;
             int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
             float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
             return intDotProductSparse(queryValues, queryDims, docValues, docDims);
@@ -174,7 +171,6 @@ public class ScoreScriptUtils {
 
         public double cosineSimilaritySparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
             BytesRef value = dvs.getEncodedValue();
-            if (value == null) return 0;
             int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
             float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
 

+ 6 - 2
x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/VectorScriptDocValues.java

@@ -41,12 +41,16 @@ public abstract class VectorScriptDocValues extends ScriptDocValues<BytesRef> {
 
     @Override
     public BytesRef get(int index) {
-        throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
+        throw new UnsupportedOperationException("accessing a vector field's value through 'get' or 'value' is not supported");
     }
 
     @Override
     public int size() {
-        throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
+        if (value == null) {
+            return 0;
+        } else {
+            return 1;
+        }
     }
 
     // not final, as it needs to be extended by Mockito for tests