瀏覽代碼

[8.x] Add docvalue_fields Support for dense_vector Fields (#114484) (#116491)

* Add `docvalue_fields` Support for `dense_vector` Fields (#114484)

Currently, dense_vector fields don't support docvalue_fields.

This adds that support for debugging purposes. Users can inspect
raw values of their vectors even if the source is disabled.

Co-authored-by: Mayya Sharipova <mayya.sharipova@elastic.co>
(cherry picked from commit c8a8d4d9311bc588b9376dbaea3523a6354926bc)

* fixing for backport

---------

Co-authored-by: Rassyan <yjkhngds@gmail.com>
Benjamin Trent 11 月之前
父節點
當前提交
308ad0c05f

+ 6 - 0
docs/changelog/114484.yaml

@@ -0,0 +1,6 @@
+pr: 114484
+summary: Add `docvalue_fields` Support for `dense_vector` Fields
+area: Search
+type: enhancement
+issues:
+  - 108470

+ 163 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml

@@ -0,0 +1,163 @@
+# Creates an index with float, byte and bit dense_vector fields, each in an indexed and a
+# non-indexed variant, then indexes three documents that populate different subsets of them.
+setup:
+  - requires:
+      capabilities:
+        - method: POST
+          path: /_search
+          capabilities: [ dense_vector_docvalue_fields ]
+      test_runner_features: [ capabilities, close_to ]
+      reason: Capability required to run test
+  - do:
+      indices.create:
+        index: test
+        body:
+          mappings:
+            properties:
+              name:
+                type: keyword
+              vector1:
+                type: dense_vector
+                element_type: float
+                dims: 5
+                index: true
+              vector2:
+                type: dense_vector
+                element_type: float
+                dims: 5
+                index: false
+              vector3:
+                type: dense_vector
+                element_type: byte
+                dims: 5
+                index: true
+              vector4:
+                type: dense_vector
+                element_type: byte
+                dims: 5
+                index: false
+              # bit vectors: dims is 40, and the documents below supply 40 / 8 = 5 byte values for them
+              vector5:
+                type: dense_vector
+                element_type: bit
+                dims: 40
+                index: true
+              vector6:
+                type: dense_vector
+                element_type: bit
+                dims: 40
+                index: false
+  # document 1 populates all six vector fields
+  - do:
+      index:
+        index: test
+        id: "1"
+        body:
+          name: cow.jpg
+          vector1: [230.0, 300.33, -34.8988, 15.555, -200.0]
+          vector2: [130.0, 115.0, -1.02, 15.555, -100.0]
+          vector3: [-1, 100, -13, 15, -128]
+          vector4: [-1, 50, -1, 1, 120]
+          vector5: [1, 111, -13, 15, -128]
+          vector6: [-1, 11, 0, 12, 111]
+  # document 2 omits vector2 and vector3 and sets vector6 to an explicit null
+  - do:
+      index:
+        index: test
+        id: "2"
+        body:
+          name: moose.jpg
+          vector1: [-0.5, 100.0, -13, 14.8, -156.0]
+          vector4: [-1, 50, -1, 1, 120]
+          vector5: [1, 111, -13, 15, -128]
+          vector6: null
+  # document 3 only populates vector2 and vector3
+  - do:
+      index:
+        index: test
+        id: "3"
+        body:
+          name: rabbit.jpg
+          vector2: [130.0, 115.0, -1.02, 15.555, -100.0]
+          vector3: [-1, 100, -13, 15, -128]
+
+  - do:
+      indices.refresh: {}
+
+---
+# Requests all six vector fields through docvalue_fields with _source disabled and checks the
+# returned values for each of the three documents, which are sorted by name.
+"Enable docvalue_fields parameter for dense_vector fields":
+  - requires:
+      capabilities:
+        - method: POST
+          path: /_search
+          capabilities: [ dense_vector_docvalue_fields ]
+      test_runner_features: capabilities
+      reason: "Support for dense vector doc value fields capability required"
+  - do:
+      search:
+        _source: false
+        index: test
+        body:
+          docvalue_fields: [name, vector1, vector2, vector3, vector4, vector5, vector6]
+          sort: name
+
+
+  # hit 0: cow.jpg, every vector field has a value
+  - match: {hits.hits.0._id: "1"}
+  - match: {hits.hits.0.fields.name.0: "cow.jpg"}
+
+  # each field holds a single value (index 0) that is itself an array;
+  # the bit vectors (vector5, vector6, dims: 40) are expected to come back as 5 byte values
+  - length: {hits.hits.0.fields.vector1.0: 5}
+  - length: {hits.hits.0.fields.vector2.0: 5}
+  - length: {hits.hits.0.fields.vector3.0: 5}
+  - length: {hits.hits.0.fields.vector4.0: 5}
+  - length: {hits.hits.0.fields.vector5.0: 5}
+  - length: {hits.hits.0.fields.vector6.0: 5}
+
+  # float components are compared with a tolerance rather than exactly
+  - close_to: { hits.hits.0.fields.vector1.0.0: { value: 230.0, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector1.0.1: { value: 300.33, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector1.0.2: { value: -34.8988, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector1.0.3: { value: 15.555, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector1.0.4: { value: -200.0, error: 0.001 } }
+
+  - close_to: { hits.hits.0.fields.vector2.0.0: { value: 130.0, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector2.0.1: { value: 115.0, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector2.0.2: { value: -1.02, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector2.0.3: { value: 15.555, error: 0.001 } }
+  - close_to: { hits.hits.0.fields.vector2.0.4: { value: -100.0, error: 0.001 } }
+
+  # byte and bit vectors are compared exactly against the indexed values
+  - match: {hits.hits.0.fields.vector3.0: [-1, 100, -13, 15, -128]}
+  - match: {hits.hits.0.fields.vector4.0: [-1, 50, -1, 1, 120]}
+  - match: {hits.hits.0.fields.vector5.0: [1, 111, -13, 15, -128]}
+  - match: {hits.hits.0.fields.vector6.0: [-1, 11, 0, 12, 111]}
+
+
+  # hit 1: moose.jpg, indexed without vector2 and vector3 and with vector6 set to null
+  - match: {hits.hits.1._id: "2"}
+  - match: {hits.hits.1.fields.name.0: "moose.jpg"}
+
+  - length: {hits.hits.1.fields.vector1.0: 5}
+  - length: {hits.hits.1.fields.vector4.0: 5}
+  - length: {hits.hits.1.fields.vector5.0: 5}
+  # fields the document has no value for are expected to have no entry in the response
+  - match: {hits.hits.1.fields.vector2: null}
+  - match: {hits.hits.1.fields.vector3: null}
+  - match: {hits.hits.1.fields.vector6: null}
+
+  - close_to: { hits.hits.1.fields.vector1.0.0: { value: -0.5, error: 0.001 } }
+  - close_to: { hits.hits.1.fields.vector1.0.1: { value: 100.0, error: 0.001 } }
+  - close_to: { hits.hits.1.fields.vector1.0.2: { value: -13, error: 0.001 } }
+  - close_to: { hits.hits.1.fields.vector1.0.3: { value: 14.8, error: 0.001 } }
+  - close_to: { hits.hits.1.fields.vector1.0.4: { value: -156.0, error: 0.001 } }
+
+  - match: {hits.hits.1.fields.vector4.0: [-1, 50, -1, 1, 120]}
+  - match: {hits.hits.1.fields.vector5.0: [1, 111, -13, 15, -128]}
+
+
+  # hit 2: rabbit.jpg, indexed with vector2 and vector3 only
+  - match: {hits.hits.2._id: "3"}
+  - match: {hits.hits.2.fields.name.0: "rabbit.jpg"}
+
+  - length: {hits.hits.2.fields.vector2.0: 5}
+  - length: {hits.hits.2.fields.vector3.0: 5}
+  - match: {hits.hits.2.fields.vector1: null}
+  - match: {hits.hits.2.fields.vector4: null}
+  - match: {hits.hits.2.fields.vector5: null}
+  - match: {hits.hits.2.fields.vector6: null}
+
+  - close_to: { hits.hits.2.fields.vector2.0.0: { value: 130.0, error: 0.001 } }
+  - close_to: { hits.hits.2.fields.vector2.0.1: { value: 115.0, error: 0.001 } }
+  - close_to: { hits.hits.2.fields.vector2.0.2: { value: -1.02, error: 0.001 } }
+  - close_to: { hits.hits.2.fields.vector2.0.3: { value: 15.555, error: 0.001 } }
+  - close_to: { hits.hits.2.fields.vector2.0.4: { value: -100.0, error: 0.001 } }
+
+  - match: {hits.hits.2.fields.vector3.0: [-1, 100, -13, 15, -128]}

+ 1 - 3
server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java

@@ -1903,9 +1903,7 @@ public class DenseVectorFieldMapper extends FieldMapper {
 
         @Override
         public DocValueFormat docValueFormat(String format, ZoneId timeZone) {
-            throw new IllegalArgumentException(
-                "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations"
-            );
+            return DocValueFormat.DENSE_VECTOR;
         }
 
         @Override

+ 120 - 0
server/src/main/java/org/elasticsearch/index/mapper/vectors/VectorDVLeafFieldData.java

@@ -10,9 +10,14 @@
 package org.elasticsearch.index.mapper.vectors;
 
 import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.index.IndexVersion;
+import org.elasticsearch.index.fielddata.FormattedDocValues;
 import org.elasticsearch.index.fielddata.LeafFieldData;
 import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.ElementType;
@@ -23,8 +28,12 @@ import org.elasticsearch.script.field.vectors.BitKnnDenseVectorDocValuesField;
 import org.elasticsearch.script.field.vectors.ByteBinaryDenseVectorDocValuesField;
 import org.elasticsearch.script.field.vectors.ByteKnnDenseVectorDocValuesField;
 import org.elasticsearch.script.field.vectors.KnnDenseVectorDocValuesField;
+import org.elasticsearch.search.DocValueFormat;
 
 import java.io.IOException;
+import java.util.Arrays;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
 final class VectorDVLeafFieldData implements LeafFieldData {
 
@@ -76,4 +85,115 @@ final class VectorDVLeafFieldData implements LeafFieldData {
         }
     }
 
+    /**
+     * Returns per-document vector values for this field.
+     * <p>
+     * When the field is indexed the values are read from the reader's byte or float vector values;
+     * otherwise they are read from binary doc values. A document exposes exactly one value:
+     * a {@code Byte[]} for the {@code BYTE} and {@code BIT} element types and a {@code float[]} for {@code FLOAT}.
+     *
+     * @param format the requested format; it is not consulted by this implementation
+     * @return doc values whose {@code nextValue()} returns a freshly allocated array on every call
+     * @throws IllegalStateException if the underlying values cannot be loaded from the reader
+     */
+    @Override
+    public FormattedDocValues getFormattedValues(DocValueFormat format) {
+        // For BIT vectors the stored array holds dims / Byte.SIZE bytes rather than one element per dimension.
+        int dims = elementType == ElementType.BIT ? this.dims / Byte.SIZE : this.dims;
+        return switch (elementType) {
+            case BYTE, BIT -> new FormattedDocValues() {
+                private byte[] vector = new byte[dims];
+                private ByteVectorValues byteVectorValues; // use when indexed
+                private BinaryDocValues binary; // use when not indexed
+                {
+                    try {
+                        if (indexed) {
+                            byteVectorValues = reader.getByteVectorValues(field);
+                        } else {
+                            binary = DocValues.getBinary(reader, field);
+                        }
+                    } catch (IOException e) {
+                        throw new IllegalStateException("Cannot load doc values", e);
+                    }
+                }
+
+                @Override
+                public boolean advanceExact(int docId) throws IOException {
+                    if (indexed) {
+                        if (iteratorAdvanceExact(byteVectorValues, docId) == false) {
+                            return false;
+                        }
+                        // keeps a reference to the array handed out by the vector values; nextValue() copies it
+                        vector = byteVectorValues.vectorValue();
+                    } else {
+                        if (binary == null || binary.advanceExact(docId) == false) {
+                            return false;
+                        }
+                        BytesRef ref = binary.binaryValue();
+                        // only the first `dims` bytes of the binary value are taken as vector components
+                        System.arraycopy(ref.bytes, ref.offset, vector, 0, dims);
+                    }
+                    return true;
+                }
+
+                @Override
+                public int docValueCount() {
+                    return 1;
+                }
+
+                @Override
+                public Object nextValue() {
+                    // box into a new array so callers never see the reused buffer
+                    Byte[] vectorValue = new Byte[dims];
+                    for (int i = 0; i < dims; i++) {
+                        vectorValue[i] = vector[i];
+                    }
+                    return vectorValue;
+                }
+            };
+            case FLOAT -> new FormattedDocValues() {
+                private float[] vector = new float[dims];
+                private FloatVectorValues floatVectorValues; // use when indexed
+                private BinaryDocValues binary; // use when not indexed
+                {
+                    try {
+                        if (indexed) {
+                            floatVectorValues = reader.getFloatVectorValues(field);
+                        } else {
+                            binary = DocValues.getBinary(reader, field);
+                        }
+                    } catch (IOException e) {
+                        throw new IllegalStateException("Cannot load doc values", e);
+                    }
+                }
+
+                @Override
+                public boolean advanceExact(int docId) throws IOException {
+                    if (indexed) {
+                        if (iteratorAdvanceExact(floatVectorValues, docId) == false) {
+                            return false;
+                        }
+                        // keeps a reference to the array handed out by the vector values; nextValue() copies it
+                        vector = floatVectorValues.vectorValue();
+                    } else {
+                        if (binary == null || binary.advanceExact(docId) == false) {
+                            return false;
+                        }
+                        BytesRef ref = binary.binaryValue();
+                        VectorEncoderDecoder.decodeDenseVector(indexVersion, ref, vector);
+                    }
+                    return true;
+                }
+
+                @Override
+                public int docValueCount() {
+                    return 1;
+                }
+
+                @Override
+                public Object nextValue() {
+                    return Arrays.copyOf(vector, vector.length);
+                }
+            };
+        };
+    }
+
+    private static boolean iteratorAdvanceExact(DocIdSetIterator iterator, int docId) throws IOException {
+        if (iterator == null) return false;
+        int currentDoc = iterator.docID();
+        if (currentDoc == NO_MORE_DOCS || docId < currentDoc) {
+            return false;
+        } else if (docId > currentDoc) {
+            currentDoc = iterator.advance(docId);
+            if (currentDoc != docId) {
+                return false;
+            }
+        }
+        return true;
+    }
 }

+ 4 - 1
server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java

@@ -28,6 +28,8 @@ public final class SearchCapabilities {
     private static final String BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY = "bit_dense_vector_synthetic_source";
     /** Support Byte and Float with Bit dot product. */
     private static final String BYTE_FLOAT_BIT_DOT_PRODUCT_CAPABILITY = "byte_float_bit_dot_product";
+    /** Support docvalue_fields parameter for `dense_vector` field. */
+    private static final String DENSE_VECTOR_DOCVALUE_FIELDS = "dense_vector_docvalue_fields";
     /** Support kql query. */
     private static final String KQL_QUERY_SUPPORTED = "kql_query";
 
@@ -37,7 +39,8 @@ public final class SearchCapabilities {
         Set<String> capabilities = Set.of(
             RANGE_REGEX_INTERVAL_QUERY_CAPABILITY,
             BIT_DENSE_VECTOR_SYNTHETIC_SOURCE_CAPABILITY,
-            BYTE_FLOAT_BIT_DOT_PRODUCT_CAPABILITY
+            BYTE_FLOAT_BIT_DOT_PRODUCT_CAPABILITY,
+            DENSE_VECTOR_DOCVALUE_FIELDS
         );
 
         if (Build.current().isSnapshot()) {

+ 25 - 0
server/src/main/java/org/elasticsearch/search/DocValueFormat.java

@@ -168,6 +168,31 @@ public interface DocValueFormat extends NamedWriteable {
         }
     };
 
+    DocValueFormat DENSE_VECTOR = DenseVectorDocValueFormat.INSTANCE;
+
+    /**
+     * Stateless {@link DocValueFormat} for dense vector values, exposed as a single shared instance.
+     * It overrides no formatting methods and writes nothing to the stream.
+     */
+    class DenseVectorDocValueFormat implements DocValueFormat {
+
+        /** Used both as the writeable name and as the string representation. */
+        private static final String NAME = "dense_vector";
+
+        public static final DocValueFormat INSTANCE = new DenseVectorDocValueFormat();
+
+        private DenseVectorDocValueFormat() {}
+
+        @Override
+        public String getWriteableName() {
+            return NAME;
+        }
+
+        @Override
+        public void writeTo(StreamOutput out) {
+            // no state to serialize
+        }
+
+        @Override
+        public String toString() {
+            return NAME;
+        }
+    };
+
     DocValueFormat BINARY = BinaryDocValueFormat.INSTANCE;
 
     /**

+ 1 - 0
server/src/main/java/org/elasticsearch/search/SearchModule.java

@@ -1020,6 +1020,7 @@ public class SearchModule {
         registerValueFormat(DocValueFormat.IP.getWriteableName(), in -> DocValueFormat.IP);
         registerValueFormat(DocValueFormat.RAW.getWriteableName(), in -> DocValueFormat.RAW);
         registerValueFormat(DocValueFormat.BINARY.getWriteableName(), in -> DocValueFormat.BINARY);
+        registerValueFormat(DocValueFormat.DENSE_VECTOR.getWriteableName(), in -> DocValueFormat.DENSE_VECTOR);
         registerValueFormat(DocValueFormat.UNSIGNED_LONG_SHIFTED.getWriteableName(), in -> DocValueFormat.UNSIGNED_LONG_SHIFTED);
         registerValueFormat(DocValueFormat.TIME_SERIES_ID.getWriteableName(), in -> DocValueFormat.TIME_SERIES_ID);
         registerValueFormat(TS_ROUTING_HASH_DOC_VALUE_FORMAT.getWriteableName(), in -> TS_ROUTING_HASH_DOC_VALUE_FORMAT);

+ 3 - 2
server/src/test/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldTypeTests.java

@@ -21,6 +21,7 @@ import org.elasticsearch.index.mapper.FieldTypeTestCase;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.DenseVectorFieldType;
 import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.VectorSimilarity;
+import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.vectors.DenseVectorQuery;
 import org.elasticsearch.search.vectors.VectorData;
 
@@ -134,9 +135,9 @@ public class DenseVectorFieldTypeTests extends FieldTypeTestCase {
 
     public void testDocValueFormat() {
         DenseVectorFieldType fft = createFloatFieldType();
-        expectThrows(IllegalArgumentException.class, () -> fft.docValueFormat(null, null));
+        assertEquals(DocValueFormat.DENSE_VECTOR, fft.docValueFormat(null, null));
         DenseVectorFieldType bft = createByteFieldType();
-        expectThrows(IllegalArgumentException.class, () -> bft.docValueFormat(null, null));
+        assertEquals(DocValueFormat.DENSE_VECTOR, bft.docValueFormat(null, null));
     }
 
     public void testFetchSourceValue() throws IOException {