
Vector field (#33022)

1. Dense vector

PUT dindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "dense_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT dindex/_doc/1
{
  "my_text" : "text1",
  "my_vector" : [ 0.5, 10, 6 ]
}

2. Sparse vector

PUT sindex
{
  "mappings": {
    "_doc": {
      "properties": {
        "my_vector": {
          "type": "sparse_vector"
        },
        "my_text" : {
          "type" : "keyword"
        }
      }
    }
  }
}

PUT sindex/_doc/1
{
  "my_text" : "text1",
  "my_vector" : {"1": 0.5, "99": -0.5,  "5": 1}
}
Mayya Sharipova · 6 years ago · commit b5d532f9e3

+ 9 - 0
docs/reference/mapping/types.asciidoc

@@ -47,6 +47,11 @@ string::        <<text,`text`>> and <<keyword,`keyword`>>
 
 <<feature-vector>>:: Record numeric feature vectors to boost hits at query time.
 
+<<dense-vector>>:: Record dense vectors of float values.
+
+<<sparse-vector>>:: Record sparse vectors of float values.
+
+
 [float]
 === Multi-fields
 
@@ -98,3 +103,7 @@ include::types/parent-join.asciidoc[]
 include::types/feature.asciidoc[]
 
 include::types/feature-vector.asciidoc[]
+
+include::types/dense-vector.asciidoc[]
+
+include::types/sparse-vector.asciidoc[]

+ 52 - 0
docs/reference/mapping/types/dense-vector.asciidoc

@@ -0,0 +1,52 @@
+[[dense-vector]]
+=== Dense vector datatype
+
+A `dense_vector` field stores dense vectors of float values.
+A vector may contain at most 500 dimensions, and the number
+of dimensions may differ across documents. A `dense_vector`
+field is a single-valued field.
+
+These vectors can be used for document scoring.
+For example, a document's score can represent the distance between
+a given query vector and the indexed document vector.
+
+You index a dense vector as an array of floats.
+
+[source,js]
+--------------------------------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_vector": {
+          "type": "dense_vector"
+        },
+        "my_text" : {
+          "type" : "keyword"
+        }
+      }
+    }
+  }
+}
+
+PUT my_index/_doc/1
+{
+  "my_text" : "text1",
+  "my_vector" : [0.5, 10, 6]
+}
+
+PUT my_index/_doc/2
+{
+  "my_text" : "text2",
+  "my_vector" : [-0.5, 10, 10, 4]
+}
+
+--------------------------------------------------
+// CONSOLE
+
+Internally, each document's dense vector is encoded as a binary
+doc value. Its size in bytes is equal to
+`4 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS`
+is the number of the vector's dimensions.
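For intuition, here is a minimal standalone sketch of that layout: each float is stored as its 4 raw bits, so an n-dimensional vector takes exactly 4 * n bytes. The class and method names and the use of `ByteBuffer` are illustrative only and not part of this change (the mapper writes the bytes by hand):

import java.nio.ByteBuffer;

class DenseVectorSizeDemo {
    // Packs each float as 4 big-endian bytes, so an n-dimensional
    // vector occupies exactly 4 * n bytes.
    static byte[] encodeDense(float[] vector) {
        ByteBuffer buf = ByteBuffer.allocate(4 * vector.length);
        for (float v : vector) {
            buf.putInt(Float.floatToIntBits(v));
        }
        return buf.array();
    }

    public static void main(String[] args) {
        byte[] encoded = encodeDense(new float[] {0.5f, 10f, 6f});
        System.out.println(encoded.length); // 12 bytes for 3 dimensions
    }
}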

+ 55 - 0
docs/reference/mapping/types/sparse-vector.asciidoc

@@ -0,0 +1,55 @@
+[[sparse-vector]]
+=== Sparse vector datatype
+
+A `sparse_vector` field stores sparse vectors of float values.
+A vector may contain at most 500 dimensions, and the number
+of dimensions may differ across documents. A `sparse_vector`
+field is a single-valued field.
+
+These vectors can be used for document scoring.
+For example, a document's score can represent the distance between
+a given query vector and the indexed document vector.
+
+You represent a sparse vector as an object, where object fields
+are dimensions and field values are the values for those dimensions.
+Dimensions are integer values from `0` to `65535`, encoded as strings.
+Dimensions don't need to be in any particular order.
+
+[source,js]
+--------------------------------------------------
+PUT my_index
+{
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "my_vector": {
+          "type": "sparse_vector"
+        },
+        "my_text" : {
+          "type" : "keyword"
+        }
+      }
+    }
+  }
+}
+
+PUT my_index/_doc/1
+{
+  "my_text" : "text1",
+  "my_vector" : {"1": 0.5, "5": -0.5,  "100": 1}
+}
+
+PUT my_index/_doc/2
+{
+  "my_text" : "text2",
+  "my_vector" : {"103": 0.5, "4": -0.5,  "5": 1, "11" : 1.2}
+}
+
+--------------------------------------------------
+// CONSOLE
+
+Internally, each document's sparse vector is encoded as a binary
+doc value. Its size in bytes is equal to
+`6 * NUMBER_OF_DIMENSIONS`, where `NUMBER_OF_DIMENSIONS`
+is the number of the vector's dimensions.
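Likewise, a sketch of the sparse layout under the same assumptions (illustrative names, not part of this change): dimensions are packed first as 2 bytes each, then values as 4 bytes each, giving 6 bytes per dimension.

import java.nio.ByteBuffer;

class SparseVectorSizeDemo {
    // Dims (sorted ascending) as 2 bytes each, then float values
    // as 4 bytes each: 6 * n bytes for an n-dimensional vector.
    static byte[] encodeSparse(int[] dims, float[] values) {
        ByteBuffer buf = ByteBuffer.allocate(6 * dims.length);
        for (int dim : dims) {
            buf.putShort((short) dim); // dims fit in [0, 65535], so 2 bytes suffice
        }
        for (float v : values) {
            buf.putInt(Float.floatToIntBits(v));
        }
        return buf.array();
    }

    public static void main(String[] args) {
        byte[] b = encodeSparse(new int[] {1, 5, 100}, new float[] {0.5f, -0.5f, 1f});
        System.out.println(b.length); // 18 bytes for 3 dimensions
    }
}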

+ 195 - 0
modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/DenseVectorFieldMapper.java

@@ -0,0 +1,195 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.DocValuesFieldExistsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentParser.Token;
+import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.search.DocValueFormat;
+import org.joda.time.DateTimeZone;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
+
+/**
+ * A {@link FieldMapper} for indexing a dense vector of floats.
+ */
+public class DenseVectorFieldMapper extends FieldMapper implements ArrayValueMapperParser {
+
+    public static final String CONTENT_TYPE = "dense_vector";
+    public static final short MAX_DIMS_COUNT = 500; // maximum allowed number of dimensions
+    private static final byte INT_BYTES = 4;
+
+    public static class Defaults {
+        public static final MappedFieldType FIELD_TYPE = new DenseVectorFieldType();
+
+        static {
+            FIELD_TYPE.setTokenized(false);
+            FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
+            FIELD_TYPE.setHasDocValues(true);
+            FIELD_TYPE.setOmitNorms(true);
+            FIELD_TYPE.freeze();
+        }
+    }
+
+    public static class Builder extends FieldMapper.Builder<Builder, DenseVectorFieldMapper> {
+
+        public Builder(String name) {
+            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
+            builder = this;
+        }
+
+        @Override
+        public DenseVectorFieldType fieldType() {
+            return (DenseVectorFieldType) super.fieldType();
+        }
+
+        @Override
+        public DenseVectorFieldMapper build(BuilderContext context) {
+            setupFieldType(context);
+            return new DenseVectorFieldMapper(
+                    name, fieldType, defaultFieldType,
+                    context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
+        }
+    }
+
+    public static class TypeParser implements Mapper.TypeParser {
+        @Override
+        public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
+            DenseVectorFieldMapper.Builder builder = new DenseVectorFieldMapper.Builder(name);
+            return builder;
+        }
+    }
+
+    public static final class DenseVectorFieldType extends MappedFieldType {
+
+        public DenseVectorFieldType() {}
+
+        protected DenseVectorFieldType(DenseVectorFieldType ref) {
+            super(ref);
+        }
+
+        public DenseVectorFieldType clone() {
+            return new DenseVectorFieldType(this);
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
+        }
+
+        @Override
+        public Query existsQuery(QueryShardContext context) {
+            return new DocValuesFieldExistsQuery(name());
+        }
+
+        @Override
+        public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
+        }
+
+        @Override
+        public Query termQuery(Object value, QueryShardContext context) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
+        }
+    }
+
+    private DenseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
+                                   Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
+        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
+        assert fieldType.indexOptions() == IndexOptions.NONE;
+    }
+
+    @Override
+    protected DenseVectorFieldMapper clone() {
+        return (DenseVectorFieldMapper) super.clone();
+    }
+
+    @Override
+    public DenseVectorFieldType fieldType() {
+        return (DenseVectorFieldType) super.fieldType();
+    }
+
+    @Override
+    public void parse(ParseContext context) throws IOException {
+        if (context.externalValueSet()) {
+            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
+        }
+
+        // encode the array of floats as an array of integers and store it in buf;
+        // this code lives here rather than in VectorEncoderDecoder so as not to create extra arrays
+        byte[] buf = new byte[0];
+        int offset = 0;
+        int dim = 0;
+        for (Token token = context.parser().nextToken(); token != Token.END_ARRAY; token = context.parser().nextToken()) {
+            ensureExpectedToken(Token.VALUE_NUMBER, token, context.parser()::getTokenLocation);
+            float value = context.parser().floatValue(true);
+            if (buf.length < (offset + INT_BYTES)) {
+                buf = ArrayUtil.grow(buf, (offset + INT_BYTES));
+            }
+            int intValue = Float.floatToIntBits(value);
+            buf[offset] =  (byte) (intValue >> 24);
+            buf[offset+1] = (byte) (intValue >> 16);
+            buf[offset+2] = (byte) (intValue >>  8);
+            buf[offset+3] = (byte) intValue;
+            offset += INT_BYTES;
+            dim++;
+            if (dim > MAX_DIMS_COUNT) {
+                throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
+                    "] has exceeded the maximum allowed number of dimensions of [" + MAX_DIMS_COUNT + "]");
+            }
+        }
+        BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), new BytesRef(buf, 0, offset));
+        if (context.doc().getByKey(fieldType().name()) != null) {
+            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
+                "] doesn't not support indexing multiple values for the same field in the same document");
+        }
+        context.doc().addWithKey(fieldType().name(), field);
+    }
+
+    @Override
+    protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
+        throw new AssertionError("parse is implemented directly");
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+}

+ 2 - 0
modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/MapperExtrasPlugin.java

@@ -39,6 +39,8 @@ public class MapperExtrasPlugin extends Plugin implements MapperPlugin, SearchPl
         mappers.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser());
         mappers.put(FeatureFieldMapper.CONTENT_TYPE, new FeatureFieldMapper.TypeParser());
         mappers.put(FeatureVectorFieldMapper.CONTENT_TYPE, new FeatureVectorFieldMapper.TypeParser());
+        mappers.put(DenseVectorFieldMapper.CONTENT_TYPE, new DenseVectorFieldMapper.TypeParser());
+        mappers.put(SparseVectorFieldMapper.CONTENT_TYPE, new SparseVectorFieldMapper.TypeParser());
         return Collections.unmodifiableMap(mappers);
     }
 

+ 207 - 0
modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/SparseVectorFieldMapper.java

@@ -0,0 +1,207 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.DocValuesFieldExistsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentParser.Token;
+import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.search.DocValueFormat;
+import org.joda.time.DateTimeZone;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
+
+/**
+ * A {@link FieldMapper} for indexing a sparse vector of floats.
+ */
+public class SparseVectorFieldMapper extends FieldMapper {
+
+    public static final String CONTENT_TYPE = "sparse_vector";
+    public static final short MAX_DIMS_COUNT = 500; // maximum allowed number of dimensions
+    public static final int MAX_DIMS_NUMBER = 65535; // maximum allowed dimension number
+
+    public static class Defaults {
+        public static final MappedFieldType FIELD_TYPE = new SparseVectorFieldType();
+
+        static {
+            FIELD_TYPE.setTokenized(false);
+            FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
+            FIELD_TYPE.setHasDocValues(true);
+            FIELD_TYPE.setOmitNorms(true);
+            FIELD_TYPE.freeze();
+        }
+    }
+
+    public static class Builder extends FieldMapper.Builder<Builder, SparseVectorFieldMapper> {
+
+        public Builder(String name) {
+            super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
+            builder = this;
+        }
+
+        @Override
+        public SparseVectorFieldType fieldType() {
+            return (SparseVectorFieldType) super.fieldType();
+        }
+
+        @Override
+        public SparseVectorFieldMapper build(BuilderContext context) {
+            setupFieldType(context);
+            return new SparseVectorFieldMapper(
+                    name, fieldType, defaultFieldType,
+                    context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
+        }
+    }
+
+    public static class TypeParser implements Mapper.TypeParser {
+        @Override
+        public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
+            SparseVectorFieldMapper.Builder builder = new SparseVectorFieldMapper.Builder(name);
+            return builder;
+        }
+    }
+
+    public static final class SparseVectorFieldType extends MappedFieldType {
+
+        public SparseVectorFieldType() {}
+
+        protected SparseVectorFieldType(SparseVectorFieldType ref) {
+            super(ref);
+        }
+
+        public SparseVectorFieldType clone() {
+            return new SparseVectorFieldType(this);
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public DocValueFormat docValueFormat(String format, DateTimeZone timeZone) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support docvalue_fields or aggregations");
+        }
+
+        @Override
+        public Query existsQuery(QueryShardContext context) {
+            return new DocValuesFieldExistsQuery(name());
+        }
+
+        @Override
+        public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support sorting, scripting or aggregating");
+        }
+
+        @Override
+        public Query termQuery(Object value, QueryShardContext context) {
+            throw new UnsupportedOperationException(
+                "Field [" + name() + "] of type [" + typeName() + "] doesn't support queries");
+        }
+    }
+
+
+    private SparseVectorFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
+                                    Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
+        super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
+        assert fieldType.indexOptions() == IndexOptions.NONE;
+    }
+
+    @Override
+    protected SparseVectorFieldMapper clone() {
+        return (SparseVectorFieldMapper) super.clone();
+    }
+
+    @Override
+    public SparseVectorFieldType fieldType() {
+        return (SparseVectorFieldType) super.fieldType();
+    }
+
+    @Override
+    public void parse(ParseContext context) throws IOException {
+        if (context.externalValueSet()) {
+            throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] can't be used in multi-fields");
+        }
+        ensureExpectedToken(Token.START_OBJECT, context.parser().currentToken(), context.parser()::getTokenLocation);
+        int[] dims = new int[0];
+        float[] values = new float[0];
+        int dimCount = 0;
+        int dim = 0;
+        float value;
+        for (Token token = context.parser().nextToken(); token != Token.END_OBJECT; token = context.parser().nextToken()) {
+            if (token == Token.FIELD_NAME) {
+                try {
+                    dim = Integer.parseInt(context.parser().currentName());
+                    if (dim < 0 || dim > MAX_DIMS_NUMBER) {
+                        throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "]'s dimension number " +
+                            "must be a non-negative integer value not exceeding [" + MAX_DIMS_NUMBER + "], got [" + dim + "]");
+                    }
+                } catch (NumberFormatException e) {
+                    throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "]'s dimensions should be " +
+                        "integers represented as strings, but got [" + context.parser().currentName() + "]", e);
+                }
+            } else if (token == Token.VALUE_NUMBER) {
+                value = context.parser().floatValue(true);
+                if (dims.length <= dimCount) { // ensure arrays have enough capacity
+                    values = ArrayUtil.grow(values, dimCount + 1);
+                    dims = ArrayUtil.grow(dims, dimCount + 1);
+                }
+                dims[dimCount] = dim;
+                values[dimCount] = value;
+                dimCount++;
+                if (dimCount > MAX_DIMS_COUNT) {
+                    throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
+                        "] has exceeded the maximum allowed number of dimensions of [" + MAX_DIMS_COUNT + "]");
+                }
+            } else {
+                throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() +
+                    "] takes an object that maps a dimension number to a float, " + "but got unexpected token [" + token + "]");
+            }
+        }
+
+        BytesRef br = VectorEncoderDecoder.encodeSparseVector(dims, values, dimCount);
+        BinaryDocValuesField field = new BinaryDocValuesField(fieldType().name(), br);
+        context.doc().addWithKey(fieldType().name(), field);
+    }
+
+
+    @Override
+    protected void parseCreateField(ParseContext context, List<IndexableField> fields) {
+        throw new AssertionError("parse is implemented directly");
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+}

+ 141 - 0
modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/VectorEncoderDecoder.java

@@ -0,0 +1,141 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
+
+// static utility functions for encoding and decoding dense_vector and sparse_vector fields
+final class VectorEncoderDecoder {
+    static final byte INT_BYTES = 4;
+    static final byte SHORT_BYTES = 2;
+
+    private VectorEncoderDecoder() { }
+
+    /**
+     * Encodes a sparse vector given by dims, values and dimCount into a BytesRef:
+     * first all dimensions (2 bytes each), then all values (floats encoded as 4-byte integers)
+     * @param dims - dimensions of the sparse vector
+     * @param values - values of the sparse vector
+     * @param dimCount - number of dimensions
+     * @return BytesRef
+     */
+    static BytesRef encodeSparseVector(int[] dims, float[] values, int dimCount) {
+        // 1. Sort dims and values
+        sortSparseDimsValues(dims, values, dimCount);
+        byte[] buf = new byte[dimCount * (INT_BYTES + SHORT_BYTES)];
+
+        // 2. Encode dimensions
+        // as each dimension is a non-negative value that doesn't exceed 65535, 2 bytes are enough to encode it
+        int offset = 0;
+        for (int dim = 0; dim < dimCount; dim++) {
+            buf[offset] = (byte) (dims[dim] >>  8);
+            buf[offset+1] = (byte) dims[dim];
+            offset += SHORT_BYTES;
+        }
+
+        // 3. Encode values
+        for (int dim = 0; dim < dimCount; dim++) {
+            int intValue = Float.floatToIntBits(values[dim]);
+            buf[offset] =  (byte) (intValue >> 24);
+            buf[offset+1] = (byte) (intValue >> 16);
+            buf[offset+2] = (byte) (intValue >>  8);
+            buf[offset+3] = (byte) intValue;
+            offset += INT_BYTES;
+        }
+
+        return new BytesRef(buf);
+    }
+
+    /**
+     * Decodes the first part of the BytesRef into the sparse vector's dimensions
+     * @param vectorBR - vector encoded as a BytesRef
+     */
+    static int[] decodeSparseVectorDims(BytesRef vectorBR) {
+        int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
+        int[] dims = new int[dimCount];
+        int offset = vectorBR.offset;
+        for (int dim = 0; dim < dimCount; dim++) {
+            dims[dim] = ((vectorBR.bytes[offset] & 0xFF) << 8) | (vectorBR.bytes[offset+1] & 0xFF);
+            offset += SHORT_BYTES;
+        }
+        return dims;
+    }
+
+    /**
+     * Decodes the second part of the BytesRef into the sparse vector's values
+     * @param vectorBR - vector encoded as a BytesRef
+     */
+    static float[] decodeSparseVector(BytesRef vectorBR) {
+        int dimCount = vectorBR.length / (INT_BYTES + SHORT_BYTES);
+        int offset = vectorBR.offset + SHORT_BYTES * dimCount; // offset from which values are encoded
+        float[] vector = new float[dimCount];
+        for (int dim = 0; dim < dimCount; dim++) {
+            int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24)   |
+                ((vectorBR.bytes[offset+1] & 0xFF) << 16) |
+                ((vectorBR.bytes[offset+2] & 0xFF) <<  8) |
+                (vectorBR.bytes[offset+3] & 0xFF);
+            vector[dim] = Float.intBitsToFloat(intValue);
+            offset = offset + INT_BYTES;
+        }
+        return vector;
+    }
+
+
+    /**
+     * Sorts dimensions in ascending order and
+     * sorts values in the same order as their corresponding dimensions
+     */
+    static void sortSparseDimsValues(int[] dims, float[] values, int n) {
+        new InPlaceMergeSorter() {
+            @Override
+            public int compare(int i, int j) {
+                return Integer.compare(dims[i], dims[j]);
+            }
+
+            @Override
+            public void swap(int i, int j) {
+                int tempDim = dims[i];
+                dims[i] = dims[j];
+                dims[j] = tempDim;
+
+                float tempValue = values[j];
+                values[j] = values[i];
+                values[i] = tempValue;
+            }
+        }.sort(0, n);
+    }
+
+    // Decodes a BytesRef into an array of floats
+    static float[] decodeDenseVector(BytesRef vectorBR) {
+        int dimCount = vectorBR.length / INT_BYTES;
+        float[] vector = new float[dimCount];
+        int offset = vectorBR.offset;
+        for (int dim = 0; dim < dimCount; dim++) {
+            int intValue = ((vectorBR.bytes[offset] & 0xFF) << 24)   |
+                ((vectorBR.bytes[offset+1] & 0xFF) << 16) |
+                ((vectorBR.bytes[offset+2] & 0xFF) <<  8) |
+                (vectorBR.bytes[offset+3] & 0xFF);
+            vector[dim] = Float.intBitsToFloat(intValue);
+            offset = offset + INT_BYTES;
+        }
+        return vector;
+    }
+}

+ 81 - 0
modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldMapperTests.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.common.compress.CompressedXContent;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.index.IndexService;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+import org.hamcrest.Matchers;
+
+import java.util.Collection;
+
+public class DenseVectorFieldMapperTests extends ESSingleNodeTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        return pluginList(MapperExtrasPlugin.class);
+    }
+
+    public void testDefaults() throws Exception {
+        IndexService indexService = createIndex("test-index");
+        DocumentMapperParser parser = indexService.mapperService().documentMapperParser();
+        String mapping = Strings.toString(XContentFactory.jsonBuilder()
+            .startObject()
+                .startObject("_doc")
+                    .startObject("properties")
+                        .startObject("my-dense-vector").field("type", "dense_vector")
+                        .endObject()
+                    .endObject()
+                .endObject()
+            .endObject());
+
+        DocumentMapper mapper = parser.parse("_doc", new CompressedXContent(mapping));
+        assertEquals(mapping, mapper.mappingSource().toString());
+
+        float[] expectedArray = {-12.1f, 100.7f, -4};
+        ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
+            .bytes(XContentFactory.jsonBuilder()
+                .startObject()
+                    .startArray("my-dense-vector").value(expectedArray[0]).value(expectedArray[1]).value(expectedArray[2]).endArray()
+                .endObject()),
+            XContentType.JSON));
+        IndexableField[] fields = doc1.rootDoc().getFields("my-dense-vector");
+        assertEquals(1, fields.length);
+        assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class));
+
+        // assert that after decoding the indexed value is equal to expected
+        BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue();
+        float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(vectorBR);
+        assertArrayEquals(
+            "Decoded dense vector values is not equal to the indexed one.",
+            expectedArray,
+            decodedValues,
+            0.001f
+        );
+    }
+}

+ 28 - 0
modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/DenseVectorFieldTypeTests.java

@@ -0,0 +1,28 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+public class DenseVectorFieldTypeTests extends FieldTypeTestCase {
+
+    @Override
+    protected MappedFieldType createDefaultFieldType() {
+        return new DenseVectorFieldMapper.DenseVectorFieldType();
+    }
+}

+ 164 - 0
modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldMapperTests.java

@@ -0,0 +1,164 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.common.compress.CompressedXContent;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.index.IndexService;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+import org.hamcrest.Matchers;
+import org.junit.Before;
+
+import java.util.Collection;
+
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.core.IsInstanceOf.instanceOf;
+
+public class SparseVectorFieldMapperTests extends ESSingleNodeTestCase {
+    private DocumentMapper mapper;
+
+    @Before
+    public void setup() throws Exception {
+        IndexService indexService = createIndex("test-index");
+        DocumentMapperParser parser = indexService.mapperService().documentMapperParser();
+        String mapping = Strings.toString(XContentFactory.jsonBuilder()
+            .startObject()
+                .startObject("_doc")
+                    .startObject("properties")
+                        .startObject("my-sparse-vector").field("type", "sparse_vector")
+                        .endObject()
+                    .endObject()
+                .endObject()
+            .endObject());
+        mapper = parser.parse("_doc", new CompressedXContent(mapping));
+    }
+
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        return pluginList(MapperExtrasPlugin.class);
+    }
+
+    public void testDefaults() throws Exception {
+        int[] indexedDims = {65535, 50, 2};
+        float[] indexedValues = {0.5f, 1800f, -34567.11f};
+        ParsedDocument doc1 = mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
+            .bytes(XContentFactory.jsonBuilder()
+                .startObject()
+                    .startObject("my-sparse-vector")
+                        .field(Integer.toString(indexedDims[0]), indexedValues[0])
+                        .field(Integer.toString(indexedDims[1]), indexedValues[1])
+                        .field(Integer.toString(indexedDims[2]), indexedValues[2])
+                    .endObject()
+                .endObject()),
+            XContentType.JSON));
+        IndexableField[] fields = doc1.rootDoc().getFields("my-sparse-vector");
+        assertEquals(1, fields.length);
+        assertThat(fields[0], Matchers.instanceOf(BinaryDocValuesField.class));
+
+        // assert that after decoding the indexed values are equal to expected
+        int[] expectedDims = {2, 50, 65535}; // the same as indexed, but sorted
+        float[] expectedValues = {-34567.11f, 1800f, 0.5f}; // the same as indexed, but sorted by their dimensions
+
+        // assert that after decoding the indexed dims and values are equal to expected
+        BytesRef vectorBR = ((BinaryDocValuesField) fields[0]).binaryValue();
+        int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(vectorBR);
+        assertArrayEquals(
+            "Decoded sparse vector dimensions are not equal to the indexed ones.",
+            expectedDims,
+            decodedDims
+        );
+        float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(vectorBR);
+        assertArrayEquals(
+            "Decoded sparse vector values are not equal to the indexed ones.",
+            expectedValues,
+            decodedValues,
+            0.001f
+        );
+    }
+
+    public void testErrors() {
+        // 1. test for an error on negative dimension
+        MapperParsingException e = expectThrows(MapperParsingException.class, () -> {
+            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
+            .bytes(XContentFactory.jsonBuilder()
+                .startObject()
+                    .startObject("my-sparse-vector")
+                        .field(Integer.toString(-50), 100f)
+                    .endObject()
+                .endObject()),
+            XContentType.JSON));
+        });
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        assertThat(e.getCause().getMessage(), containsString(
+            "dimension number must be a non-negative integer value not exceeding [65535], got [-50]"));
+
+        // 2. test for an error on a dimension greater than MAX_DIMS_NUMBER
+        e = expectThrows(MapperParsingException.class, () -> {
+            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
+            .bytes(XContentFactory.jsonBuilder()
+                .startObject()
+                    .startObject("my-sparse-vector")
+                        .field(Integer.toString(70000), 100f)
+                    .endObject()
+                .endObject()),
+            XContentType.JSON));
+        });
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        assertThat(e.getCause().getMessage(), containsString(
+            "dimension number must be a non-negative integer value not exceeding [65535], got [70000]"));
+
+        // 3. test for an error on a wrong formatted dimension
+        e = expectThrows(MapperParsingException.class, () -> {
+            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
+            .bytes(XContentFactory.jsonBuilder()
+                .startObject()
+                    .startObject("my-sparse-vector")
+                        .field("WrongDim123", 100f)
+                    .endObject()
+                .endObject()),
+            XContentType.JSON));
+        });
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        assertThat(e.getCause().getMessage(), containsString(
+            "dimensions should be integers represented as strings, but got [WrongDim123]"));
+
+        // 4. test for an error on a wrong format for the map of dims to values
+        e = expectThrows(MapperParsingException.class, () -> {
+            mapper.parse(SourceToParse.source("test-index", "_doc", "1", BytesReference
+            .bytes(XContentFactory.jsonBuilder()
+                .startObject()
+                    .startObject("my-sparse-vector")
+                        .startArray(Integer.toString(10)).value(10f).value(100f).endArray()
+                    .endObject()
+                .endObject()),
+            XContentType.JSON));
+        });
+        assertThat(e.getCause(), instanceOf(IllegalArgumentException.class));
+        assertThat(e.getCause().getMessage(), containsString(
+            "takes an object that maps a dimension number to a float, but got unexpected token [START_ARRAY]"));
+    }
+}

+ 28 - 0
modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/SparseVectorFieldTypeTests.java

@@ -0,0 +1,28 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+public class SparseVectorFieldTypeTests extends FieldTypeTestCase {
+
+    @Override
+    protected MappedFieldType createDefaultFieldType() {
+        return new SparseVectorFieldMapper.SparseVectorFieldType();
+    }
+}

+ 115 - 0
modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/VectorEncoderDecoderTests.java

@@ -0,0 +1,115 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.test.ESTestCase;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.Arrays;
+
+public class VectorEncoderDecoderTests extends ESTestCase {
+
+    public void testDenseVectorEncodingDecoding() {
+        int dimCount = randomIntBetween(0, 300);
+        float[] expectedValues = new float[dimCount];
+        for (int i = 0; i < dimCount; i++) {
+            expectedValues[i] = randomFloat();
+        }
+
+        // test that values that went through encoding and decoding are equal to their original
+        BytesRef encodedDenseVector = mockEncodeDenseVector(expectedValues);
+        float[] decodedValues = VectorEncoderDecoder.decodeDenseVector(encodedDenseVector);
+        assertArrayEquals(
+            "Decoded dense vector values are not equal to their original.",
+            expectedValues,
+            decodedValues,
+            0.001f
+        );
+
+    }
+
+    public void testSparseVectorEncodingDecoding() {
+        int dimCount = randomIntBetween(0, 100);
+        float[] expectedValues = new float[dimCount];
+        int[] expectedDims = randomUniqueDims(dimCount);
+        for (int i = 0; i < dimCount; i++) {
+            expectedValues[i] = randomFloat();
+        }
+
+        // test that sorting in the encoding works as expected
+        int[] sortedDims = Arrays.copyOf(expectedDims, dimCount);
+        Arrays.sort(sortedDims);
+        VectorEncoderDecoder.sortSparseDimsValues(expectedDims, expectedValues, dimCount);
+        assertArrayEquals(
+            "Sparse vector dims are not properly sorted!",
+            sortedDims,
+            expectedDims
+        );
+
+        // test that values that went through encoding and decoding are equal to their original
+        BytesRef encodedSparseVector = VectorEncoderDecoder.encodeSparseVector(expectedDims, expectedValues, dimCount);
+        int[] decodedDims = VectorEncoderDecoder.decodeSparseVectorDims(encodedSparseVector);
+        float[] decodedValues = VectorEncoderDecoder.decodeSparseVector(encodedSparseVector);
+        assertArrayEquals(
+            "Decoded sparse vector dims are not equal to their original!",
+            expectedDims,
+            decodedDims
+        );
+        assertArrayEquals(
+            "Decoded sparse vector values are not equal to their original.",
+            expectedValues,
+            decodedValues,
+            0.001f
+        );
+    }
+
+    // imitates the code in DenseVectorFieldMapper::parse
+    private BytesRef mockEncodeDenseVector(float[] values) {
+        final short INT_BYTES = VectorEncoderDecoder.INT_BYTES;
+        byte[] buf = new byte[INT_BYTES * values.length];
+        int offset = 0;
+        int intValue;
+        for (float value: values) {
+            intValue = Float.floatToIntBits(value);
+            buf[offset] =  (byte) (intValue >> 24);
+            buf[offset+1] = (byte) (intValue >> 16);
+            buf[offset+2] = (byte) (intValue >>  8);
+            buf[offset+3] = (byte) intValue;
+            offset += INT_BYTES;
+        }
+        return new BytesRef(buf, 0, offset);
+    }
+
+    // generate unique random dims
+    private int[] randomUniqueDims(int dimCount) {
+        int[] values = new int[dimCount];
+        Set<Integer> usedValues = new HashSet<>();
+        int value;
+        for (int i = 0; i < dimCount; i++) {
+            value = randomValueOtherThanMany(usedValues::contains, () -> randomIntBetween(0, SparseVectorFieldMapper.MAX_DIMS_NUMBER));
+            usedValues.add(value);
+            values[i] = value;
+        }
+        return values;
+    }
+
+}

+ 29 - 0
modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_indexing.yml

@@ -0,0 +1,29 @@
+setup:
+  - skip:
+      version: " - 6.99.99"
+      reason: "dense_vector field was introduced in 7.0.0"
+
+  - do:
+      indices.create:
+        index: test-index
+        body:
+          settings:
+            number_of_replicas: 0
+          mappings:
+            _doc:
+              properties:
+                my_dense_vector:
+                  type: dense_vector
+
+
+---
+"Indexing":
+  - do:
+      index:
+        index: test-index
+        type: _doc
+        id: 1
+        body:
+          my_dense_vector: [1.5, -10, 3455, 345452.4545]
+
+  - match: { result: created }

+ 29 - 0
modules/mapper-extras/src/test/resources/rest-api-spec/test/sparse-vector/10_indexing.yml

@@ -0,0 +1,29 @@
+setup:
+  - skip:
+      version: " - 6.99.99"
+      reason: "sparse_vector field was introduced in 7.0.0"
+
+  - do:
+      indices.create:
+        index: test-index
+        body:
+          settings:
+            number_of_replicas: 0
+          mappings:
+            _doc:
+              properties:
+                my_sparse_vector:
+                  type: sparse_vector
+
+
+---
+"Indexing":
+  - do:
+      index:
+        index: test-index
+        type: _doc
+        id: 1
+        body:
+          my_sparse_vector: { "50": 1.8, "2": -0.4, "10": 1000.3, "4545": -0.00004 }
+
+  - match: { result: created }
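Since both field types implement only `existsQuery` (term queries, sorting, and aggregations throw `UnsupportedOperationException`), a natural follow-up assertion — sketched below in the same REST-test format, not part of this commit — is that an `exists` query finds the indexed document:

  - do:
      indices.refresh: {}

  - do:
      search:
        index: test-index
        body:
          query:
            exists:
              field: my_sparse_vector

  - match: { hits.total: 1 }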