Browse Source

Add synthetic source support for binary fields (#107549)

Add synthetic source support for binary fields
Oleksandr Kolomiiets 1 year ago
parent
commit
8ed92db288

+ 5 - 0
docs/changelog/107549.yaml

@@ -0,0 +1,5 @@
+pr: 107549
+summary: Add synthetic source support for binary fields
+area: Mapping
+type: feature
+issues: []

+ 1 - 0
docs/reference/mapping/fields/synthetic-source.asciidoc

@@ -41,6 +41,7 @@ There are a couple of restrictions to be aware of:
 types:
 
 ** <<aggregate-metric-double-synthetic-source, `aggregate_metric_double`>>
+** <<binary-synthetic-source,`binary`>>
 ** <<boolean-synthetic-source,`boolean`>>
 ** <<numeric-synthetic-source,`byte`>>
 ** <<date-synthetic-source,`date`>>

+ 38 - 0
docs/reference/mapping/types/binary.asciidoc

@@ -51,3 +51,41 @@ The following parameters are accepted by `binary` fields:
     Whether the field value should be stored and retrievable separately from
     the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
     (default).
+
+[[binary-synthetic-source]]
+==== Synthetic `_source`
+
+IMPORTANT: Synthetic `_source` is Generally Available only for TSDB indices
+(indices that have `index.mode` set to `time_series`). For other indices
+synthetic `_source` is in technical preview. Features in technical preview may
+be changed or removed in a future release. Elastic will work to fix
+any issues, but features in technical preview are not subject to the support SLA
+of official GA features.
+
+`binary` fields support <<synthetic-source,synthetic `_source`>> only when <<doc-values,`doc_values`>> are explicitly enabled (they are disabled by default for `binary` fields). Synthetic source always sorts `binary` values in order of their byte representation and removes duplicate values. For example:
+[source,console,id=synthetic-source-binary-example]
+----
+PUT idx
+{
+  "mappings": {
+    "_source": { "mode": "synthetic" },
+    "properties": {
+      "binary": { "type": "binary", "doc_values": true }
+    }
+  }
+}
+PUT idx/_doc/1
+{
+  "binary": ["IAA=", "EAA="]
+}
+----
+// TEST[s/$/\nGET idx\/_doc\/1?filter_path=_source\n/]
+
+Will become:
+[source,console-result]
+----
+{
+  "binary": ["EAA=", "IAA="]
+}
+----
+// TEST[s/^/{"_source":/ s/\n$/}/]

+ 30 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/350_binary_field.yml

@@ -45,3 +45,33 @@
             body:
                 _source: ["binary"]
     - match: { hits.hits.0._source.binary: "U29tZSBiaW5hcnkgYmxvYg==" }
+
+---
+"binary synthetic source":
+  - skip:
+      version: ' - 8.14.99'
+      reason: synthetic source support introduced in 8.15
+  - do:
+      indices.create:
+        index: test
+        body:
+          mappings:
+            _source:
+              mode: synthetic
+            properties:
+              binary:
+                type: binary
+                doc_values: true
+
+  - do:
+      index:
+        index: test
+        refresh: true
+        id: "1"
+        body:
+          binary: U29tZSBiaW5hcnkgYmxvYg==
+
+  - do:
+      search:
+        index: test
+  - match: { hits.hits.0._source.binary: "U29tZSBiaW5hcnkgYmxvYg==" }

+ 61 - 0
server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoader.java

@@ -0,0 +1,61 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.index.mapper;
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.xcontent.XContentBuilder;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.stream.Stream;
+
+public abstract class BinaryDocValuesSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader { // synthetic _source loader that reads one field from BinaryDocValues; subclasses render the raw bytes
+    private final String name; // field name handed to LeafReader#getBinaryDocValues
+    private BinaryDocValues values; // doc values of the most recently opened leaf; null if that leaf has none for this field
+    private boolean hasValue; // result of the last advanceExact call
+
+    protected BinaryDocValuesSyntheticFieldLoader(String name) {
+        this.name = name;
+    }
+
+    protected abstract void writeValue(XContentBuilder b, BytesRef value) throws IOException; // invoked by write() with the current document's binary doc value
+
+    @Override
+    public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
+        return Stream.of(); // this loader contributes no stored-field loaders
+    }
+
+    @Override
+    public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
+        values = leafReader.getBinaryDocValues(name);
+        if (values == null) {
+            return null; // NOTE(review): hasValue is left untouched on this path — presumably a fresh loader is used per leaf; confirm
+        }
+        return docId -> {
+            hasValue = values.advanceExact(docId); // position on docId and remember whether it has a value
+            return hasValue;
+        };
+    }
+
+    @Override
+    public boolean hasValue() {
+        return hasValue;
+    }
+
+    @Override
+    public void write(XContentBuilder b) throws IOException {
+        if (false == hasValue) {
+            return; // nothing to emit for the current document
+        }
+
+        writeValue(b, values.binaryValue()); // delegate rendering of the raw value to the subclass
+    }
+}

+ 39 - 0
server/src/main/java/org/elasticsearch/index/mapper/BinaryFieldMapper.java

@@ -14,6 +14,7 @@ import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.bytes.BytesReference;
+import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.util.CollectionUtils;
 import org.elasticsearch.index.fielddata.FieldDataContext;
@@ -22,6 +23,7 @@ import org.elasticsearch.index.fielddata.plain.BytesBinaryIndexFieldData;
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
+import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentParser;
 
 import java.io.IOException;
@@ -191,6 +193,43 @@ public class BinaryFieldMapper extends FieldMapper {
         return CONTENT_TYPE;
     }
 
+    @Override
+    public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() { // builds the synthetic _source loader for this binary field; throws IllegalArgumentException without doc values
+        if (hasDocValues == false) {
+            throw new IllegalArgumentException(
+                "field [" + name() + "] of type [" + typeName() + "] doesn't support synthetic source because it doesn't have doc values"
+            );
+        }
+
+        return new BinaryDocValuesSyntheticFieldLoader(name()) {
+            @Override
+            protected void writeValue(XContentBuilder b, BytesRef value) throws IOException {
+                var in = new ByteArrayStreamInput();
+                in.reset(value.bytes, value.offset, value.length); // read only the slice described by the BytesRef
+
+                int count = in.readVInt(); // leading VInt is used as the number of encoded values
+                switch (count) {
+                    case 0:
+                        return; // no values: write nothing, not even the field name
+                    case 1:
+                        b.field(simpleName()); // single value is emitted as a scalar field
+                        break;
+                    default:
+                        b.startArray(simpleName()); // several values are emitted as an array
+                }
+
+                for (int i = 0; i < count; i++) {
+                    byte[] bytes = in.readByteArray(); // NOTE(review): presumably mirrors the layout written by CustomBinaryDocValuesField — confirm
+                    b.value(Base64.getEncoder().encodeToString(bytes)); // each value is rendered as a Base64 string
+                }
+
+                if (count > 1) {
+                    b.endArray(); // closes the array opened in the default branch above
+                }
+            }
+        };
+    }
+
     public static final class CustomBinaryDocValuesField extends CustomDocValuesField {
 
         private final List<byte[]> bytesList;

+ 74 - 1
server/src/test/java/org/elasticsearch/index/mapper/BinaryFieldMapperTests.java

@@ -15,6 +15,7 @@ import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.compress.CompressorFactory;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
+import org.elasticsearch.core.Tuple;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.junit.AssumptionViolatedException;
 
@@ -22,8 +23,11 @@ import java.io.IOException;
 import java.io.OutputStream;
 import java.util.Arrays;
 import java.util.Base64;
+import java.util.List;
+import java.util.stream.Collectors;
 
 import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
 
 public class BinaryFieldMapperTests extends MapperTestCase {
@@ -152,9 +156,78 @@ public class BinaryFieldMapperTests extends MapperTestCase {
         return false;
     }
 
+    @Override
+    protected boolean supportsCopyTo() {
+        return false; // NOTE(review): presumably opts these tests out of copy_to scenarios — confirm against MapperTestCase
+    }
+
     @Override
     protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
-        throw new AssumptionViolatedException("not supported");
+        return new SyntheticSourceSupport() {
+            @Override
+            public SyntheticSourceExample example(int maxValues) throws IOException {
+                if (randomBoolean()) {
+                    var value = generateValue();
+                    return new SyntheticSourceExample(value.v1(), value.v2(), this::mapping);
+                }
+
+                List<Tuple<String, byte[]>> values = randomList(1, maxValues, this::generateValue);
+
+                var in = values.stream().map(Tuple::v1).toList();
+
+                var outList = values.stream()
+                    .map(Tuple::v2)
+                    .map(BytesCompareUnsigned::new)
+                    .collect(Collectors.toSet())
+                    .stream()
+                    .sorted()
+                    .map(b -> encode(b.bytes))
+                    .toList();
+                Object out = outList.size() == 1 ? outList.get(0) : outList;
+
+                return new SyntheticSourceExample(in, out, this::mapping);
+            }
+
+            @Override
+            public List<SyntheticSourceInvalidExample> invalidExample() throws IOException {
+                return List.of(
+                    new SyntheticSourceInvalidExample(
+                        equalTo("field [field] of type [binary] doesn't support synthetic source because it doesn't have doc values"),
+                        b -> b.field("type", "binary")
+                    ),
+                    new SyntheticSourceInvalidExample(
+                        equalTo("field [field] of type [binary] doesn't support synthetic source because it doesn't have doc values"),
+                        b -> b.field("type", "binary").field("doc_values", false)
+                    )
+                );
+            }
+
+            private Tuple<String, byte[]> generateValue() {
+                var len = randomIntBetween(1, 256);
+                var bytes = randomByteArrayOfLength(len);
+
+                return Tuple.tuple(encode(bytes), bytes);
+            }
+
+            private String encode(byte[] bytes) {
+                return Base64.getEncoder().encodeToString(bytes);
+            }
+
+            private void mapping(XContentBuilder b) throws IOException {
+                b.field("type", "binary").field("doc_values", "true");
+
+                if (rarely()) {
+                    b.field("store", true);
+                }
+            }
+
+            private record BytesCompareUnsigned(byte[] bytes) implements Comparable<BytesCompareUnsigned> {
+                @Override
+                public int compareTo(BytesCompareUnsigned o) {
+                    return Arrays.compareUnsigned(bytes, o.bytes);
+                }
+            }
+        };
     }
 
     @Override