Browse Source

TSDB: unicode dimensions (#83681)

This fixes dimensions with unicode characters in their name. It also
make the sort order of these dimensions something we can explain:
convert all dimensions to utf-8 and sort by that.
Nik Everett 3 years ago
parent
commit
3d2c33307c

+ 5 - 4
server/src/main/java/org/elasticsearch/index/mapper/LuceneDocument.java

@@ -36,7 +36,7 @@ public class LuceneDocument implements Iterable<IndexableField> {
      * for generating the _tsid field. The map will be used by {@link TimeSeriesIdFieldMapper}
      * for generating the _tsid field. The map will be used by {@link TimeSeriesIdFieldMapper}
      * to build the _tsid field for the document.
      * to build the _tsid field for the document.
      */
      */
-    private SortedMap<String, BytesReference> dimensionBytes;
+    private SortedMap<BytesRef, BytesReference> dimensionBytes;
 
 
     LuceneDocument(String path, LuceneDocument parent) {
     LuceneDocument(String path, LuceneDocument parent) {
         fields = new ArrayList<>();
         fields = new ArrayList<>();
@@ -114,16 +114,17 @@ public class LuceneDocument implements Iterable<IndexableField> {
      * to build the _tsid field for the document.
      * to build the _tsid field for the document.
      */
      */
     public void addDimensionBytes(String fieldName, BytesReference tsidBytes) {
     public void addDimensionBytes(String fieldName, BytesReference tsidBytes) {
+        BytesRef fieldNameBytes = new BytesRef(fieldName);
         if (dimensionBytes == null) {
         if (dimensionBytes == null) {
             // It is a {@link TreeMap} so that it is order by field name.
             // It is a {@link TreeMap} so that it is order by field name.
             dimensionBytes = new TreeMap<>();
             dimensionBytes = new TreeMap<>();
-        } else if (dimensionBytes.containsKey(fieldName)) {
+        } else if (dimensionBytes.containsKey(fieldNameBytes)) {
             throw new IllegalArgumentException("Dimension field [" + fieldName + "] cannot be a multi-valued field.");
             throw new IllegalArgumentException("Dimension field [" + fieldName + "] cannot be a multi-valued field.");
         }
         }
-        dimensionBytes.put(fieldName, tsidBytes);
+        dimensionBytes.put(fieldNameBytes, tsidBytes);
     }
     }
 
 
-    public SortedMap<String, BytesReference> getDimensionBytes() {
+    public SortedMap<BytesRef, BytesReference> getDimensionBytes() {
         if (dimensionBytes == null) {
         if (dimensionBytes == null) {
             return Collections.emptySortedMap();
             return Collections.emptySortedMap();
         }
         }

+ 16 - 12
server/src/main/java/org/elasticsearch/index/mapper/TimeSeriesIdFieldMapper.java

@@ -33,6 +33,7 @@ import java.time.ZoneId;
 import java.util.Collections;
 import java.util.Collections;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Map;
 import java.util.SortedMap;
 import java.util.SortedMap;
 import java.util.function.Supplier;
 import java.util.function.Supplier;
@@ -141,12 +142,12 @@ public class TimeSeriesIdFieldMapper extends MetadataFieldMapper {
         assert fieldType().isIndexed() == false;
         assert fieldType().isIndexed() == false;
 
 
         // SortedMap is expected to be sorted by key (field name)
         // SortedMap is expected to be sorted by key (field name)
-        SortedMap<String, BytesReference> dimensionFields = context.doc().getDimensionBytes();
+        SortedMap<BytesRef, BytesReference> dimensionFields = context.doc().getDimensionBytes();
         BytesReference timeSeriesId = buildTsidField(dimensionFields);
         BytesReference timeSeriesId = buildTsidField(dimensionFields);
         context.doc().add(new SortedDocValuesField(fieldType().name(), timeSeriesId.toBytesRef()));
         context.doc().add(new SortedDocValuesField(fieldType().name(), timeSeriesId.toBytesRef()));
     }
     }
 
 
-    public static BytesReference buildTsidField(SortedMap<String, BytesReference> dimensionFields) throws IOException {
+    public static BytesReference buildTsidField(SortedMap<BytesRef, BytesReference> dimensionFields) throws IOException {
         if (dimensionFields == null || dimensionFields.isEmpty()) {
         if (dimensionFields == null || dimensionFields.isEmpty()) {
             throw new IllegalArgumentException("Dimension fields are missing.");
             throw new IllegalArgumentException("Dimension fields are missing.");
         }
         }
@@ -166,19 +167,22 @@ public class TimeSeriesIdFieldMapper extends MetadataFieldMapper {
         return CONTENT_TYPE;
         return CONTENT_TYPE;
     }
     }
 
 
-    public static void encodeTsid(StreamOutput out, SortedMap<String, BytesReference> dimensionFields) throws IOException {
+    public static void encodeTsid(StreamOutput out, SortedMap<BytesRef, BytesReference> dimensionFields) throws IOException {
         out.writeVInt(dimensionFields.size());
         out.writeVInt(dimensionFields.size());
-        for (Map.Entry<String, BytesReference> entry : dimensionFields.entrySet()) {
-            String fieldName = entry.getKey();
-            BytesRef fieldNameBytes = new BytesRef(fieldName);
-            int len = fieldNameBytes.length;
-            if (len > DIMENSION_NAME_LIMIT) {
+        for (Map.Entry<BytesRef, BytesReference> entry : dimensionFields.entrySet()) {
+            BytesRef fieldName = entry.getKey();
+            if (fieldName.length > DIMENSION_NAME_LIMIT) {
                 throw new IllegalArgumentException(
                 throw new IllegalArgumentException(
-                    "Dimension name must be less than [" + DIMENSION_NAME_LIMIT + "] bytes but [" + fieldName + "] was [" + len + "]."
+                    String.format(
+                        Locale.ROOT,
+                        "Dimension name must be less than [%d] bytes but [%s] was [%s].",
+                        DIMENSION_NAME_LIMIT,
+                        fieldName.utf8ToString(),
+                        fieldName.length
+                    )
                 );
                 );
             }
             }
-            // Write field name in utf-8 instead of writeString's utf-16-ish thing
-            out.writeBytesRef(fieldNameBytes);
+            out.writeBytesRef(fieldName);
             entry.getValue().writeTo(out);
             entry.getValue().writeTo(out);
         }
         }
 
 
@@ -193,7 +197,7 @@ public class TimeSeriesIdFieldMapper extends MetadataFieldMapper {
             Map<String, Object> result = new LinkedHashMap<String, Object>(size);
             Map<String, Object> result = new LinkedHashMap<String, Object>(size);
 
 
             for (int i = 0; i < size; i++) {
             for (int i = 0; i < size; i++) {
-                String name = in.readString();
+                String name = in.readBytesRef().utf8ToString();
 
 
                 int type = in.read();
                 int type = in.read();
                 switch (type) {
                 switch (type) {

+ 2 - 2
server/src/main/java/org/elasticsearch/search/DocValueFormat.java

@@ -706,9 +706,9 @@ public interface DocValueFormat extends NamedWriteable {
             }
             }
 
 
             Map<?, ?> m = (Map<?, ?>) value;
             Map<?, ?> m = (Map<?, ?>) value;
-            SortedMap<String, BytesReference> dimensionFields = new TreeMap<>();
+            SortedMap<BytesRef, BytesReference> dimensionFields = new TreeMap<>();
             for (Map.Entry<?, ?> entry : m.entrySet()) {
             for (Map.Entry<?, ?> entry : m.entrySet()) {
-                String k = (String) entry.getKey();
+                BytesRef k = new BytesRef(entry.getKey().toString());
                 Object v = entry.getValue();
                 Object v = entry.getValue();
                 BytesReference bytes;
                 BytesReference bytes;
 
 

+ 19 - 0
server/src/test/java/org/elasticsearch/index/mapper/TimeSeriesIdFieldMapperTests.java

@@ -17,6 +17,8 @@ import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentBuilder;
 
 
 import java.io.IOException;
 import java.io.IOException;
+import java.util.List;
+import java.util.Map;
 
 
 import static org.elasticsearch.test.MapMatcher.assertMap;
 import static org.elasticsearch.test.MapMatcher.assertMap;
 import static org.elasticsearch.test.MapMatcher.matchesMap;
 import static org.elasticsearch.test.MapMatcher.matchesMap;
@@ -123,6 +125,23 @@ public class TimeSeriesIdFieldMapperTests extends MetadataMapperTestCase {
         );
         );
     }
     }
 
 
+    public void testUnicodeKeys() throws IOException {
+        String fire = new String(new int[] { 0x1F525 }, 0, 1);
+        String coffee = "\u2615";
+        DocumentMapper docMapper = createDocumentMapper("a", mapping(b -> {
+            b.startObject(fire).field("type", "keyword").field("time_series_dimension", true).endObject();
+            b.startObject(coffee).field("type", "keyword").field("time_series_dimension", true).endObject();
+        }));
+
+        ParsedDocument doc = parseDocument(docMapper, b -> b.field(fire, "hot").field(coffee, "good"));
+        Map<String, Object> tsid = TimeSeriesIdFieldMapper.decodeTsid(
+            new ByteArrayStreamInput(doc.rootDoc().getBinaryValue("_tsid").bytes)
+        );
+        assertMap(tsid, matchesMap().entry(coffee, "good").entry(fire, "hot"));
+        // Also make sure the keys are in order
+        assertThat(List.copyOf(tsid.keySet()), equalTo(List.of(coffee, fire)));
+    }
+
     public void testKeywordTooLong() throws IOException {
     public void testKeywordTooLong() throws IOException {
         DocumentMapper docMapper = createDocumentMapper(
         DocumentMapper docMapper = createDocumentMapper(
             "a",
             "a",

+ 3 - 2
server/src/test/java/org/elasticsearch/search/aggregations/timeseries/TimeSeriesAggregatorTests.java

@@ -17,6 +17,7 @@ import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.core.CheckedConsumer;
 import org.elasticsearch.core.CheckedConsumer;
@@ -80,7 +81,7 @@ public class TimeSeriesAggregatorTests extends AggregatorTestCase {
     public static void writeTS(RandomIndexWriter iw, long timestamp, Object[] dimensions, Object[] metrics) throws IOException {
     public static void writeTS(RandomIndexWriter iw, long timestamp, Object[] dimensions, Object[] metrics) throws IOException {
         final List<IndexableField> fields = new ArrayList<>();
         final List<IndexableField> fields = new ArrayList<>();
         fields.add(new SortedNumericDocValuesField(DataStreamTimestampFieldMapper.DEFAULT_PATH, timestamp));
         fields.add(new SortedNumericDocValuesField(DataStreamTimestampFieldMapper.DEFAULT_PATH, timestamp));
-        final SortedMap<String, BytesReference> dimensionFields = new TreeMap<>();
+        final SortedMap<BytesRef, BytesReference> dimensionFields = new TreeMap<>();
         for (int i = 0; i < dimensions.length; i += 2) {
         for (int i = 0; i < dimensions.length; i += 2) {
             final BytesReference reference;
             final BytesReference reference;
             if (dimensions[i + 1] instanceof Number) {
             if (dimensions[i + 1] instanceof Number) {
@@ -88,7 +89,7 @@ public class TimeSeriesAggregatorTests extends AggregatorTestCase {
             } else {
             } else {
                 reference = TimeSeriesIdFieldMapper.encodeTsidValue(dimensions[i + 1].toString());
                 reference = TimeSeriesIdFieldMapper.encodeTsidValue(dimensions[i + 1].toString());
             }
             }
-            dimensionFields.put(dimensions[i].toString(), reference);
+            dimensionFields.put(new BytesRef(dimensions[i].toString()), reference);
         }
         }
         for (int i = 0; i < metrics.length; i += 2) {
         for (int i = 0; i < metrics.length; i += 2) {
             if (metrics[i + 1] instanceof Integer || metrics[i + 1] instanceof Long) {
             if (metrics[i + 1] instanceof Integer || metrics[i + 1] instanceof Long) {