Prechádzať zdrojové kódy

Expand segment sorter for all timeseries indices (#78639)

PR #75195 added a segment sorter on @timestamp desc for data stream indices.
This PR applies the segment sorter to all indices that have an @timestamp
field.

The presence of an @timestamp field can serve as a strong
indication that we are dealing with a timeseries index. The most
common type of query for timeseries indices is to get the latest data,
that is, data sorted by @timestamp desc. This PR sorts segments
by @timestamp desc, which speeds up this kind of query.

Relates to #75195
Mayya Sharipova 4 rokov pred
rodič
commit
a74573be9e

+ 139 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search/380_sort_segments_on_timestamp.yml

@@ -0,0 +1,139 @@
+---
+"Test that index segments are sorted on timestamp field if @timestamp field is defined in mapping":
+  - skip:
+      version: " - 7.99.99"
+      reason: "sorting segments was added in 7.16"
+      features: allowed_warnings
+
+  - do:
+      indices.create:
+        index: test_index1
+        body:
+          mappings:
+            properties:
+              "@timestamp":
+                type: date
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+
+  # 1st segment
+  - do:
+      index:
+        index: test_index1
+        body: { "foo": "bar1", "@timestamp": "2021-08-01" }
+        refresh: true
+
+  # 2nd segment
+  - do:
+      index:
+        index: test_index1
+        body: { "foo": "bar2", "@timestamp": "2021-08-02" }
+        refresh: true
+
+  # test that segments are sorted by @timestamp DESC
+  - do:
+      search:
+        index: test_index1
+        body:
+          fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] }
+  - match: { hits.hits.1.fields.@timestamp: ["2021-08-01"] }
+
+---
+"Test that index segments are NOT sorted on timestamp field when @timestamp field is dynamically added":
+  - skip:
+      version: " - 7.99.99"
+      reason: "sorting segments was added in 7.16"
+      features: allowed_warnings
+
+  - do:
+      indices.create:
+        index: test_index2
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+
+  # 1st segment
+  - do:
+      index:
+        index: test_index2
+        body: { "foo": "bar1", "@timestamp": "2021-08-01" }
+        refresh: true
+
+  # 2nd segment
+  - do:
+      index:
+        index: test_index2
+        body: { "foo": "bar2", "@timestamp": "2021-08-02" }
+        refresh: true
+
+  # test that segments are NOT sorted by @timestamp DESC as the field was not in the mapping when the index was created
+  - do:
+      search:
+        index: test_index2
+        body:
+          fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
+  - match: { hits.hits.0.fields.@timestamp: ["2021-08-01"] }
+  - match: { hits.hits.1.fields.@timestamp: ["2021-08-02"] }
+
+  # test that after we reopen the index, segments are sorted by @timestamp DESC
+  - do:
+      indices.close:
+        index: test_index2
+  - is_true: acknowledged
+  - do:
+      indices.open:
+        index: test_index2
+  - is_true: acknowledged
+  - do:
+      search:
+        index: test_index2
+        body:
+          fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
+  - match: { hits.total.value: 2 }
+  - match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] }
+  - match: { hits.hits.1.fields.@timestamp: ["2021-08-01"] }
+
+---
+"Test if segments are missing @timestamp field we don't get errors":
+  - skip:
+      version: " - 7.99.99"
+      reason: "sorting segments was added in 7.16"
+      features: allowed_warnings
+
+  - do:
+      indices.create:
+        index: test_index3
+        body:
+          mappings:
+            properties:
+              "@timestamp":
+                type: date
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+
+  # 1st segment missing @timestamp field
+  - do:
+      index:
+        index: test_index3
+        body: { "foo": "bar1"}
+        refresh: true
+
+  # 2nd segment
+  - do:
+      index:
+        index: test_index3
+        body: { "foo": "bar2", "@timestamp": "2021-08-02" }
+        refresh: true
+
+  - do:
+      search:
+        index: test_index3
+        body:
+          fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
+  - match: { hits.hits.0.fields.@timestamp: ["2021-08-02"] }
+  - is_false: hits.hits.1.fields.@timestamp

+ 10 - 6
server/src/main/java/org/elasticsearch/cluster/metadata/DataStream.java

@@ -10,6 +10,7 @@ package org.elasticsearch.cluster.metadata;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.PointValues;
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.cluster.AbstractDiffable;
 import org.elasticsearch.cluster.Diff;
 import org.elasticsearch.common.Strings;
@@ -42,8 +43,8 @@ public final class DataStream extends AbstractDiffable<DataStream> implements To
 
     public static final String BACKING_INDEX_PREFIX = ".ds-";
     public static final DateFormatter DATE_FORMATTER = DateFormatter.forPattern("uuuu.MM.dd");
-    // Datastreams' leaf readers should be sorted by desc order of their timestamp field, as it allows search time optimizations
-    public static Comparator<LeafReader> DATASTREAM_LEAF_READERS_SORTER =
+    // Timeseries indices' leaf readers should be sorted by desc order of their timestamp field, as it allows search time optimizations
+    public static Comparator<LeafReader> TIMESERIES_LEAF_READERS_SORTER =
         Comparator.comparingLong(
             (LeafReader r) -> {
                 try {
@@ -51,14 +52,17 @@ public final class DataStream extends AbstractDiffable<DataStream> implements To
                     if (points != null) {
                         byte[] sortValue = points.getMaxPackedValue();
                         return LongPoint.decodeDimension(sortValue, 0);
-                    } else if (r.numDocs() == 0) {
-                        // points can be null if the segment contains only deleted documents
+                    } else {
+                        // As we apply this segment sorter to any timeseries indices,
+                        // we don't have a guarantee that all docs contain @timestamp field.
+                        // Some segments may have all docs without @timestamp field, in this
+                        // case they will be sorted last.
                         return Long.MIN_VALUE;
                     }
                 } catch (IOException e) {
+                    throw new ElasticsearchException("Can't access [" +
+                    DataStream.TimestampField.FIXED_TIMESTAMP_FIELD + "] field for the index!", e);
                 }
-                throw new IllegalStateException("Can't access [" +
-                    DataStream.TimestampField.FIXED_TIMESTAMP_FIELD + "] field for the data stream!");
             })
         .reversed();
 

+ 14 - 0
server/src/main/java/org/elasticsearch/index/mapper/MappingLookup.java

@@ -9,6 +9,7 @@
 package org.elasticsearch.index.mapper;
 
 import org.apache.lucene.codecs.PostingsFormat;
+import org.elasticsearch.cluster.metadata.DataStream;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -389,6 +390,19 @@ public final class MappingLookup {
         return dtfm != null && dtfm.isEnabled();
     }
 
+    /**
+     * Returns whether this mapping contains a timestamp field that is of type date, is searchable (indexed) and has doc values.
+     * @return {@code true} if the timestamp field exists, is of type date, is searchable and has doc values, {@code false} otherwise.
+     */
+    public boolean hasTimestampField() {
+        final MappedFieldType mappedFieldType = fieldTypesLookup().get(DataStream.TimestampField.FIXED_TIMESTAMP_FIELD);
+        if (mappedFieldType instanceof DateFieldMapper.DateFieldType) {
+            return mappedFieldType.isSearchable() && mappedFieldType.hasDocValues();
+        } else {
+            return false;
+        }
+    }
+
     /**
      * Key for the lookup to be used in caches.
      */

+ 11 - 2
server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

@@ -186,7 +186,7 @@ import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
 
-import static org.elasticsearch.cluster.metadata.DataStream.DATASTREAM_LEAF_READERS_SORTER;
+import static org.elasticsearch.cluster.metadata.DataStream.TIMESERIES_LEAF_READERS_SORTER;
 import static org.elasticsearch.index.seqno.RetentionLeaseActions.RETAIN_ALL;
 import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_SEQ_NO;
 
@@ -408,6 +408,14 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
         return indexSortSupplier.get();
     }
 
+    /**
+     * Returns whether this shard is part of a data stream.
+     * @return {@code true} if this shard is part of a data stream, {@code false} otherwise
+     */
+    public boolean isDataStreamIndex() {
+        return isDataStreamIndex;
+    }
+
     public ShardGetService getService() {
         return this.getService;
     }
@@ -2905,6 +2913,7 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
                 this.warmer.warm(reader);
             }
         };
+        final boolean isTimeseriesIndex = mapperService == null ? false : mapperService.mappingLookup().hasTimestampField();
         return new EngineConfig(
                 shardId,
                 threadPool,
@@ -2928,7 +2937,7 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
                 replicationTracker::getRetentionLeases,
                 this::getOperationPrimaryTerm,
                 snapshotCommitSupplier,
-                isDataStreamIndex ? DATASTREAM_LEAF_READERS_SORTER : null);
+                isTimeseriesIndex ? TIMESERIES_LEAF_READERS_SORTER : null);
     }
 
     /**

+ 0 - 111
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/data_stream/131_sort_segments_migrate_to_data_stream.yml

@@ -1,111 +0,0 @@
----
-"Test that datastream index segments are sorted on timestamp field desc after data stream migration":
-  - skip:
-      version: " - 7.15.99"
-      reason: "sorting segments was added in 7.16"
-      features: allowed_warnings
-
-  - do:
-      allowed_warnings:
-        - "index template [my-template] has index patterns [my_ds] matching patterns from existing older templates [global] with patterns (global => [*]); this template [my-template] will take precedence during new index creation"
-      indices.put_index_template:
-        name: my-template
-        body:
-          index_patterns: [ my_ds ]
-          data_stream: { }
-          template:
-            settings:
-              number_of_shards: 1
-              number_of_replicas: 0
-
-  - do:
-      indices.create:
-        index: test_index1
-        body:
-          settings:
-            number_of_shards: 1
-            number_of_replicas: 0
-          aliases:
-            my_ds:
-              is_write_index: true
-
-  # 1st segment
-  - do:
-      index:
-        index: my_ds
-        body: { "foo": "bar1", "@timestamp": "2021-08-01" }
-        refresh: true
-
-  # 2nd segment
-  - do:
-      index:
-        index: my_ds
-        body: { "foo": "bar2", "@timestamp": "2021-08-02" }
-        refresh: true
-
-  # test that segments are sorted as indexed by @timestamp ASC
-  - do:
-      search:
-        index: my_ds
-        body:
-          fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
-  - match: { hits.total.value: 2 }
-  - match: { hits.hits.0.fields.@timestamp: ["2021-08-01"] }
-  - match: { hits.hits.1.fields.@timestamp: ["2021-08-02"] }
-
-  # migrate to data-stream
-  - do:
-      indices.migrate_to_data_stream:
-        name: my_ds
-  - is_true: acknowledged
-
-  # test that segments are still sorted as indexed by @timestamp ASC
-  # as we don't reopen existing shards and index readers after migration
-  - do:
-      search:
-        index: my_ds
-        body:
-          fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
-  - match: { hits.total.value: 2 }
-  - match: { hits.hits.0.fields.@timestamp: ["2021-08-01"] }
-  - match: { hits.hits.1.fields.@timestamp: ["2021-08-02"] }
-
-  # rollover data stream to create new backing index
-  - do:
-      indices.rollover:
-        alias: "my_ds"
-  - match: { rolled_over: true }
-  # save the new backing index names for later use
-  - set: { new_index: idx0name }
-
-  # 1st segment in the new backing index
-  - do:
-      index:
-        index: my_ds
-        body: { "foo": "bar3", "@timestamp": "2021-08-03" }
-        refresh: true
-
-  # 2nd segment in the new backing index
-  - do:
-      index:
-        index: my_ds
-        body: { "foo": "bar4", "@timestamp": "2021-08-04" }
-        refresh: true
-
-
-  # test that segments are sorted by @timestamp DESC in the new backing index,
-  # as the newly created index and shard pick up the index leaf sorter
-  - do:
-      search:
-        index: $idx0name
-        body:
-          fields: [{ "field":"@timestamp", "format":"yyyy-MM-dd" }]
-  - match: { hits.total.value: 2 }
-  - match: { hits.hits.0.fields.@timestamp: ["2021-08-04"] }
-  - match: { hits.hits.1.fields.@timestamp: ["2021-08-03"] }
-
-
-  - do:
-      indices.delete_data_stream:
-        name: my_ds
-  - is_true: acknowledged

+ 2 - 0
x-pack/qa/runtime-fields/build.gradle

@@ -96,6 +96,8 @@ subprojects {
           // The error messages are different
           'search/330_fetch_fields/error includes field name',
           'search/330_fetch_fields/error includes glob pattern',
+          // we need a @timestamp field to be defined in index mapping
+          'search/380_sort_segments_on_timestamp/*',
           /////// NOT SUPPORTED ///////
         ].join(',')
     }