Browse Source

ESQL: Have BUCKET generate friendlier intervals (#111879)

Currently, when specifying a range for BUCKET to generate an interval, the upper bound is not considered part of the range to cover. This changes that, so that the resulting interval more closely matches the formula: `(to - from)/buckets`

Resolves #110916.
Bogdan Pintea 1 year ago
parent
commit
dd91242e78

+ 6 - 0
docs/changelog/111879.yaml

@@ -0,0 +1,6 @@
+pr: 111879
+summary: "ESQL: Have BUCKET generate friendlier intervals"
+area: ES|QL
+type: enhancement
+issues:
+ - 110916

+ 1 - 1
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/grouping/Bucket.java

@@ -288,7 +288,7 @@ public class Bucket extends GroupingFunction implements Validatable, TwoOptional
             while (used < buckets) {
                 bucket = r.nextRoundingValue(bucket);
                 used++;
-                if (bucket > to) {
+                if (bucket >= to) {
                     return true;
                 }
             }

+ 14 - 0
x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/grouping/BucketTests.java

@@ -92,6 +92,20 @@ public class BucketTests extends AbstractScalarFunctionTestCase {
                         dateResultsMatcher(args)
                     );
                 }));
+                // same as above, but a low bucket count and datetime bounds that match it (at hour span)
+                suppliers.add(new TestCaseSupplier(name, List.of(DataType.DATETIME, DataType.INTEGER, fromType, toType), () -> {
+                    List<TestCaseSupplier.TypedData> args = new ArrayList<>();
+                    args.add(new TestCaseSupplier.TypedData(date.getAsLong(), DataType.DATETIME, "field"));
+                    args.add(new TestCaseSupplier.TypedData(4, DataType.INTEGER, "buckets").forceLiteral());
+                    args.add(dateBound("from", fromType, "2023-02-17T09:00:00Z"));
+                    args.add(dateBound("to", toType, "2023-02-17T12:00:00Z"));
+                    return new TestCaseSupplier.TestCase(
+                        args,
+                        "DateTruncEvaluator[fieldVal=Attribute[channel=0], rounding=Rounding[3600000 in Z][fixed]]",
+                        DataType.DATETIME,
+                        equalTo(Rounding.builder(Rounding.DateTimeUnit.HOUR_OF_DAY).build().prepareForUnknown().round(date.getAsLong()))
+                    );
+                }));
             }
         }
     }

+ 179 - 0
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/esql/26_aggs_bucket.yml

@@ -0,0 +1,179 @@
+---
+"friendlier BUCKET interval hourly: #110916":
+  - requires:
+      cluster_features: ["gte_v8.14.0"]
+      reason: "BUCKET extended in 8.14.0"
+      test_runner_features: allowed_warnings_regex
+  - do:
+      indices.create:
+        index: test_bucket
+        body:
+          mappings:
+            properties:
+              ts :
+                type : date
+
+  - do:
+      bulk:
+        refresh: true
+        body:
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-07-16T08:10:00Z" }
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-07-16T09:20:00Z" }
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-07-16T10:30:00Z" }
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-07-16T11:40:00Z" }
+
+  - do:
+      allowed_warnings_regex:
+        - "No limit defined, adding default limit of \\[.*\\]"
+      esql.query:
+        body:
+          query: 'FROM test_bucket | STATS c = COUNT(*) BY b = BUCKET(ts, 4, "2024-07-16T08:00:00Z", "2024-07-16T12:00:00Z") | SORT b'
+  - match: { columns.0.name: c }
+  - match: { columns.0.type: long }
+  - match: { columns.1.name: b }
+  - match: { columns.1.type: date }
+  - length: { values: 4 }
+  - match: { values.0.0: 1 }
+  - match: { values.0.1: "2024-07-16T08:00:00.000Z" }
+  - match: { values.1.0: 1 }
+  - match: { values.1.1: "2024-07-16T09:00:00.000Z" }
+  - match: { values.2.0: 1 }
+  - match: { values.2.1: "2024-07-16T10:00:00.000Z" }
+  - match: { values.3.0: 1 }
+  - match: { values.3.1: "2024-07-16T11:00:00.000Z" }
+
+  - do:
+      allowed_warnings_regex:
+        - "No limit defined, adding default limit of \\[.*\\]"
+      esql.query:
+        body:
+          query: 'FROM test_bucket | STATS c = COUNT(*) BY b = BUCKET(ts, 4, "2024-07-16T08:00:00Z", "2024-07-16T12:00:00.001Z") | SORT b'
+  - match: { columns.0.name: c }
+  - match: { columns.0.type: long }
+  - match: { columns.1.name: b }
+  - match: { columns.1.type: date }
+  - length: { values: 2 }
+  - match: { values.0.0: 1 }
+  - match: { values.0.1: "2024-07-16T06:00:00.000Z" }
+  - match: { values.1.0: 3 }
+  - match: { values.1.1: "2024-07-16T09:00:00.000Z" }
+
+  - do:
+      allowed_warnings_regex:
+        - "No limit defined, adding default limit of \\[.*\\]"
+      esql.query:
+        body:
+          query: 'FROM test_bucket | STATS c = COUNT(*) BY b = BUCKET(ts, 4, "2024-07-16T08:09:00Z", "2024-07-16T12:00:00Z") | SORT b'
+  - match: { columns.0.name: c }
+  - match: { columns.0.type: long }
+  - match: { columns.1.name: b }
+  - match: { columns.1.type: date }
+  - length: { values: 4 }
+  - match: { values.0.0: 1 }
+  - match: { values.0.1: "2024-07-16T08:00:00.000Z" }
+  - match: { values.1.0: 1 }
+  - match: { values.1.1: "2024-07-16T09:00:00.000Z" }
+  - match: { values.2.0: 1 }
+  - match: { values.2.1: "2024-07-16T10:00:00.000Z" }
+  - match: { values.3.0: 1 }
+  - match: { values.3.1: "2024-07-16T11:00:00.000Z" }
+
+---
+"friendlier BUCKET interval: monthly #110916":
+  - requires:
+      cluster_features: ["gte_v8.14.0"]
+      reason: "BUCKET extended in 8.14.0"
+      test_runner_features: allowed_warnings_regex
+  - do:
+      indices.create:
+        index: test_bucket
+        body:
+          mappings:
+            properties:
+              ts :
+                type : date
+
+  - do:
+      bulk:
+        refresh: true
+        body:
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-06-16" }
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-07-16" }
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-08-16" }
+          - { "index": { "_index": "test_bucket" } }
+          - { "ts": "2024-09-16" }
+
+  - do:
+      allowed_warnings_regex:
+        - "No limit defined, adding default limit of \\[.*\\]"
+      esql.query:
+        body:
+          query: 'FROM test_bucket | STATS c = COUNT(*) BY b = BUCKET(ts, 11, "2024-01-01", "2025-01-01") | SORT b'
+  - match: { columns.0.name: c }
+  - match: { columns.0.type: long }
+  - match: { columns.1.name: b }
+  - match: { columns.1.type: date }
+  - length: { values: 1 }
+  - match: { values.0.0: 4 }
+  - match: { values.0.1: "2024-01-01T00:00:00.000Z" }
+
+  - do:
+      allowed_warnings_regex:
+        - "No limit defined, adding default limit of \\[.*\\]"
+      esql.query:
+        body:
+          query: 'FROM test_bucket | STATS c = COUNT(*) BY b = BUCKET(ts, 12, "2024-01-01", "2025-01-01") | SORT b'
+  - match: { columns.0.name: c }
+  - match: { columns.0.type: long }
+  - match: { columns.1.name: b }
+  - match: { columns.1.type: date }
+  - length: { values: 4 }
+  - match: { values.0.0: 1 }
+  - match: { values.0.1: "2024-06-01T00:00:00.000Z" }
+  - match: { values.1.0: 1 }
+  - match: { values.1.1: "2024-07-01T00:00:00.000Z" }
+  - match: { values.2.0: 1 }
+  - match: { values.2.1: "2024-08-01T00:00:00.000Z" }
+  - match: { values.3.0: 1 }
+  - match: { values.3.1: "2024-09-01T00:00:00.000Z" }
+
+  - do:
+      allowed_warnings_regex:
+        - "No limit defined, adding default limit of \\[.*\\]"
+      esql.query:
+        body:
+          query: 'FROM test_bucket | STATS c = COUNT(*) BY b = BUCKET(ts, 12, "2024-01-01", "2025-01-01T00:00:00.001") | SORT b'
+  - match: { columns.0.name: c }
+  - match: { columns.0.type: long }
+  - match: { columns.1.name: b }
+  - match: { columns.1.type: date }
+  - length: { values: 1 }
+  - match: { values.0.0: 4 }
+  - match: { values.0.1: "2024-01-01T00:00:00.000Z" }
+
+  - do:
+      allowed_warnings_regex:
+        - "No limit defined, adding default limit of \\[.*\\]"
+      esql.query:
+        body:
+          query: 'FROM test_bucket | STATS c = COUNT(*) BY b = BUCKET(ts, 13, "2024-01-01T12:13:14Z", "2025-01-01") | SORT b'
+  - match: { columns.0.name: c }
+  - match: { columns.0.type: long }
+  - match: { columns.1.name: b }
+  - match: { columns.1.type: date }
+  - length: { values: 4 }
+  - match: { values.0.0: 1 }
+  - match: { values.0.1: "2024-06-01T00:00:00.000Z" }
+  - match: { values.1.0: 1 }
+  - match: { values.1.1: "2024-07-01T00:00:00.000Z" }
+  - match: { values.2.0: 1 }
+  - match: { values.2.1: "2024-08-01T00:00:00.000Z" }
+  - match: { values.3.0: 1 }
+  - match: { values.3.1: "2024-09-01T00:00:00.000Z" }