Browse Source

SQL: Fix bug regarding histograms usage in scripting (#36866)

Allow scripts to correctly reference grouping functions
Fix bug in translation of date/time functions mixed with histograms.
Enhance Verifier to prevent histograms being nested inside other
 functions inside GROUP BY (as it implies double grouping)
Extend Histogram docs
Costin Leau 6 years ago
parent
commit
ac032a0b9d
15 changed files with 234 additions and 47 deletions
  1. 24 0
      docs/reference/sql/functions/grouping.asciidoc
  2. 46 4
      x-pack/plugin/sql/qa/src/main/resources/agg.csv-spec
  3. 45 0
      x-pack/plugin/sql/qa/src/main/resources/docs.csv-spec
  4. 38 9
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/analysis/analyzer/Verifier.java
  5. 3 0
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/function/scalar/whitelist/InternalSqlScriptUtils.java
  6. 24 0
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/Grouping.java
  7. 8 15
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/Params.java
  8. 6 0
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/ParamsBuilder.java
  9. 0 4
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/ScriptTemplate.java
  10. 14 0
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/ScriptWeaver.java
  11. 1 1
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/planner/QueryFolder.java
  12. 2 2
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/planner/QueryTranslator.java
  13. 0 9
      x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/querydsl/agg/AggFilter.java
  14. 20 1
      x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/analysis/analyzer/VerifierErrorMessagesTests.java
  15. 3 2
      x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/tree/NodeSubclassTests.java

+ 24 - 0
docs/reference/sql/functions/grouping.asciidoc

@@ -36,6 +36,8 @@ The histogram function takes all matching values and divides them into buckets w
 bucket_key = Math.floor(value / interval) * interval
 ----
 
+NOTE:: The histogram in SQL does *NOT* return empty buckets for missing intervals as the traditional <<search-aggregations-bucket-histogram-aggregation, histogram>> and  <<search-aggregations-bucket-datehistogram-aggregation, date histogram>>. Such behavior does not fit conceptually in SQL which treats all missing values as `NULL`; as such the histogram places all missing values in the `NULL` group.
+
 `Histogram` can be applied on either numeric fields:
 
 
@@ -51,4 +53,26 @@ or date/time fields:
 include-tagged::{sql-specs}/docs.csv-spec[histogramDate]
 ----
 
+Expressions inside the histogram are also supported as long as the
+return type is numeric:
+
+["source","sql",subs="attributes,callouts,macros"]
+----
+include-tagged::{sql-specs}/docs.csv-spec[histogramNumericExpression]
+----
+
+Do note that histograms (and grouping functions in general) allow custom expressions but cannot have any functions applied to them in the `GROUP BY`. In other words, the following statement is *NOT* allowed:
 
+["source","sql",subs="attributes,callouts,macros"]
+----
+include-tagged::{sql-specs}/docs.csv-spec[expressionOnHistogramNotAllowed]
+----
+
+as it requires two groupings (one for histogram followed by a second for applying the function on top of the histogram groups).
+
+Instead one can rewrite the query to move the expression on the histogram _inside_ of it:
+
+["source","sql",subs="attributes,callouts,macros"]
+----
+include-tagged::{sql-specs}/docs.csv-spec[histogramDateExpression]
+----

+ 46 - 4
x-pack/plugin/sql/qa/src/main/resources/agg.csv-spec

@@ -262,9 +262,51 @@ SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp
 null                |10   
 ;
 
-histogramDateWithDateFunction-Ignore
-SELECT YEAR(HISTOGRAM(birth_date, INTERVAL 1 YEAR)) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
+histogramDateWithMonthOnTop
+schema::h:i|c:l
+SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
+
+       h       |       c       
+---------------+---------------
+12             |7              
+10             |17             
+8              |16             
+6              |16             
+4              |18             
+2              |10             
+0              |6              
+null           |10    
+;
+
+histogramDateWithYearOnTop
+schema::h:i|c:l
+SELECT HISTOGRAM(YEAR(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
+       h       |       c       
+---------------+---------------
+1964           |5              
+1962           |13             
+1960           |16             
+1958           |16             
+1956           |9              
+1954           |12             
+1952           |19             
+null           |10   
+;
 
-          
-     
+histogramNumericWithExpression
+schema::h:i|c:l
+SELECT HISTOGRAM(emp_no % 100, 10) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
+
+       h       |       c       
+---------------+---------------
+90             |10             
+80             |10             
+70             |10             
+60             |10             
+50             |10             
+40             |10             
+30             |10             
+20             |10             
+10             |10             
+0              |10   
 ;

+ 45 - 0
x-pack/plugin/sql/qa/src/main/resources/docs.csv-spec

@@ -725,6 +725,27 @@ SELECT HISTOGRAM(salary, 5000) AS h FROM emp GROUP BY h;
 // end::histogramNumeric  
 ;
 
+histogramNumericExpression
+schema::h:i|c:l
+// tag::histogramNumericExpression
+SELECT HISTOGRAM(salary % 100, 10) AS h, COUNT(*) AS c FROM emp GROUP BY h;
+
+       h       |       c       
+---------------+---------------
+0              |10             
+10             |15             
+20             |10             
+30             |14             
+40             |9              
+50             |9              
+60             |8              
+70             |13             
+80             |3              
+90             |9    
+
+// end::histogramNumericExpression  
+;
+
 histogramDate
 schema::h:ts|c:l
 // tag::histogramDate
@@ -752,6 +773,30 @@ null                |10
 // end::histogramDate
 ;
 
+expressionOnHistogramNotAllowed-Ignore
+// tag::expressionOnHistogramNotAllowed
+SELECT MONTH(HISTOGRAM(birth_date), 2)) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
+// end::expressionOnHistogramNotAllowed
+
+histogramDateExpression
+schema::h:i|c:l
+// tag::histogramDateExpression
+SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
+
+       h       |       c       
+---------------+---------------
+12             |7              
+10             |17             
+8              |16             
+6              |16             
+4              |18             
+2              |10             
+0              |6              
+null           |10 
+
+// end::histogramDateExpression   
+;
+
 ///////////////////////////////
 //
 // Date/Time

+ 38 - 9
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/analysis/analyzer/Verifier.java

@@ -18,6 +18,8 @@ import org.elasticsearch.xpack.sql.expression.function.Function;
 import org.elasticsearch.xpack.sql.expression.function.FunctionAttribute;
 import org.elasticsearch.xpack.sql.expression.function.Functions;
 import org.elasticsearch.xpack.sql.expression.function.Score;
+import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
+import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
 import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction;
 import org.elasticsearch.xpack.sql.expression.predicate.conditional.ConditionalFunction;
 import org.elasticsearch.xpack.sql.expression.predicate.operator.comparison.In;
@@ -224,6 +226,7 @@ public final class Verifier {
                 validateConditional(p, localFailures);
 
                 checkFilterOnAggs(p, localFailures);
+                checkFilterOnGrouping(p, localFailures);
 
                 if (!groupingFailures.contains(p)) {
                     checkGroupBy(p, localFailures, resolvedFunctions, groupingFailures);
@@ -419,7 +422,7 @@ public final class Verifier {
             return true;
         }
         // skip aggs (allowed to refer to non-group columns)
-        if (Functions.isAggregate(e)) {
+        if (Functions.isAggregate(e) || Functions.isGrouping(e)) {
             return true;
         }
 
@@ -448,6 +451,21 @@ public final class Verifier {
                 }
             }));
 
+            a.groupings().forEach(e -> {
+                if (Functions.isGrouping(e) == false) {
+                    e.collectFirstChildren(c -> {
+                        if (Functions.isGrouping(c)) {
+                            localFailures.add(fail(c,
+                                    "Cannot combine [%s] grouping function inside GROUP BY, found [%s];"
+                                            + " consider moving the expression inside the histogram",
+                                    Expressions.name(c), Expressions.name(e)));
+                            return true;
+                        }
+                        return false;
+                    });
+                }
+            });
+
             if (!localFailures.isEmpty()) {
                 return false;
             }
@@ -547,19 +565,30 @@ public final class Verifier {
         if (p instanceof Filter) {
             Filter filter = (Filter) p;
             if ((filter.child() instanceof Aggregate) == false) {
-                filter.condition().forEachDown(f -> {
-                    if (Functions.isAggregate(f) || Functions.isGrouping(f)) {
-                        String type = Functions.isAggregate(f) ? "aggregate" : "grouping";
-                        localFailures.add(fail(f,
-                                "Cannot use WHERE filtering on %s function [%s], use HAVING instead", type, Expressions.name(f)));
+                filter.condition().forEachDown(e -> {
+                    if (Functions.isAggregate(e) || e instanceof AggregateFunctionAttribute) {
+                        localFailures.add(
+                                fail(e, "Cannot use WHERE filtering on aggregate function [%s], use HAVING instead", Expressions.name(e)));
                     }
-
-                }, Function.class);
+                }, Expression.class);
             }
         }
     }
 
 
+    private static void checkFilterOnGrouping(LogicalPlan p, Set<Failure> localFailures) {
+        if (p instanceof Filter) {
+            Filter filter = (Filter) p;
+            filter.condition().forEachDown(e -> {
+                if (Functions.isGrouping(e) || e instanceof GroupingFunctionAttribute) {
+                    localFailures
+                            .add(fail(e, "Cannot filter on grouping function [%s], use its argument instead", Expressions.name(e)));
+                }
+            }, Expression.class);
+        }
+    }
+
+
     private static void checkForScoreInsideFunctions(LogicalPlan p, Set<Failure> localFailures) {
         // Make sure that SCORE is only used in "top level" functions
         p.forEachExpressions(e ->
@@ -647,4 +676,4 @@ public final class Verifier {
                 (left.isNumeric() && right.isNumeric());
         }
     }
-}
+}

+ 3 - 0
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/function/scalar/whitelist/InternalSqlScriptUtils.java

@@ -346,6 +346,9 @@ public final class InternalSqlScriptUtils {
     }
 
     public static ZonedDateTime asDateTime(Object dateTime) {
+        if (dateTime == null) {
+            return null;
+        }
         if (dateTime instanceof JodaCompatibleZonedDateTime) {
             return ((JodaCompatibleZonedDateTime) dateTime).getZonedDateTime();
         }

+ 24 - 0
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/Grouping.java

@@ -0,0 +1,24 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.sql.expression.gen.script;
+
+import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
+
+class Grouping extends Param<GroupingFunctionAttribute> {
+
+    Grouping(GroupingFunctionAttribute groupRef) {
+        super(groupRef);
+    }
+
+    String groupName() {
+        return value().functionId();
+    }
+
+    @Override
+    public String prefix() {
+        return "g";
+    }
+}

+ 8 - 15
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/Params.java

@@ -85,25 +85,15 @@ public class Params {
                 String s = a.aggProperty() != null ? a.aggProperty() : a.aggName();
                 map.put(p.prefix() + aggs++, s);
             }
-        }
-
-        return map;
-    }
-
-    // return the agg refs
-    List<String> asAggRefs() {
-        List<String> refs = new ArrayList<>();
-
-        for (Param<?> p : params) {
-            if (p instanceof Agg) {
-                refs.add(((Agg) p).aggName());
+            if (p instanceof Grouping) {
+                Grouping g = (Grouping) p;
+                map.put(p.prefix() + aggs++, g.groupName());
             }
         }
 
-        return refs;
+        return map;
     }
 
-
     private static List<Param<?>> flatten(List<Param<?>> params) {
         List<Param<?>> flatten = emptyList();
 
@@ -116,6 +106,9 @@ public class Params {
                 else if (p instanceof Agg) {
                     flatten.add(p);
                 }
+                else if (p instanceof Grouping) {
+                    flatten.add(p);
+                }
                 else if (p instanceof Var) {
                     flatten.add(p);
                 }
@@ -131,4 +124,4 @@ public class Params {
     public String toString() {
         return params.toString();
     }
-}
+}

+ 6 - 0
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/ParamsBuilder.java

@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.sql.expression.gen.script;
 
 import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
+import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -28,6 +29,11 @@ public class ParamsBuilder {
         return this;
     }
 
+    public ParamsBuilder grouping(GroupingFunctionAttribute grouping) {
+        params.add(new Grouping(grouping));
+        return this;
+    }
+
     public ParamsBuilder script(Params ps) {
         params.add(new Script(ps));
         return this;

+ 0 - 4
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/ScriptTemplate.java

@@ -44,10 +44,6 @@ public class ScriptTemplate {
         return params;
     }
 
-    public List<String> aggRefs() {
-        return params.asAggRefs();
-    }
-
     public Map<String, String> aggPaths() {
         return params.asAggPaths();
     }

+ 14 - 0
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/expression/gen/script/ScriptWeaver.java

@@ -12,6 +12,7 @@ import org.elasticsearch.xpack.sql.expression.Expression;
 import org.elasticsearch.xpack.sql.expression.Expressions;
 import org.elasticsearch.xpack.sql.expression.FieldAttribute;
 import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
+import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
 import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunctionAttribute;
 import org.elasticsearch.xpack.sql.expression.literal.IntervalDayTime;
 import org.elasticsearch.xpack.sql.expression.literal.IntervalYearMonth;
@@ -37,6 +38,9 @@ public interface ScriptWeaver {
             if (attr instanceof AggregateFunctionAttribute) {
                 return scriptWithAggregate((AggregateFunctionAttribute) attr);
             }
+            if (attr instanceof GroupingFunctionAttribute) {
+                return scriptWithGrouping((GroupingFunctionAttribute) attr);
+            }
             if (attr instanceof FieldAttribute) {
                 return scriptWithField((FieldAttribute) attr);
             }
@@ -83,6 +87,16 @@ public interface ScriptWeaver {
                 dataType());
     }
 
+    default ScriptTemplate scriptWithGrouping(GroupingFunctionAttribute grouping) {
+        String template = "{}";
+        if (grouping.dataType() == DataType.DATE) {
+            template = "{sql}.asDateTime({})";
+        }
+        return new ScriptTemplate(processScript(template),
+                paramsBuilder().grouping(grouping).build(),
+                dataType());
+    }
+    
     default ScriptTemplate scriptWithField(FieldAttribute field) {
         return new ScriptTemplate(processScript("doc[{}].value"),
                 paramsBuilder().variable(field.name()).build(),

+ 1 - 1
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/planner/QueryFolder.java

@@ -281,7 +281,7 @@ class QueryFolder extends RuleExecutor<PhysicalPlan> {
                                 // found match for expression; if it's an attribute or scalar, end the processing chain with
                                 // the reference to the backing agg
                                 if (matchingGroup != null) {
-                                    if (exp instanceof Attribute || exp instanceof ScalarFunction) {
+                                    if (exp instanceof Attribute || exp instanceof ScalarFunction || exp instanceof GroupingFunction) {
                                         Processor action = null;
                                         ZoneId zi = DataType.DATE == exp.dataType() ? DateUtils.UTC : null;
                                         /*

+ 2 - 2
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/planner/QueryTranslator.java

@@ -277,7 +277,7 @@ final class QueryTranslator {
                             if (h.dataType() == DataType.DATE) {
                                 long intervalAsMillis = Intervals.inMillis(h.interval());
                                 // TODO: set timezone
-                                if (field instanceof FieldAttribute || field instanceof DateTimeHistogramFunction) {
+                                if (field instanceof FieldAttribute) {
                                     key = new GroupByDateHistogram(aggId, nameOf(field), intervalAsMillis, h.zoneId());
                                 } else if (field instanceof Function) {
                                     key = new GroupByDateHistogram(aggId, ((Function) field).asScript(), intervalAsMillis, h.zoneId());
@@ -285,7 +285,7 @@ final class QueryTranslator {
                             }
                             // numeric histogram
                             else {
-                                if (field instanceof FieldAttribute || field instanceof DateTimeHistogramFunction) {
+                                if (field instanceof FieldAttribute) {
                                     key = new GroupByNumericHistogram(aggId, nameOf(field), Foldables.doubleValueOf(h.interval()));
                                 } else if (field instanceof Function) {
                                     key = new GroupByNumericHistogram(aggId, ((Function) field).asScript(),

+ 0 - 9
x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/querydsl/agg/AggFilter.java

@@ -11,7 +11,6 @@ import org.elasticsearch.xpack.sql.expression.gen.script.ScriptTemplate;
 import org.elasticsearch.xpack.sql.expression.gen.script.Scripts;
 import org.elasticsearch.xpack.sql.util.Check;
 
-import java.util.Collection;
 import java.util.Map;
 import java.util.Objects;
 
@@ -32,14 +31,6 @@ public class AggFilter extends PipelineAgg {
         this.aggPaths = scriptTemplate.aggPaths();
     }
 
-    public Map<String, String> aggPaths() {
-        return aggPaths;
-    }
-
-    public Collection<String> aggRefs() {
-        return scriptTemplate.aggRefs();
-    }
-
     public ScriptTemplate scriptTemplate() {
         return scriptTemplate;
     }

+ 20 - 1
x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/analysis/analyzer/VerifierErrorMessagesTests.java

@@ -481,8 +481,27 @@ public class VerifierErrorMessagesTests extends ESTestCase {
     }
 
     public void testHistogramInFilter() {
-        assertEquals("1:63: Cannot use WHERE filtering on grouping function [HISTOGRAM(date)], use HAVING instead",
+        assertEquals("1:63: Cannot filter on grouping function [HISTOGRAM(date)], use its argument instead",
                 error("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test WHERE "
                         + "HISTOGRAM(date, INTERVAL 1 MONTH) > CAST('2000-01-01' AS DATE) GROUP BY h"));
     }
+
+    // related https://github.com/elastic/elasticsearch/issues/36853
+    public void testHistogramInHaving() {
+        assertEquals("1:75: Cannot filter on grouping function [h], use its argument instead",
+                error("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test GROUP BY h HAVING "
+                        + "h > CAST('2000-01-01' AS DATE)"));
+    }
+
+    public void testGroupByScalarOnTopOfGrouping() {
+        assertEquals(
+                "1:14: Cannot combine [HISTOGRAM(date)] grouping function inside GROUP BY, "
+                + "found [MONTH_OF_YEAR(HISTOGRAM(date) [Z])]; consider moving the expression inside the histogram",
+                error("SELECT MONTH(HISTOGRAM(date, INTERVAL 1 MONTH)) AS h FROM test GROUP BY h"));
+    }
+
+    public void testAggsInHistogram() {
+        assertEquals("1:47: Cannot use an aggregate [MAX] for grouping",
+                error("SELECT MAX(date) FROM test GROUP BY HISTOGRAM(MAX(int), 1)"));
+    }
 }

+ 3 - 2
x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/tree/NodeSubclassTests.java

@@ -92,8 +92,9 @@ import static org.mockito.Mockito.mock;
  */
 public class NodeSubclassTests<T extends B, B extends Node<B>> extends ESTestCase {
 
-    private static final List<Class<? extends Node<?>>> CLASSES_WITH_MIN_TWO_CHILDREN = Arrays.asList(
-        IfNull.class, In.class, InPipe.class, Percentile.class, Percentiles.class, PercentileRanks.class);
+
+    private static final List<Class<?>> CLASSES_WITH_MIN_TWO_CHILDREN = Arrays.<Class<?>> asList(IfNull.class, In.class, InPipe.class,
+            Percentile.class, Percentiles.class, PercentileRanks.class);
 
     private final Class<T> subclass;