2 years ago · 82d67dc289
--- a/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc
+++ b/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc
@@ -77,6 +77,8 @@ https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/
 
				 algorithm, which counts based on the hashes of the values with some interesting
			
 
				 properties:
			
 
				 
			
 
				+// tag::explanation[]
			
 
				+
			
 
				  * configurable precision, which decides on how to trade memory for accuracy,
			
 
				  * excellent accuracy on low-cardinality sets,
			
 
				  * fixed memory usage: no matter if there are tens or billions of unique values,
			
@@ -157,9 +159,11 @@ accuracy. Also note that even with a threshold as low as 100, the error
 
				 remains very low (1-6% as seen in the above graph) even when counting millions of items.
			
 
				 
			
 
				 The HyperLogLog++ algorithm depends on the leading zeros of hashed
			
 
				-values, the exact distributions of hashes in a dataset can affect the 
			
 
				+values, so the exact distributions of hashes in a dataset can affect the
			
 
				 accuracy of the cardinality.
			
 
				 
			
 
				+// end::explanation[]
			
 
				+
			
 
				 ==== Pre-computed hashes
			
 
				 
			
 
				 On string fields that have a high cardinality, it might be faster to store the
			
@@ -249,7 +253,7 @@ There are different mechanisms by which cardinality aggregations can be executed
 
				 
			
 
				 Additionally, there are two "heuristic based" modes.  These modes will cause
			
 
				 Elasticsearch to use some data about the state of the index to choose an
			
 
				-appropriate execution method.  The two heuristics are: 
			
 
				+appropriate execution method.  The two heuristics are:
			
 
				  - `save_time_heuristic` - this is the default in Elasticsearch 8.4 and later.
			
 
				  - `save_memory_heuristic` - this was the default in Elasticsearch 8.3 and
			
 
				    earlier
			
--- a/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc
+++ b/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc
@@ -220,6 +220,7 @@ GET latency/_search
 
				 [[search-aggregations-metrics-percentile-aggregation-approximation]]
			
 
				 ==== Percentiles are (usually) approximate
			
 
				 
			
 
				+// tag::approximate[]
			
 
				 There are many different algorithms to calculate percentiles. The naive
			
 
				 implementation simply stores all the values in a sorted array. To find the 50th
			
 
				 percentile, you simply find the value that is at `my_array[count(my_array) * 0.5]`.
			
@@ -254,6 +255,8 @@ for large number of values is that the law of large numbers makes the distributi
 
				 values more and more uniform and the t-digest tree can do a better job at summarizing
			
 
				 it. It would not be the case on more skewed distributions.
			
 
				 
			
 
				+// end::approximate[]
			
 
				+
			
 
				 [WARNING]
			
 
				 ====
			
 
				 Percentile aggregations are also
			
--- a/docs/reference/esql/aggregation-functions.asciidoc
+++ b/docs/reference/esql/aggregation-functions.asciidoc
@@ -0,0 +1,32 @@
 
				+[[esql-agg-functions]]
			
 
				+== ESQL aggregation functions
			
 
				+
			
 
				+++++
			
 
				+<titleabbrev>Aggregation functions</titleabbrev>
			
 
				+++++
			
 
				+:keywords: {es}, ESQL, {es} query language, functions
			
 
				+:description: ESQL supports various functions for calculating values.
			
 
				+
			
 
				+<<esql-stats-by>> supports these functions:
			
 
				+
			
 
				+// tag::functions[]
			
 
				+* <<esql-agg-avg>>
			
 
				+* <<esql-agg-count>>
			
 
				+* <<esql-agg-count-distinct>>
			
 
				+* <<esql-agg-max>>
			
 
				+* <<esql-agg-median>>
			
 
				+* <<esql-agg-median-absolute-deviation>>
			
 
				+* <<esql-agg-min>>
			
 
				+* <<esql-agg-percentile>>
			
 
				+* <<esql-agg-sum>>
			
 
				+// end::functions[]
			
 
				+
			
 
				+include::aggregation-functions/avg.asciidoc[]
			
 
				+include::aggregation-functions/count.asciidoc[]
			
 
				+include::aggregation-functions/count-distinct.asciidoc[]
			
 
				+include::aggregation-functions/max.asciidoc[]
			
 
				+include::aggregation-functions/median.asciidoc[]
			
 
				+include::aggregation-functions/median-absolute-deviation.asciidoc[]
			
 
				+include::aggregation-functions/min.asciidoc[]
			
 
				+include::aggregation-functions/percentile.asciidoc[]
			
 
				+include::aggregation-functions/sum.asciidoc[]
			
--- a/docs/reference/esql/aggregation-functions/avg.asciidoc
+++ b/docs/reference/esql/aggregation-functions/avg.asciidoc
@@ -0,0 +1,14 @@
 
				+[[esql-agg-avg]]
			
 
				+=== `AVG`
			
 
				+The average of a numeric field.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats.csv-spec[tag=avg]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats.csv-spec[tag=avg-result]
			
 
				+|===
			
 
				+
			
 
				+The result is always a `double` no matter the input type.
			
--- a/docs/reference/esql/aggregation-functions/count-distinct.asciidoc
+++ b/docs/reference/esql/aggregation-functions/count-distinct.asciidoc
@@ -0,0 +1,43 @@
 
				+[[esql-agg-count-distinct]]
			
 
				+=== `COUNT_DISTINCT`
			
 
				+The approximate number of distinct values.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-result]
			
 
				+|===
			
 
				+
			
 
				+Can take any field type as input and the result is always a `long` no matter
			
 
				+the input type.
			
 
				+
			
 
				+==== Counts are approximate
			
 
				+
			
 
				+Computing exact counts requires loading values into a set and returning its
			
 
				+size. This doesn't scale when working on high-cardinality sets and/or large
			
 
				+values as the required memory usage and the need to communicate those
			
 
				+per-shard sets between nodes would utilize too many resources of the cluster.
			
 
				+
			
 
				+This `COUNT_DISTINCT` function is based on the
			
 
				+https://static.googleusercontent.com/media/research.google.com/fr//pubs/archive/40671.pdf[HyperLogLog++]
			
 
				+algorithm, which counts based on the hashes of the values with some interesting
			
 
				+properties:
			
 
				+
			
 
				+include::../../aggregations/metrics/cardinality-aggregation.asciidoc[tag=explanation]
			
 
				+
			
 
				+==== Precision is configurable
			
 
				+
			
 
				+The `COUNT_DISTINCT` function takes an optional second parameter to configure the
			
 
				+precision discussed previously.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-precision]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats_count_distinct.csv-spec[tag=count-distinct-precision-result]
			
 
				+|===
			
--- a/docs/reference/esql/aggregation-functions/count.asciidoc
+++ b/docs/reference/esql/aggregation-functions/count.asciidoc
@@ -0,0 +1,18 @@
 
				+[[esql-agg-count]]
			
 
				+=== `COUNT`
			
 
				+Counts field values.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats.csv-spec[tag=count]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats.csv-spec[tag=count-result]
			
 
				+|===
			
 
				+
			
 
				+Can take any field type as input and the result is always a `long` no matter
			
 
				+the input type.
			
 
				+
			
 
				+NOTE: There isn't yet a `COUNT(*)`. Please count a single-valued field if you
			
 
				+      need a count of rows.
			
--- a/docs/reference/esql/aggregation-functions/max.asciidoc
+++ b/docs/reference/esql/aggregation-functions/max.asciidoc
@@ -0,0 +1,12 @@
 
				+[[esql-agg-max]]
			
 
				+=== `MAX`
			
 
				+The maximum value of a numeric field.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats.csv-spec[tag=max]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats.csv-spec[tag=max-result]
			
 
				+|===
			
--- a/docs/reference/esql/aggregation-functions/median-absolute-deviation.asciidoc
+++ b/docs/reference/esql/aggregation-functions/median-absolute-deviation.asciidoc
@@ -0,0 +1,28 @@
 
				+[[esql-agg-median-absolute-deviation]]
			
 
				+=== `MEDIAN_ABSOLUTE_DEVIATION`
			
 
				+The median absolute deviation, a measure of variability. It is a robust
			
 
				+statistic, meaning that it is useful for describing data that may have outliers,
			
 
				+or may not be normally distributed. For such data it can be more descriptive than
			
 
				+standard deviation.
			
 
				+
			
 
				+It is calculated as the median of each data point’s deviation from the median of
			
 
				+the entire sample. That is, for a random variable `X`, the median absolute deviation
			
 
				+is `median(|median(X) - Xi|)`.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats_percentile.csv-spec[tag=median-absolute-deviation]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats_percentile.csv-spec[tag=median-absolute-deviation-result]
			
 
				+|===
			
 
				+
			
 
				+NOTE: Like <<esql-agg-percentile>>, `MEDIAN_ABSOLUTE_DEVIATION` is
			
 
				+      <<esql-agg-percentile-approximate,usually approximate>>.
			
 
				+
			
 
				+[WARNING]
			
 
				+====
			
 
				+`MEDIAN_ABSOLUTE_DEVIATION` is also {wikipedia}/Nondeterministic_algorithm[non-deterministic].
			
 
				+This means you can get slightly different results using the same data.
			
 
				+====
			
--- a/docs/reference/esql/aggregation-functions/median.asciidoc
+++ b/docs/reference/esql/aggregation-functions/median.asciidoc
@@ -0,0 +1,21 @@
 
				+[[esql-agg-median]]
			
 
				+=== `MEDIAN`
			
 
				+The value that is greater than half of all values and less than half of
			
 
				+all values, also known as the 50% <<esql-agg-percentile>>.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats_percentile.csv-spec[tag=median]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats_percentile.csv-spec[tag=median-result]
			
 
				+|===
			
 
				+
			
 
				+NOTE: Like <<esql-agg-percentile>>, `MEDIAN` is <<esql-agg-percentile-approximate,usually approximate>>.
			
 
				+
			
 
				+[WARNING]
			
 
				+====
			
 
				+`MEDIAN` is also {wikipedia}/Nondeterministic_algorithm[non-deterministic].
			
 
				+This means you can get slightly different results using the same data.
			
 
				+====
			
--- a/docs/reference/esql/aggregation-functions/min.asciidoc
+++ b/docs/reference/esql/aggregation-functions/min.asciidoc
@@ -0,0 +1,12 @@
 
				+[[esql-agg-min]]
			
 
				+=== `MIN`
			
 
				+The minimum value of a numeric field.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats.csv-spec[tag=min]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats.csv-spec[tag=min-result]
			
 
				+|===
			
--- a/docs/reference/esql/aggregation-functions/percentile.asciidoc
+++ b/docs/reference/esql/aggregation-functions/percentile.asciidoc
@@ -0,0 +1,28 @@
 
				+[[esql-agg-percentile]]
			
 
				+=== `PERCENTILE`
			
 
				+The value at which a certain percentage of observed values occur. For example,
			
 
				+the 95th percentile is the value which is greater than 95% of the observed values and
			
 
				+the 50th percentile is the <<esql-agg-median>>.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats_percentile.csv-spec[tag=percentile]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats_percentile.csv-spec[tag=percentile-result]
			
 
				+|===
			
 
				+
			
 
				+[[esql-agg-percentile-approximate]]
			
 
				+==== `PERCENTILE` is (usually) approximate
			
 
				+
			
 
				+include::../../aggregations/metrics/percentile-aggregation.asciidoc[tag=approximate]
			
 
				+
			
 
				+[WARNING]
			
 
				+====
			
 
				+`PERCENTILE` is also {wikipedia}/Nondeterministic_algorithm[non-deterministic].
			
 
				+This means you can get slightly different results using the same data.
			
 
				+====
			
 
				+
			
 
				+
			
 
				+
			
--- a/docs/reference/esql/aggregation-functions/sum.asciidoc
+++ b/docs/reference/esql/aggregation-functions/sum.asciidoc
@@ -0,0 +1,12 @@
 
				+[[esql-agg-sum]]
			
 
				+=== `SUM`
			
 
				+The sum of a numeric field.
			
 
				+
			
 
				+[source.merge.styled,esql]
			
 
				+----
			
 
				+include::{esql-specs}/stats.csv-spec[tag=sum]
			
 
				+----
			
 
				+[%header.monospaced.styled,format=dsv,separator=|]
			
 
				+|===
			
 
				+include::{esql-specs}/stats.csv-spec[tag=sum-result]
			
 
				+|===
			
--- a/docs/reference/esql/index.asciidoc
+++ b/docs/reference/esql/index.asciidoc
@@ -129,6 +129,8 @@ include::esql-processing-commands.asciidoc[]
 
				 
			
 
				 include::esql-functions.asciidoc[]
			
 
				 
			
 
				+include::aggregation-functions.asciidoc[]
			
 
				+
			
 
				 include::multivalued-fields.asciidoc[]
			
 
				 
			
 
				 :esql-tests!:
			
--- a/docs/reference/esql/processing-commands/stats.asciidoc
+++ b/docs/reference/esql/processing-commands/stats.asciidoc
@@ -41,11 +41,4 @@ include::{esql-specs}/docs.csv-spec[tag=statsGroupByMultipleValues]
 
				 
			
 
				 The following aggregation functions are supported:
			
 
				 
			
 
				-* `AVG`
			
 
				-* `COUNT`
			
 
				-* `COUNT_DISTINCT`
			
 
				-* `MAX`
			
 
				-* `MEDIAN`
			
 
				-* `MEDIAN_ABSOLUTE_DEVIATION`
			
 
				-* `MIN`
			
 
				-* `SUM`
			
 
				+include::../aggregation-functions.asciidoc[tag=functions]
			
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats.csv-spec
@@ -6,10 +6,27 @@ l:long
 
				 ;
			
 
				 
			
 
				 maxOfInteger
			
 
				-from employees | stats l = max(languages);
			
 
				+// tag::max[]
			
 
				+FROM employees
			
 
				+| STATS MAX(languages);
			
 
				+// end::max[]
			
 
				 
			
 
				-l:integer
			
 
				+// tag::max-result[]
			
 
				+MAX(languages):integer
			
 
				 5
			
 
				+// end::max-result[]
			
 
				+;
			
 
				+
			
 
				+minOfInteger
			
 
				+// tag::min[]
			
 
				+FROM employees
			
 
				+| STATS MIN(languages);
			
 
				+// end::min[]
			
 
				+
			
 
				+// tag::min-result[]
			
 
				+MIN(languages):integer
			
 
				+1
			
 
				+// end::min-result[]
			
 
				 ;
			
 
				 
			
 
				 maxOfShort
			
@@ -114,10 +131,16 @@ l:double
 
				 ;
			
 
				 
			
 
				 avgOfDouble
			
 
				-from employees | stats h = avg(height);
			
 
				+// tag::avg[]
			
 
				+FROM employees
			
 
				+| STATS AVG(height)
			
 
				+// end::avg[]
			
 
				+;
			
 
				 
			
 
				-h:double
			
 
				+// tag::avg-result[]
			
 
				+AVG(height):double
			
 
				 1.7682
			
 
				+// end::avg-result[]
			
 
				 ;
			
 
				 
			
 
				 avgOfFloat
			
@@ -140,6 +163,19 @@ h:double
 
				 1.7682
			
 
				 ;
			
 
				 
			
 
				+countOfDouble
			
 
				+// tag::count[]
			
 
				+FROM employees
			
 
				+| STATS COUNT(height)
			
 
				+// end::count[]
			
 
				+;
			
 
				+
			
 
				+// tag::count-result[]
			
 
				+COUNT(height):long
			
 
				+100
			
 
				+// end::count-result[]
			
 
				+;
			
 
				+
			
 
				 sumOfLong
			
 
				 from employees | stats l = sum(languages.long);
			
 
				 
			
@@ -148,10 +184,15 @@ l:long
 
				 ;
			
 
				 
			
 
				 sumOfInteger
			
 
				-from employees | stats l = sum(languages);
			
 
				+// tag::sum[]
			
 
				+FROM employees
			
 
				+| STATS SUM(languages);
			
 
				+// end::sum[]
			
 
				 
			
 
				-l:long
			
 
				+// tag::sum-result[]
			
 
				+SUM(languages):long
			
 
				 281
			
 
				+// end::sum-result[]
			
 
				 ;
			
 
				 
			
 
				 sumOfByte
			
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats_count_distinct.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats_count_distinct.csv-spec
@@ -75,10 +75,29 @@ g:long | h:long
 
				 ;
			
 
				 
			
 
				 countDistinctOfIp
			
 
				-from hosts | stats h0 = count_distinct(ip0), h1 = count_distinct(ip1);
			
 
				+// tag::count-distinct[]
			
 
				+FROM hosts
			
 
				+| STATS COUNT_DISTINCT(ip0), COUNT_DISTINCT(ip1)
			
 
				+// end::count-distinct[]
			
 
				+;
			
 
				+
			
 
				+// tag::count-distinct-result[]
			
 
				+COUNT_DISTINCT(ip0):long | COUNT_DISTINCT(ip1):long
			
 
				+7                        | 8
			
 
				+// end::count-distinct-result[]
			
 
				+;
			
 
				+
			
 
				+countDistinctOfIpPrecision
			
 
				+// tag::count-distinct-precision[]
			
 
				+FROM hosts
			
 
				+| STATS COUNT_DISTINCT(ip0, 80000), COUNT_DISTINCT(ip1, 5)
			
 
				+// end::count-distinct-precision[]
			
 
				+;
			
 
				 
			
 
				-h0:long   | h1:long
			
 
				-7         | 8
			
 
				+// tag::count-distinct-precision-result[]
			
 
				+COUNT_DISTINCT(ip0,80000):long | COUNT_DISTINCT(ip1,5):long
			
 
				+7                              | 9
			
 
				+// end::count-distinct-precision-result[]
			
 
				 ;
			
 
				 
			
 
				 countDistinctOfDates
			
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats_percentile.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/stats_percentile.csv-spec
@@ -7,10 +7,18 @@ p0:double   | p50:double  | p99:double
 
				 
			
 
				 
			
 
				 percentileOfInteger
			
 
				-from employees | stats p0 = percentile(salary, 0), p50 = percentile(salary, 50), p99 = percentile(salary, 99);
			
 
				+// tag::percentile[]
			
 
				+FROM employees
			
 
				+| STATS p0 = PERCENTILE(salary,  0)
			
 
				+     , p50 = PERCENTILE(salary, 50)
			
 
				+     , p99 = PERCENTILE(salary, 99)
			
 
				+// end::percentile[]
			
 
				+;
			
 
				 
			
 
				+// tag::percentile-result[]
			
 
				 p0:double   | p50:double  | p99:double
			
 
				 25324       | 47003       | 74984.5
			
 
				+// end::percentile-result[]
			
 
				 ;
			
 
				 
			
 
				 
			
@@ -69,14 +77,18 @@ m:double   | p50:double
 
				 0          | 0 
			
 
				 ;
			
 
				 
			
 
				-
			
 
				 medianOfInteger
			
 
				-from employees | stats m = median(salary), p50 = percentile(salary, 50);
			
 
				-
			
 
				-m:double   | p50:double
			
 
				-47003       | 47003    
			
 
				+// tag::median[]
			
 
				+FROM employees
			
 
				+| STATS MEDIAN(salary), PERCENTILE(salary, 50)
			
 
				+// end::median[]
			
 
				 ;
			
 
				 
			
 
				+// tag::median-result[]
			
 
				+MEDIAN(salary):double | PERCENTILE(salary,50):double
			
 
				+47003                 | 47003    
			
 
				+// end::median-result[]
			
 
				+;
			
 
				 
			
 
				 medianOfDouble
			
 
				 from employees | stats m = median(salary_change), p50 = percentile(salary_change, 50);
			
@@ -117,3 +129,16 @@ m:double           | p50:double           | job_positions:keyword
 
				 4.62               | 4.62                 | "Support Engineer"
			
 
				 3.9299999999999997 | 3.9299999999999997   | "Architect"
			
 
				 ;
			
 
				+
			
 
				+medianAbsoluteDeviation
			
 
				+// tag::median-absolute-deviation[]
			
 
				+FROM employees
			
 
				+| STATS MEDIAN(salary), MEDIAN_ABSOLUTE_DEVIATION(salary)
			
 
				+// end::median-absolute-deviation[]
			
 
				+;
			
 
				+
			
 
				+// tag::median-absolute-deviation-result[]
			
 
				+MEDIAN(salary):double | MEDIAN_ABSOLUTE_DEVIATION(salary):double
			
 
				+47003                 | 10096.5
			
 
				+// end::median-absolute-deviation-result[]
			
 
				+;