Browse Source

Adding cardinality support for random_sampler agg (#86838)

This adds support for the `cardinality` aggregation within a random_sampler.

This use case is helpful in determining the ratio of unique values to the total document count within the sampled set.
Benjamin Trent 3 years ago
parent
commit
94f2544998

+ 5 - 0
docs/changelog/86838.yaml

@@ -0,0 +1,5 @@
+pr: 86838
+summary: Adding cardinality support for `random_sampler` agg
+area: Aggregations
+type: enhancement
+issues: []

+ 14 - 0
docs/reference/aggregations/bucket/random-sampler-aggregation.asciidoc

@@ -94,3 +94,17 @@ higher sampling rates, the relative error is still low.
 
 NOTE: This represents the result of aggregations against a typical positively skewed APM data set which also has outliers in the upper tail. The linear dependence of the relative error on the sample size is found to hold widely, but the slope depends on the variation in the quantity being aggregated. As such, the variance in your own data may
       cause relative error rates to increase or decrease at a different rate.
+
+[[random-sampler-special-cases]]
+==== Random sampling special cases
+
+All counts returned by the random sampler aggregation are scaled to ease visualizations and calculations. For example,
+when randomly sampling a <<search-aggregations-bucket-datehistogram-aggregation, date histogram aggregation>>, every
+`doc_count` value for every bucket is scaled by the inverse of the random_sampler `probability` value. So, if `doc_count`
+for a bucket is `10,000` with `probability: 0.1`, the actual number of documents aggregated is `1,000`.
+
+An exception to this is the <<search-aggregations-metrics-cardinality-aggregation, cardinality aggregation>>. Unique item
+counts are not suitable for automatic scaling. When interpreting the cardinality count, compare it
+to the number of sampled docs provided in the top-level `doc_count` within the random_sampler aggregation. This comparison gives
+you an idea of unique values as a percentage of total values. It may not, however, reflect the exact number of unique values
+for the given field.

+ 2 - 2
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/450_random_sampler.yml

@@ -208,7 +208,7 @@ setup:
           }
 
   - do:
-      catch: /\[random_sampler\] aggregation \[sampled\] does not support sampling \[cardinality\] aggregation \[unique\]/
+      catch: /\[random_sampler\] aggregation \[sampled\] does not support sampling \[sampler\] aggregation \[inner_sampler\]/
       search:
         index: data
         size: 0
@@ -219,7 +219,7 @@ setup:
                 "random_sampler": {
                   "probability": 0.1
                 },
-                "aggs": { "unique": {"cardinality": {"field": "product"}}}
+                "aggs": { "inner_sampler": {"sampler": {}}}
               }
             }
           }

+ 5 - 0
server/src/main/java/org/elasticsearch/search/aggregations/metrics/CardinalityAggregationBuilder.java

@@ -109,6 +109,11 @@ public final class CardinalityAggregationBuilder extends ValuesSourceAggregation
         }
     }
 
+    @Override
+    public boolean supportsSampling() {
+        return true;
+    }
+
     @Override
     protected boolean serializeTargetValueType(Version version) {
         return true;

+ 10 - 0
server/src/main/java/org/elasticsearch/search/aggregations/metrics/InternalCardinality.java

@@ -13,6 +13,7 @@ import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.search.aggregations.AggregationReduceContext;
 import org.elasticsearch.search.aggregations.InternalAggregation;
+import org.elasticsearch.search.aggregations.support.SamplingContext;
 import org.elasticsearch.xcontent.XContentBuilder;
 
 import java.io.IOException;
@@ -116,6 +117,15 @@ public final class InternalCardinality extends InternalNumericMetricsAggregation
         return counts.equals(0, other.counts, 0);
     }
 
+    /**
+     * The counts created in cardinality do not lend themselves to being automatically scaled.
+     * Consequently, when finalizing the sampling, nothing is changed and the same object is returned.
+     */
+    @Override
+    public InternalAggregation finalizeSampling(SamplingContext samplingContext) {
+        return this;
+    }
+
     AbstractHyperLogLogPlusPlus getState() {
         return counts;
     }