
[DOCS] [TEST] enhancement - added CONSOLE scripts for sampler aggs (#22869)

Added missing CONSOLE scripts to documentation for sampler and diversified_sampler aggs.
Includes new StackOverflow index setup in build.gradle

Closes #22746

* Formatting tweaks
markharwood 8 years ago
commit c0d525b108

+ 56 - 2
docs/build.gradle

@@ -24,7 +24,6 @@ apply plugin: 'elasticsearch.docs-test'
  * only remove entries from this list. When it is empty we'll remove it
  * entirely and have a party! There will be cake and everything.... */
 buildRestTests.expectedUnconvertedCandidates = [
-  'reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc',
   'reference/aggregations/bucket/geodistance-aggregation.asciidoc',
   'reference/aggregations/bucket/geohashgrid-aggregation.asciidoc',
   'reference/aggregations/bucket/histogram-aggregation.asciidoc',
@@ -33,7 +32,6 @@ buildRestTests.expectedUnconvertedCandidates = [
   'reference/aggregations/bucket/nested-aggregation.asciidoc',
   'reference/aggregations/bucket/range-aggregation.asciidoc',
   'reference/aggregations/bucket/reverse-nested-aggregation.asciidoc',
-  'reference/aggregations/bucket/sampler-aggregation.asciidoc',
   'reference/aggregations/bucket/significantterms-aggregation.asciidoc',
   'reference/aggregations/bucket/terms-aggregation.asciidoc',
   'reference/aggregations/matrix/stats-aggregation.asciidoc',
@@ -386,3 +384,59 @@ buildRestTests.setups['index_boost'] = '''
         index: index1
         name: alias1
 '''
+// Used by sampler and diversified-sampler aggregation docs
+buildRestTests.setups['stackoverflow'] = '''
+  - do:
+      indices.create:
+        index: stackoverflow
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 1
+          mappings:
+            question:
+              properties:
+                author:
+                  type: keyword
+                tags:
+                  type: keyword
+  - do:
+      bulk:
+        index: stackoverflow
+        type: question
+        refresh: true
+        body: |'''
+
+// Make Kibana strongly connected to elasticsearch and logstash
+// Make Kibana rarer (and therefore higher-ranking) than Javascript
+// Make Javascript strongly connected to jquery and angular
+// Make Cabana strongly connected to elasticsearch but only as a result of a single author
+
+for (int i = 0; i < 150; i++) {
+  buildRestTests.setups['stackoverflow'] += """
+          {"index":{}}
+          {"author": "very_relevant_$i", "tags": ["elasticsearch", "kibana"]}"""
+}
+for (int i = 0; i < 50; i++) {
+  buildRestTests.setups['stackoverflow'] += """
+          {"index":{}}
+          {"author": "very_relevant_$i", "tags": ["logstash", "kibana"]}"""
+}
+for (int i = 0; i < 200; i++) {
+  buildRestTests.setups['stackoverflow'] += """
+          {"index":{}}
+          {"author": "partially_relevant_$i", "tags": ["javascript", "jquery"]}"""
+}
+for (int i = 0; i < 200; i++) {
+  buildRestTests.setups['stackoverflow'] += """
+          {"index":{}}
+          {"author": "partially_relevant_$i", "tags": ["javascript", "angular"]}"""
+}
+for (int i = 0; i < 50; i++) {
+  buildRestTests.setups['stackoverflow'] += """
+          {"index":{}}
+          {"author": "noisy author", "tags": ["elasticsearch", "cabana"]}"""
+}
+buildRestTests.setups['stackoverflow'] += """
+"""
+

+ 105 - 61
docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc

@@ -3,35 +3,51 @@
 
 experimental[]
 
-A filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents. Diversity settings are 
-used to limit the number of matches that share a common value such as an "author".
+Like the `sampler` aggregation, this is a filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents.
+The `diversified_sampler` aggregation adds the ability to limit the number of matches that share a common value such as an "author".
+
+NOTE: Any good market researcher will tell you that when working with samples of data it is important
+that the sample represents a healthy variety of opinions rather than being skewed by any single voice.
+The same is true with aggregations: sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography,
+a large spike in a timeline or an over-active forum spammer).
+
 
 .Example use cases:
 * Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches
 * Removing bias from analytics by ensuring fair representation of content from different sources
 * Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms`
  
+A choice of `field` or `script` setting provides the values used for de-duplication, and the `max_docs_per_value` setting controls the maximum
+number of documents collected on any one shard which share a common value. The default setting for `max_docs_per_value` is 1.
+
+The aggregation will throw an error if the choice of `field` or `script` produces multiple values for a single document (de-duplication using multi-valued fields is not supported due to efficiency concerns).
+
 
 Example:
 
+We might want to see which tags are strongly associated with `#elasticsearch` on StackOverflow
+forum posts, while ignoring the effects of some prolific users with a tendency to misspell `#Kibana` as `#Cabana`.
+
 [source,js]
 --------------------------------------------------
+POST /stackoverflow/_search?size=0
 {
     "query": {
-        "match": {
-            "text": "iphone"
+        "query_string": {
+            "query": "tags:elasticsearch"
         }
     },
     "aggs": {
-        "sample": {
+        "my_unbiased_sample": {
             "diversified_sampler": {
                 "shard_size": 200,
-                "field" : "user.id"   
+                "field" : "author"   
             },
             "aggs": {
                 "keywords": {
                     "significant_terms": {
-                        "field": "text"
+                        "field": "tags",
+                        "exclude": ["elasticsearch"]
                     }
                 }
             }
@@ -39,6 +55,8 @@ Example:
     }
 }
 --------------------------------------------------
+// CONSOLE
+// TEST[setup:stackoverflow]
 
 Response:
 
@@ -46,92 +64,118 @@ Response:
 --------------------------------------------------
 {
     ...
-        "aggregations": {
-        "sample": {
+    "aggregations": {
+        "my_unbiased_sample": {
             "doc_count": 1000,<1>
             "keywords": {<2>
                 "doc_count": 1000,
                 "buckets": [
-                    ...
                     {
-                        "key": "bend",
-                        "doc_count": 58,
-                        "score": 37.982536582524276,
-                        "bg_count": 103
-                    },
-                    ....
+                        "key": "kibana",
+                        "doc_count": 150,
+                        "score": 2.213,
+                        "bg_count": 200
+                    }
+                ]
+            }
+        }
+    }
 }
 --------------------------------------------------
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+// TESTRESPONSE[s/1000/151/]
+// TESTRESPONSE[s/2.213/$body.aggregations.my_unbiased_sample.keywords.buckets.0.score/]
 
 <1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded.
-<2> The results of the significant_terms aggregation are not skewed by any single over-active Twitter user because we asked for a maximum of one tweet from any one user in our sample.
-
-
-==== shard_size
-
-The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.
-The default value is 100.
-
-==== Controlling diversity
-=`field` or `script` and `max_docs_per_value` settings are used to control the maximum number of documents collected on any one shard which share a common value.
-The choice of value (e.g. `author`) is loaded from a regular `field` or derived dynamically by a `script`.
-
-The aggregation will throw an error if the choice of field or script produces multiple values for a document.
-It is currently not possible to offer this form of de-duplication using many values, primarily due to concerns over efficiency.
-
-NOTE: Any good market researcher will tell you that when working with samples of data it is important
-that the sample represents a healthy variety of opinions rather than being skewed by any single voice.
-The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, a large spike in a timeline or an over-active forum spammer).  
+<2> The results of the significant_terms aggregation are not skewed by any single author's quirks because we asked for a maximum of one post from any one author in our sample.
 
-==== Field
+==== Scripted example
 
-Controlling diversity using a field:
+In this scenario we might want to diversify on a combination of field values. We can use a `script` to produce a hash of the
+multiple values in the `tags` field to ensure we don't have a sample that consists of the same repeated combinations of tags.
 
 [source,js]
 --------------------------------------------------
+POST /stackoverflow/_search?size=0
 {
-    "aggs" : {
-        "sample" : {
-            "diversified_sampler" : {
-                "field" : "author",
-                "max_docs_per_value" : 3
+    "query": {
+        "query_string": {
+            "query": "tags:kibana"
+        }
+    },
+    "aggs": {
+        "my_unbiased_sample": {
+            "diversified_sampler": {
+                "shard_size": 200,
+                "max_docs_per_value" : 3,
+                "script" : {
+                    "lang": "painless",
+                    "inline": "doc['tags'].values.hashCode()"
+                }   
+            },
+            "aggs": {
+                "keywords": {
+                    "significant_terms": {
+                        "field": "tags",
+                        "exclude": ["kibana"]
+                    }
+                }
             }
         }
     }
 }
 --------------------------------------------------
+// CONSOLE
+// TEST[setup:stackoverflow]
 
-Note that the `max_docs_per_value` setting applies on a per-shard basis only for the purposes of shard-local sampling.
-It is not intended as a way of providing a global de-duplication feature on search results.
-
-
-
-==== Script
-
-Controlling diversity using a script:
+Response:
 
 [source,js]
 --------------------------------------------------
 {
-    "aggs" : {
-        "sample" : {
-            "diversified_sampler" : {
-                "script" : { 
-                    "lang" : "painless",
-                    "inline" : "doc['author'].value + '/' + doc['genre'].value"
-                }
+    ...
+    "aggregations": {
+        "my_unbiased_sample": {
+            "doc_count": 1000,<1>
+            "keywords": {<2>
+                "doc_count": 1000,
+                "buckets": [
+                    {
+                        "key": "logstash",
+                        "doc_count": 3,
+                        "score": 2.213,
+                        "bg_count": 50
+                    },
+                    {
+                        "key": "elasticsearch",
+                        "doc_count": 3,
+                        "score": 1.34,
+                        "bg_count": 200
+                    }
+                ]
             }
         }
     }
 }
 --------------------------------------------------
-Note in the above example we chose to use the default `max_docs_per_value` setting of 1 and combine author and genre fields to ensure 
-each shard sample has, at most, one match for an author/genre pair.
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+// TESTRESPONSE[s/1000/6/]
+// TESTRESPONSE[s/2.213/$body.aggregations.my_unbiased_sample.keywords.buckets.0.score/]
+// TESTRESPONSE[s/1.34/$body.aggregations.my_unbiased_sample.keywords.buckets.1.score/]
+
+==== shard_size
+
+The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.
+The default value is 100.
+
+==== max_docs_per_value
+The `max_docs_per_value` is an optional parameter that limits how many documents are permitted per choice of de-duplicating value.
+The default setting is `1`.
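+
+For example, to allow up to three posts per author in each shard-local sample (a minimal sketch; the values are illustrative):
+
+[source,js]
+--------------------------------------------------
+POST /stackoverflow/_search?size=0
+{
+    "aggs": {
+        "my_unbiased_sample": {
+            "diversified_sampler": {
+                "field": "author",
+                "max_docs_per_value": 3
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[setup:stackoverflow]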
 
 
 ==== execution_hint
 
-When using the settings to control diversity, the optional `execution_hint` setting can influence the management of the values used for de-duplication.
+The optional `execution_hint` setting can influence the management of the values used for de-duplication.
 Each option will hold up to `shard_size` values in memory while performing de-duplication but the type of value held can be controlled as follows:
  
  - hold field values directly (`map`)
@@ -145,12 +189,12 @@ Please note that Elasticsearch will ignore the choice of execution hint if it is
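+
+For example, a minimal sketch that holds the author values directly (the choice of the `map` hint here is illustrative, not a recommendation):
+
+[source,js]
+--------------------------------------------------
+POST /stackoverflow/_search?size=0
+{
+    "query": {
+        "query_string": {
+            "query": "tags:elasticsearch"
+        }
+    },
+    "aggs": {
+        "my_unbiased_sample": {
+            "diversified_sampler": {
+                "shard_size": 200,
+                "field": "author",
+                "execution_hint": "map"
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[setup:stackoverflow]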
 ==== Limitations
 
 ===== Cannot be nested under `breadth_first` aggregations
-Being a quality-based filter the sampler aggregation needs access to the relevance score produced for each document.
+Being a quality-based filter, the `diversified_sampler` aggregation needs access to the relevance score produced for each document.
 It therefore cannot be nested under a `terms` aggregation which has the `collect_mode` switched from the default `depth_first` mode to `breadth_first` as this discards scores.
 In this situation an error will be thrown.
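+
+For example, a request shaped like this sketch (the field choices are illustrative) would be rejected with an error:
+
+[source,js]
+--------------------------------------------------
+POST /stackoverflow/_search?size=0
+{
+    "aggs": {
+        "tags": {
+            "terms": {
+                "field": "tags",
+                "collect_mode": "breadth_first"
+            },
+            "aggs": {
+                "sample": {
+                    "diversified_sampler": {
+                        "field": "author"
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------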
 
 ===== Limited de-dup logic.
-The de-duplication logic in the diversify settings applies only at a shard level so will not apply across shards.
+The de-duplication logic applies only at a shard level, so it will not apply across shards.
 
 ===== No specialized syntax for geo/date fields
 Currently the syntax for defining the diversifying values is defined by a choice of `field` or

+ 96 - 10
docs/reference/aggregations/bucket/sampler-aggregation.asciidoc

@@ -12,12 +12,18 @@ A filtering aggregation used to limit any sub aggregations' processing to a samp
 
 Example:
 
+A query on StackOverflow data for the popular term `javascript` OR the rarer term
+`kibana` will match many documents, most of them missing the word Kibana. To focus
+the `significant_terms` aggregation on top-scoring documents that are more likely to match
+the most interesting parts of our query, we use a sample.
+
 [source,js]
 --------------------------------------------------
+POST /stackoverflow/_search?size=0
 {
     "query": {
-        "match": {
-            "text": "iphone"
+        "query_string": {
+            "query": "tags:kibana OR tags:javascript"
         }
     },
     "aggs": {
@@ -28,7 +34,8 @@ Example:
             "aggs": {
                 "keywords": {
                     "significant_terms": {
-                        "field": "text"
+                        "field": "tags",
+                        "exclude": ["kibana", "javascript"]
                     }
                 }
             }
@@ -36,6 +43,8 @@ Example:
     }
 }
 --------------------------------------------------
+// CONSOLE
+// TEST[setup:stackoverflow]
 
 Response:
 
@@ -43,26 +52,103 @@ Response:
 --------------------------------------------------
 {
     ...
-        "aggregations": {
+    "aggregations": {
         "sample": {
             "doc_count": 1000,<1>
             "keywords": {
                 "doc_count": 1000,
                 "buckets": [
-                    ...
                     {
-                        "key": "bend",
-                        "doc_count": 58,
-                        "score": 37.982536582524276,
-                        "bg_count": 103
+                        "key": "elasticsearch",
+                        "doc_count": 150,
+                        "score": 1.078125,
+                        "bg_count": 200
                     },
-                    ....
+                    {
+                        "key": "logstash",
+                        "doc_count": 50,
+                        "score": 0.5625,
+                        "bg_count": 50
+                    }
+                ]
+            }
+        }
+    }
 }
 --------------------------------------------------
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+// TESTRESPONSE[s/1000/200/]
 
 <1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded.
 
 
+Without the `sampler` aggregation, the request query considers the full "long tail" of low-quality matches and therefore identifies
+less-significant terms such as `jquery` and `angular` rather than focusing on the more insightful Kibana-related terms.
+
+
+[source,js]
+--------------------------------------------------
+POST /stackoverflow/_search?size=0
+{
+    "query": {
+        "query_string": {
+            "query": "tags:kibana OR tags:javascript"
+        }
+    },
+    "aggs": {
+        "low_quality_keywords": {
+            "significant_terms": {
+                "field": "tags",
+                "size": 3,
+                "exclude": ["kibana", "javascript"]
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[setup:stackoverflow]
+
+Response:
+
+[source,js]
+--------------------------------------------------
+{
+    ...
+    "aggregations": {
+        "low_quality_keywords": {
+            "doc_count": 1000,
+            "buckets": [
+                {
+                    "key": "angular",
+                    "doc_count": 200,
+                    "score": 0.02777,
+                    "bg_count": 200
+                },
+                {
+                    "key": "jquery",
+                    "doc_count": 200,
+                    "score": 0.02777,
+                    "bg_count": 200
+                },
+                {
+                    "key": "logstash",
+                    "doc_count": 50,
+                    "score": 0.0069,
+                    "bg_count": 50
+                }
+            ]
+        }
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/\.\.\./"took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,/]
+// TESTRESPONSE[s/1000/600/]
+// TESTRESPONSE[s/0.02777/$body.aggregations.low_quality_keywords.buckets.0.score/]
+// TESTRESPONSE[s/0.0069/$body.aggregations.low_quality_keywords.buckets.2.score/]
+
+
+
 ==== shard_size
 
 The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.
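+
+For example, a minimal sketch collecting at most 50 top-scoring documents per shard (the value is illustrative):
+
+[source,js]
+--------------------------------------------------
+POST /stackoverflow/_search?size=0
+{
+    "query": {
+        "query_string": {
+            "query": "tags:kibana"
+        }
+    },
+    "aggs": {
+        "sample": {
+            "sampler": {
+                "shard_size": 50
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[setup:stackoverflow]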