Browse Source

More debugging info for significant_text (#72727)

Adds some extra debugging information to make it clear that you are
running `significant_text`. Also adds some timing information
around the `_source` fetch and the `terms` accumulation. This lets you
calculate a third useful timing number: the analysis time. It is
`collect_ns - fetch_ns - accumulation_ns`.

This also adds a half dozen extra REST tests to get a *fairly*
comprehensive set of the operations this supports. It doesn't cover all
of the significance heuristic parsing, but it's certainly much better
than what we had.
Nik Everett 4 years ago
parent
commit
a43b166d11

+ 2 - 2
docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc

@@ -374,7 +374,7 @@ Chi square behaves like mutual information and can be configured with the same p
 
 
 ===== Google normalized distance
-Google normalized distance as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007 (https://arxiv.org/pdf/cs/0412098v3.pdf) can be used as significance score by adding the parameter
+Google normalized distance as described in https://arxiv.org/pdf/cs/0412098v3.pdf["The Google Similarity Distance", Cilibrasi and Vitanyi, 2007] can be used as significance score by adding the parameter
 
 [source,js]
 --------------------------------------------------
@@ -408,7 +408,7 @@ Multiple observations are typically required to reinforce a view so it is recomm
 
 Roughly, `mutual_information` prefers high frequent terms even if they occur also frequently in the background. For example, in an analysis of natural language text this might lead to selection of stop words. `mutual_information` is unlikely to select very rare terms like misspellings. `gnd` prefers terms with a high co-occurrence and avoids selection of stopwords. It might be better suited for synonym detection. However, `gnd` has a tendency to select very rare terms that are, for example, a result of misspelling. `chi_square` and `jlh` are somewhat in-between.
 
-It is hard to say which one of the different heuristics will be the best choice as it depends on what the significant terms are used for (see for example [Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997](http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf) for a study on using significant terms for feature selection for text classification).
+It is hard to say which one of the different heuristics will be the best choice as it depends on what the significant terms are used for (see for example http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf[Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997] for a study on using significant terms for feature selection for text classification).
 
 If none of the above measures suits your usecase than another option is to implement a custom significance measure:
 

+ 0 - 151
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml

@@ -1,151 +0,0 @@
----
-"Default index":
-
-  - do:
-      indices.create:
-          index:  goodbad
-          body:
-            settings:
-                number_of_shards: "1"
-            mappings:
-                properties:
-                    text:
-                        type: text
-                        fielddata: false
-                    class:
-                        type: keyword
-
-  - do:
-      index:
-          index:  goodbad
-          id:     1
-          body:   { text: "good", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     2
-          body:   { text: "good", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     3
-          body:   { text: "bad", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     4
-          body:   { text: "bad", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     5
-          body:   { text: "good bad", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     6
-          body:   { text: "good bad", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     7
-          body:   { text: "bad", class: "bad" }
-
-
-
-  - do:
-      indices.refresh:
-        index: [goodbad]
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-
-  - match: {hits.total: 7}
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-        body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text"}}}}}}
-
-  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"}
-  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"}
-
----
-"Dedup noise":
-
-  - do:
-      indices.create:
-          index:  goodbad
-          body:
-            settings:
-                number_of_shards: "1"
-            mappings:
-                properties:
-                    text:
-                        type: text
-                        fielddata: false
-                    class:
-                        type: keyword
-
-  - do:
-      index:
-          index:  goodbad
-          id:     1
-          body:   { text: "good noisewords1 g1 g2 g3 g4 g5 g6", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     2
-          body:   { text: "good  noisewords2 g1 g2 g3 g4 g5 g6", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     3
-          body:   { text: "bad noisewords3 b1 b2 b3 b4 b5 b6", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     4
-          body:   { text: "bad noisewords4 b1 b2 b3 b4 b5 b6", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     5
-          body:   { text: "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     6
-          body:   { text: "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     7
-          body:   { text: "bad noisewords7 b1 b2 b3 b4 b5 b6", class: "bad" }
-
-
-
-  - do:
-      indices.refresh:
-        index: [goodbad]
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-
-  - match: {hits.total: 7}
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-        body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text", "filter_duplicate_text": true}}}}}}
-
-  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"}
-  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
-  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"}
-  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }

+ 525 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_significant_text.yml

@@ -0,0 +1,525 @@
+setup:
+  - do:
+      indices.create:
+        index:  goodbad
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+
+  - do:
+      bulk:
+        index: goodbad
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "text": "good", "class": "good" }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "text": "good", "class": "good" }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "text": "good bad", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "text": "good bad", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "text": "bad", "class": "bad" }'
+
+---
+simple:
+  - do:
+      search:
+        index: goodbad
+        body:
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+
+---
+"Dedup noise":
+  - do:
+      indices.create:
+        index:  noisy
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+
+  - do:
+      bulk:
+        index: noisy
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "text": "good noisewords1 g1 g2 g3 g4 g5 g6", "class": "good" }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "text": "good  noisewords2 g1 g2 g3 g4 g5 g6", "class": "good" }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "text": "bad noisewords3 b1 b2 b3 b4 b5 b6", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "text": "bad noisewords4 b1 b2 b3 b4 b5 b6", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "text": "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "text": "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "text": "bad noisewords7 b1 b2 b3 b4 b5 b6", "class": "bad" }'
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: noisy
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    filter_duplicate_text: true
+  - match: {hits.total: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+
+---
+profile:
+  - skip:
+      version: " - 7.99.99"
+      reason: extra profiling added in 8.0.0 to be backported to 7.14.0
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          profile: true
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+  - match: { profile.shards.0.aggregations.0.description: class }
+  - match: { profile.shards.0.aggregations.0.children.0.type: MapStringTermsAggregator }
+  - match: { profile.shards.0.aggregations.0.children.0.description: sig_text }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.collection_strategy: analyze text from _source }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.result_strategy: significant_terms }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.total_buckets: 4 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.values_fetched: 7 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.chars_fetched: 33 }
+  - gt: { profile.shards.0.aggregations.0.children.0.debug.extract_ns: 0 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.extract_count: 7 }
+  - gt: { profile.shards.0.aggregations.0.children.0.debug.collect_analyzed_ns: 0 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.collect_analyzed_count: 9 }
+
+---
+include:
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    include: bad
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+exclude:
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    exclude: good
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+min_doc_count:
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    min_doc_count: 4
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+size:
+  - do:
+      bulk:
+        index: goodbad
+        refresh: true
+        body:
+          - '{ "index": {"_id": "101"} }'
+          - '{ "text": "caterpillar eat snacks", "class": "good" }'
+          - '{ "index": {"_id": "102"} }'
+          - '{ "text": "caterpillar sick", "class": "good" }'
+          - '{ "index": {"_id": "103"} }'
+          - '{ "text": "caterpillar eat leaf", "class": "good" }'
+          - '{ "index": {"_id": "104"} }'
+          - '{ "text": "caterpillar build cocoon", "class": "good" }'
+          - '{ "index": {"_id": "105"} }'
+          - '{ "text": "caterpillar fly away", "class": "good" }'
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+  - match: {hits.total.value: 12}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: good}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 2 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: caterpillar}
+  - match: {aggregations.class.buckets.0.sig_text.buckets.1.key: good}
+  - match: {aggregations.class.buckets.1.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: bad}
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    size: 1
+  - match: {hits.total.value: 12}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: good}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: caterpillar}
+  - match: {aggregations.class.buckets.1.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: bad}
+
+---
+shard_size:
+  # We can't perform a great test for shard_size without lots of control over
+  # routing here and it isn't worth being that brittle. So we'll just test that
+  # we parse it.
+  - do:
+      search:
+        index: goodbad
+        body:
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    size: 1
+                    shard_size: 1
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+
+---
+significance_heuristics:
+  - do:
+      bulk:
+        index: goodbad
+        refresh: true
+        body:
+          - '{ "index": {"_id": "101"} }'
+          - '{ "text": "caterpillar eat snacks", "class": "good" }'
+          - '{ "index": {"_id": "102"} }'
+          - '{ "text": "caterpillar sick", "class": "good" }'
+          - '{ "index": {"_id": "103"} }'
+          - '{ "text": "caterpillar eat leaf", "class": "good" }'
+          - '{ "index": {"_id": "104"} }'
+          - '{ "text": "caterpillar build cocoon", "class": "good" }'
+          - '{ "index": {"_id": "105"} }'
+          - '{ "text": "caterpillar fly away", "class": "good" }'
+          - '{ "index": {"_id": "107"} }'
+          - '{ "text": "caterpillar bad", "class": "bad" }'
+          - '{ "index": {"_id": "108"} }'
+          - '{ "text": "caterpillar very bad", "class": "bad" }'
+          - '{ "index": {"_id": "110"} }'
+          - '{ "text": "caterpillar will eat you", "class": "bad" }'
+          - '{ "index": {"_id": "110"} }'
+          - '{ "text": "caterpillar is the enemy", "class": "bad" }'
+          - '{ "index": {"_id": "113"} }'
+          - '{ "text": "good", "class": "good" }'
+          - '{ "index": {"_id": "114"} }'
+          - '{ "text": "good", "class": "good" }'
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          query:
+            match:
+              class: good
+          size: 0
+          aggs:
+            sig_text:
+              significant_text:
+                field: text
+                gnd: {}
+  - match: {hits.total.value: 10}
+  - length: {aggregations.sig_text.buckets: 2}
+  - match: {aggregations.sig_text.buckets.0.key: good}
+  - match: {aggregations.sig_text.buckets.1.key: caterpillar}
+
+  # mutual_information doesn't think `caterpillar` is significant because
+  # it shows up so much in the background set.
+  - do:
+      search:
+        index: goodbad
+        body:
+          query:
+            match:
+              class: good
+          size: 0
+          aggs:
+            sig_text:
+              significant_text:
+                field: text
+                mutual_information: {}
+  - match: {hits.total.value: 10}
+  - length: {aggregations.sig_text.buckets: 1}
+  - match: {aggregations.sig_text.buckets.0.key: good}
+
+---
+background_filter:
+  - do:
+      indices.create:
+        index:  goodbadugly
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+              ugly:
+                type: boolean
+
+  - do:
+      bulk:
+        index: goodbadugly
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "text": "good", "class": "good", "ugly": true }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "text": "good", "class": "good", "ugly": true }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "text": "good bad", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "text": "good bad", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "text": "bad", "class": "bad" }'
+
+  - do:
+      search:
+        index: goodbadugly
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    # only use background frequency information from "ugly"
+                    # documents. All "ugly" documents have the "good" text so
+                    # "good" isn't significant at all!
+                    background_filter:
+                      match:
+                        ugly: true
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+copy_to:
+  # Tests the special configuration that `significant_text` needs in order to
+  # do sensible things with fields built with `copy_to`.
+  - do:
+      indices.create:
+        index:  has_copy_to
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+              a:
+                type: keyword
+                index: false
+                doc_values: false
+                copy_to: a
+              b:
+                type: keyword
+                index: false
+                doc_values: false
+                copy_to: a
+
+  - do:
+      bulk:
+        index: has_copy_to
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "a": "good", "class": "good" }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "b": "good", "class": "good" }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "a": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "b": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "a": "good", "b": "bad", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "b": "good bad", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "a": "bad", "b": "", "class": "bad" }'
+
+  - do:
+      search:
+        index: has_copy_to
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    source_fields: [a, b]
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}

+ 1 - 6
server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java

@@ -224,12 +224,7 @@ public class AggregationProfilerIT extends ESIntegTestCase {
         assertThat(termsAggResult.getDebugInfo(), hasEntry(COLLECTION_STRAT, "remap using many bucket ords"));
         assertThat(termsAggResult.getDebugInfo(), hasEntry(RESULT_STRAT, "terms"));
         assertThat(termsAggResult.getDebugInfo(), hasEntry(HAS_FILTER, false));
-        // TODO we only index single valued docs but the ordinals ends up with multi valued sometimes
-        assertThat(
-            termsAggResult.getDebugInfo().toString(),
-            (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_SINGLE) + (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_MULTI),
-            greaterThan(0)
-        );
+        assertThat(termsAggResult.getDebugInfo().toString(), (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_SINGLE), greaterThan(0));
     }
 
     public void testMultiLevelProfileBreadthFirst() {

+ 36 - 6
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

@@ -30,6 +30,7 @@ import org.elasticsearch.search.aggregations.bucket.terms.SignificanceLookup.Bac
 import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
 import org.elasticsearch.search.aggregations.support.ValuesSource;
+import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -117,6 +118,8 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
     public void collectDebugInfo(BiConsumer<String, Object> add) {
         super.collectDebugInfo(add);
         add.accept("total_buckets", bucketOrds.size());
+        add.accept("collection_strategy", collectorSource.describe());
+        collectorSource.collectDebugInfo(add);
         add.accept("result_strategy", resultStrategy.describe());
     }
 
@@ -126,11 +129,30 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
     }
 
     /**
-     * Abstaction on top of building collectors to fetch values.
+     * Abstraction on top of building collectors to fetch values so {@code terms},
+     * {@code significant_terms}, and {@code significant_text} can share a bunch of
+     * aggregation code.
      */
     public interface CollectorSource extends Releasable {
+        /**
+         * A description of the strategy to include in profile results.
+         */
+        String describe();
+
+        /**
+         * Collect debug information to add to the profiling results. This will
+         * only be called if the aggregation is being profiled.
+         */
+        void collectDebugInfo(BiConsumer<String, Object> add);
+
+        /**
+         * Does this {@link CollectorSource} need queries to calculate the score?
+         */
         boolean needsScores();
 
+        /**
+         * Build the collector.
+         */
         LeafBucketCollector getLeafCollector(
             IncludeExclude.StringFilter includeExclude,
             LeafReaderContext ctx,
@@ -148,15 +170,23 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
      * Fetch values from a {@link ValuesSource}.
      */
     public static class ValuesSourceCollectorSource implements CollectorSource {
-        private final ValuesSource valuesSource;
+        private final ValuesSourceConfig valuesSourceConfig;
 
-        public ValuesSourceCollectorSource(ValuesSource valuesSource) {
-            this.valuesSource = valuesSource;
+        public ValuesSourceCollectorSource(ValuesSourceConfig valuesSourceConfig) {
+            this.valuesSourceConfig = valuesSourceConfig;
+        }
+
+        @Override
+        public String describe() {
+            return "from " + valuesSourceConfig.getDescription();
         }
 
+        @Override
+        public void collectDebugInfo(BiConsumer<String, Object> add) {}
+
         @Override
         public boolean needsScores() {
-            return valuesSource.needsScores();
+            return valuesSourceConfig.getValuesSource().needsScores();
         }
 
         @Override
@@ -167,7 +197,7 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
             LongConsumer addRequestCircuitBreakerBytes,
             CollectConsumer consumer
         ) throws IOException {
-            SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
+            SortedBinaryDocValues values = valuesSourceConfig.getValuesSource().bytesValues(ctx);
             return new LeafBucketCollectorBase(sub, values) {
                 final BytesRefBuilder previous = new BytesRefBuilder();
 

+ 0 - 1
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregationBuilder.java

@@ -42,7 +42,6 @@ public class SignificantTermsAggregationBuilder extends ValuesSourceAggregationB
         new ValuesSourceRegistry.RegistryKey<>(NAME, SignificantTermsAggregatorSupplier.class);
 
     static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");
-    static final ParseField HEURISTIC = new ParseField("significance_heuristic");
 
     static final TermsAggregator.BucketCountThresholds DEFAULT_BUCKET_COUNT_THRESHOLDS = new TermsAggregator.BucketCountThresholds(
             3, 0, 10, -1);

+ 12 - 11
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java

@@ -57,7 +57,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             public Aggregator build(String name,
                                     AggregatorFactories factories,
-                                    ValuesSource valuesSource,
+                                    ValuesSourceConfig valuesSourceConfig,
                                     DocValueFormat format,
                                     TermsAggregator.BucketCountThresholds bucketCountThresholds,
                                     IncludeExclude includeExclude,
@@ -73,7 +73,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                 if (executionHint != null) {
                     execution = ExecutionMode.fromString(executionHint, deprecationLogger);
                 }
-                if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals == false) {
+                if (valuesSourceConfig.hasOrdinals() == false) {
                     execution = ExecutionMode.MAP;
                 }
                 if (execution == null) {
@@ -86,7 +86,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                         + "include/exclude clauses");
                 }
 
-                return execution.create(name, factories, valuesSource, format, bucketCountThresholds, includeExclude, context, parent,
+                return execution.create(name, factories, valuesSourceConfig, format, bucketCountThresholds, includeExclude, context, parent,
                     significanceHeuristic, lookup, cardinality, metadata);
             }
         };
@@ -101,7 +101,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             public Aggregator build(String name,
                                     AggregatorFactories factories,
-                                    ValuesSource valuesSource,
+                                    ValuesSourceConfig valuesSourceConfig,
                                     DocValueFormat format,
                                     TermsAggregator.BucketCountThresholds bucketCountThresholds,
                                     IncludeExclude includeExclude,
@@ -119,7 +119,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                         "values for include/exclude clauses used to filter numeric fields");
                 }
 
-                ValuesSource.Numeric numericValuesSource = (ValuesSource.Numeric) valuesSource;
+                ValuesSource.Numeric numericValuesSource = (ValuesSource.Numeric) valuesSourceConfig.getValuesSource();
                 if (numericValuesSource.isFloatingPoint()) {
                     throw new UnsupportedOperationException("No support for examining floating point numerics");
                 }
@@ -218,7 +218,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
         return aggregatorSupplier.build(
             name,
             factories,
-            config.getValuesSource(),
+            config,
             config.format(),
             bucketCountThresholds,
             includeExclude,
@@ -239,7 +239,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             Aggregator create(String name,
                               AggregatorFactories factories,
-                              ValuesSource valuesSource,
+                              ValuesSourceConfig valuesSourceConfig,
                               DocValueFormat format,
                               TermsAggregator.BucketCountThresholds bucketCountThresholds,
                               IncludeExclude includeExclude,
@@ -254,7 +254,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                 return new MapStringTermsAggregator(
                     name,
                     factories,
-                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSource),
+                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig),
                     a -> a.new SignificantTermsResults(lookup, significanceHeuristic, cardinality),
                     null,
                     format,
@@ -276,7 +276,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             Aggregator create(String name,
                               AggregatorFactories factories,
-                              ValuesSource valuesSource,
+                              ValuesSourceConfig valuesSourceConfig,
                               DocValueFormat format,
                               TermsAggregator.BucketCountThresholds bucketCountThresholds,
                               IncludeExclude includeExclude,
@@ -298,7 +298,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                     remapGlobalOrd = false;
                 }
 
-                ValuesSource.Bytes.WithOrdinals.FieldData ordinalsValuesSource = (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource;
+                ValuesSource.Bytes.WithOrdinals.FieldData ordinalsValuesSource =
+                    (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSourceConfig.getValuesSource();
                 SortedSetDocValues values = TermsAggregatorFactory.globalOrdsValues(context, ordinalsValuesSource); 
                 return new GlobalOrdinalsStringTermsAggregator(
                     name,
@@ -342,7 +343,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
 
         abstract Aggregator create(String name,
                                    AggregatorFactories factories,
-                                   ValuesSource valuesSource,
+                                   ValuesSourceConfig valuesSourceConfig,
                                    DocValueFormat format,
                                    TermsAggregator.BucketCountThresholds bucketCountThresholds,
                                    IncludeExclude includeExclude,

+ 2 - 2
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorSupplier.java

@@ -13,7 +13,7 @@ import org.elasticsearch.search.aggregations.AggregatorFactories;
 import org.elasticsearch.search.aggregations.CardinalityUpperBound;
 import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
-import org.elasticsearch.search.aggregations.support.ValuesSource;
+import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
 
 import java.io.IOException;
 import java.util.Map;
@@ -21,7 +21,7 @@ import java.util.Map;
 interface SignificantTermsAggregatorSupplier {
     Aggregator build(String name,
                      AggregatorFactories factories,
-                     ValuesSource valuesSource,
+                     ValuesSourceConfig valuesSourceConfig,
                      DocValueFormat format,
                      TermsAggregator.BucketCountThresholds bucketCountThresholds,
                      IncludeExclude includeExclude,

+ 211 - 64
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java

@@ -36,14 +36,18 @@ import org.elasticsearch.search.aggregations.NonCollectingAggregator;
 import org.elasticsearch.search.aggregations.bucket.BucketUtils;
 import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude.StringFilter;
 import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectConsumer;
+import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectorSource;
 import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds;
 import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
 import org.elasticsearch.search.lookup.SourceLookup;
+import org.elasticsearch.search.profile.Timer;
 
 import java.io.IOException;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
+import java.util.function.BiConsumer;
 import java.util.function.LongConsumer;
 
 public class SignificantTextAggregatorFactory extends AggregatorFactory {
@@ -66,7 +70,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
                                                 AggregatorFactory parent,
                                                 AggregatorFactories.Builder subFactoriesBuilder,
                                                 String fieldName,
-                                                String [] sourceFieldNames,
+                                                String[] sourceFieldNames,
                                                 boolean filterDuplicateText,
                                                 Map<String, Object> metadata) throws IOException {
         super(name, context, parent, subFactoriesBuilder, metadata);
@@ -76,7 +80,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             if (supportsAgg(fieldType) == false) {
                 throw new IllegalArgumentException("Field [" + fieldType.name() + "] has no analyzer, but SignificantText " +
                     "requires an analyzed field");
-            }            
+            }
             String indexedFieldName = fieldType.name();
             this.sourceFieldNames = sourceFieldNames == null ? new String[] {indexedFieldName} : sourceFieldNames;
         } else {
@@ -89,7 +93,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
         this.bucketCountThresholds = bucketCountThresholds;
         this.significanceHeuristic = significanceHeuristic;
     }
-    
+
     protected Aggregator createUnmapped(Aggregator parent, Map<String, Object> metadata) throws IOException {
         final InternalAggregation aggregation = new UnmappedSignificantTerms(name, bucketCountThresholds.getRequiredSize(),
                 bucketCountThresholds.getMinDocCount(), metadata);
@@ -99,7 +103,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
                 return aggregation;
             }
         };
-    }    
+    }
 
     private static boolean supportsAgg(MappedFieldType ft) {
         return ft.getTextSearchInfo() != TextSearchInfo.NONE
@@ -109,11 +113,11 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
     @Override
     protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound cardinality, Map<String, Object> metadata)
         throws IOException {
-        
+
         if (fieldType == null) {
             return createUnmapped(parent, metadata);
         }
-        
+
         BucketCountThresholds bucketCountThresholds = new BucketCountThresholds(this.bucketCountThresholds);
         if (bucketCountThresholds.getShardSize() == SignificantTextAggregationBuilder.DEFAULT_BUCKET_COUNT_THRESHOLDS.getShardSize()) {
             // The user has not made a shardSize selection.
@@ -133,21 +137,12 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
         IncludeExclude.StringFilter incExcFilter = includeExclude == null ? null:
             includeExclude.convertToStringFilter(DocValueFormat.RAW);
 
-        MapStringTermsAggregator.CollectorSource collectorSource = new SignificantTextCollectorSource(
-            context.lookup().source(),
-            context.bigArrays(),
-            fieldType,
-            context.getIndexAnalyzer(f -> {
-                throw new IllegalArgumentException("No analyzer configured for field " + f);
-            }),
-            sourceFieldNames,
-            filterDuplicateText
-        );
+
         SignificanceLookup lookup = new SignificanceLookup(context, fieldType, DocValueFormat.RAW, backgroundFilter);
         return new MapStringTermsAggregator(
             name,
             factories,
-            collectorSource,
+            createCollectorSource(),
             a -> a.new SignificantTermsResults(lookup, significanceHeuristic, cardinality),
             null,
             DocValueFormat.RAW,
@@ -162,12 +157,58 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
         );
     }
 
+    /**
+     * Create the {@link CollectorSource}, gathering some timing information
+     * if we're profiling.
+     * <p>
+     * When profiling aggregations we time the {@link LeafBucketCollector#collect(int, long)}
+     * method out of the box, but our implementation of that method does three things that are
+     * useful to time individually:
+     * <ul>
+     * <li>Fetch field values from {@code _source}
+     * <li>Analyze the field
+     * <li>Do all the normal {@code terms} agg stuff with its terms
+     * </ul>
+     * <p>
+     * The most convenient way to measure all of these is to time the fetch and all
+     * the normal {@code terms} agg stuff. You can then subtract those timings from
+     * the overall collect time to get the analyze time. You can also get the total
+     * number of terms that we analyzed by looking at the invocation count on the
+     * {@code terms} agg stuff.
+     * <p>
+     * While we're at it we count the number of values we fetch from source.
+     */
+    private CollectorSource createCollectorSource() {
+        Analyzer analyzer = context.getIndexAnalyzer(f -> {
+            throw new IllegalArgumentException("No analyzer configured for field " + f);
+        });
+        if (context.profiling()) {
+            return new ProfilingSignificantTextCollectorSource(
+                context.lookup().source(),
+                context.bigArrays(),
+                fieldType,
+                analyzer,
+                sourceFieldNames,
+                filterDuplicateText
+            );
+        }
+        return new SignificantTextCollectorSource(
+            context.lookup().source(),
+            context.bigArrays(),
+            fieldType,
+            analyzer,
+            sourceFieldNames,
+            filterDuplicateText
+        );
+    }
+
     private static class SignificantTextCollectorSource implements MapStringTermsAggregator.CollectorSource {
         private final SourceLookup sourceLookup;
         private final BigArrays bigArrays;
         private final MappedFieldType fieldType;
         private final Analyzer analyzer;
         private final String[] sourceFieldNames;
+        private final BytesRefBuilder scratch = new BytesRefBuilder();
         private ObjectArray<DuplicateByteSequenceSpotter> dupSequenceSpotters;
 
         SignificantTextCollectorSource(
@@ -186,6 +227,15 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             dupSequenceSpotters = filterDuplicateText ? bigArrays.newObjectArray(1) : null;
         }
 
+        @Override
+        public String describe() {
+            return "analyze " + fieldType.name() + " from _source";
+        }
+
+        @Override
+        public void collectDebugInfo(BiConsumer<String, Object> add) {
+        }
+
         @Override
         public boolean needsScores() {
             return false;
@@ -200,8 +250,6 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             CollectConsumer consumer
         ) throws IOException {
             return new LeafBucketCollectorBase(sub, null) {
-                private final BytesRefBuilder scratch = new BytesRefBuilder();
-
                 @Override
                 public void collect(int doc, long owningBucketOrd) throws IOException {
                     if (dupSequenceSpotters == null) {
@@ -224,7 +272,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
 
                     try {
                         for (String sourceField : sourceFieldNames) {
-                            Iterator<String> itr = sourceLookup.extractRawValues(sourceField).stream()
+                            Iterator<String> itr = extractRawValues(sourceField).stream()
                                 .map(obj -> {
                                     if (obj == null) {
                                         return null;
@@ -236,63 +284,87 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
                                 })
                                 .iterator();
                             while (itr.hasNext()) {
-                                TokenStream ts = analyzer.tokenStream(fieldType.name(), itr.next());
-                                processTokenStream(doc, owningBucketOrd, ts, inDocTerms, spotter);
+                                String text = itr.next();
+                                TokenStream ts = analyzer.tokenStream(fieldType.name(), text);
+                                processTokenStream(
+                                    includeExclude,
+                                    doc,
+                                    owningBucketOrd,
+                                    text,
+                                    ts,
+                                    inDocTerms,
+                                    spotter,
+                                    addRequestCircuitBreakerBytes,
+                                    sub,
+                                    consumer
+                                );
                             }
                         }
                     } finally {
                         Releasables.close(inDocTerms);
                     }
                 }
+            };
+        }
 
-                private void processTokenStream(
-                    int doc,
-                    long owningBucketOrd,
-                    TokenStream ts,
-                    BytesRefHash inDocTerms,
-                    DuplicateByteSequenceSpotter spotter
-                ) throws IOException {
-                    long lastTrieSize = 0;
+        protected void processTokenStream(
+            StringFilter includeExclude,
+            int doc,
+            long owningBucketOrd,
+            String text,
+            TokenStream ts,
+            BytesRefHash inDocTerms,
+            DuplicateByteSequenceSpotter spotter,
+            LongConsumer addRequestCircuitBreakerBytes,
+            LeafBucketCollector sub,
+            CollectConsumer consumer
+        ) throws IOException {
+            long lastTrieSize = 0;
+            if (spotter != null) {
+                lastTrieSize = spotter.getEstimatedSizeInBytes();
+                ts = new DeDuplicatingTokenFilter(ts, spotter);
+            }
+            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+            ts.reset();
+            try {
+                while (ts.incrementToken()) {
                     if (spotter != null) {
-                        lastTrieSize = spotter.getEstimatedSizeInBytes();
-                        ts = new DeDuplicatingTokenFilter(ts, spotter);
+                        long newTrieSize = spotter.getEstimatedSizeInBytes();
+                        long growth = newTrieSize - lastTrieSize;
+                        // Only update the circuit breaker after the trie has grown by at least
+                        // the reporting interval, to keep accounting overhead low
+                        if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
+                            addRequestCircuitBreakerBytes.accept(growth);
+                            lastTrieSize = newTrieSize;
+                        }
                     }
-                    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
-                    ts.reset();
-                    try {
-                        while (ts.incrementToken()) {
-                            if (spotter != null) {
-                                long newTrieSize = spotter.getEstimatedSizeInBytes();
-                                long growth = newTrieSize - lastTrieSize;
-                                // Only update the circuitbreaker after
-                                if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
-                                    addRequestCircuitBreakerBytes.accept(growth);
-                                    lastTrieSize = newTrieSize;
-                                }
-                            }
 
-                            scratch.clear();
-                            scratch.copyChars(termAtt);
-                            BytesRef bytes = scratch.get();
-                            if (includeExclude != null && false == includeExclude.accept(bytes)) {
-                                continue;
-                            }
-                            if (inDocTerms.add(bytes) < 0) {
-                                continue;
-                            }
-                            consumer.accept(sub, doc, owningBucketOrd, bytes);
-                        }
-                    } finally {
-                        ts.close();
+                    scratch.clear();
+                    scratch.copyChars(termAtt);
+                    BytesRef bytes = scratch.get();
+                    if (includeExclude != null && false == includeExclude.accept(bytes)) {
+                        continue;
                     }
-                    if (spotter != null) {
-                        long growth = spotter.getEstimatedSizeInBytes() - lastTrieSize;
-                        if (growth > 0) {
-                            addRequestCircuitBreakerBytes.accept(growth);
-                        }
+                    if (inDocTerms.add(bytes) < 0) {
+                        continue;
                     }
+                    consumer.accept(sub, doc, owningBucketOrd, bytes);
                 }
-            };
+            } finally {
+                ts.close();
+            }
+            if (spotter != null) {
+                long growth = spotter.getEstimatedSizeInBytes() - lastTrieSize;
+                if (growth > 0) {
+                    addRequestCircuitBreakerBytes.accept(growth);
+                }
+            }
+        }
+
+        /**
+         * Extract values from {@code _source}.
+         */
+        protected List<Object> extractRawValues(String field) {
+            return sourceLookup.extractRawValues(field);
         }
 
         @Override
@@ -300,4 +372,79 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             Releasables.close(dupSequenceSpotters);
         }
     }
+
+    private static class ProfilingSignificantTextCollectorSource extends SignificantTextCollectorSource {
+        private final Timer extract = new Timer();
+        private final Timer collectAnalyzed = new Timer();
+        private long valuesFetched;
+        private long charsFetched;
+
+        private ProfilingSignificantTextCollectorSource(
+            SourceLookup sourceLookup,
+            BigArrays bigArrays,
+            MappedFieldType fieldType,
+            Analyzer analyzer,
+            String[] sourceFieldNames,
+            boolean filterDuplicateText
+        ) {
+            super(sourceLookup, bigArrays, fieldType, analyzer, sourceFieldNames, filterDuplicateText);
+        }
+
+        @Override
+        protected void processTokenStream(
+            StringFilter includeExclude,
+            int doc,
+            long owningBucketOrd,
+            String text,
+            TokenStream ts,
+            BytesRefHash inDocTerms,
+            DuplicateByteSequenceSpotter spotter,
+            LongConsumer addRequestCircuitBreakerBytes,
+            LeafBucketCollector sub,
+            CollectConsumer consumer
+        ) throws IOException {
+            valuesFetched++;
+            charsFetched += text.length();
+            super.processTokenStream(
+                includeExclude,
+                doc,
+                owningBucketOrd,
+                text,
+                ts,
+                inDocTerms,
+                spotter,
+                addRequestCircuitBreakerBytes,
+                sub,
+                (subCollector, d, o, bytes) -> {
+                    collectAnalyzed.start();
+                    try {
+                        consumer.accept(subCollector, d, o, bytes);
+                    } finally {
+                        collectAnalyzed.stop();
+                    }
+                }
+            );
+        }
+
+        @Override
+        protected List<Object> extractRawValues(String field) {
+            extract.start();
+            try {
+                return super.extractRawValues(field);
+            } finally {
+                extract.stop();
+            }
+        }
+
+        @Override
+        public void collectDebugInfo(BiConsumer<String, Object> add) {
+            super.collectDebugInfo(add);
+            add.accept("extract_ns", extract.getApproximateTiming());
+            add.accept("extract_count", extract.getCount());
+            add.accept("collect_analyzed_ns", collectAnalyzed.getApproximateTiming());
+            add.accept("collect_analyzed_count", collectAnalyzed.getCount());
+            add.accept("values_fetched", valuesFetched);
+            add.accept("chars_fetched", charsFetched);
+        }
+    }
 }

+ 1 - 1
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java

@@ -325,7 +325,7 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
                 return new MapStringTermsAggregator(
                     name,
                     factories,
-                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig.getValuesSource()),
+                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig),
                     a -> a.new StandardTermsResults(valuesSourceConfig.getValuesSource()),
                     order,
                     valuesSourceConfig.format(),