Browse Source

More debugging info for significant_text (#72727)

Adds some extra debugging information to make it clear that you are
running `significant_text`. Also adds some timing information
around the `_source` fetch and the `terms` accumulation. This lets you
calculate a third useful timing number: the analysis time. It is
`collect_ns - fetch_ns - accumulation_ns`.

This also adds a half dozen extra REST tests to get a *fairly*
comprehensive set of the operations this supports. It doesn't cover all
of the significance heuristic parsing, but it's certainly much better
than what we had.
Nik Everett 4 years ago
parent
commit
a43b166d11

+ 2 - 2
docs/reference/aggregations/bucket/significantterms-aggregation.asciidoc

@@ -374,7 +374,7 @@ Chi square behaves like mutual information and can be configured with the same p
 
 
 ===== Google normalized distance
-Google normalized distance as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007 (https://arxiv.org/pdf/cs/0412098v3.pdf) can be used as significance score by adding the parameter
+Google normalized distance as described in https://arxiv.org/pdf/cs/0412098v3.pdf["The Google Similarity Distance", Cilibrasi and Vitanyi, 2007] can be used as significance score by adding the parameter
 
 [source,js]
 --------------------------------------------------
@@ -408,7 +408,7 @@ Multiple observations are typically required to reinforce a view so it is recomm
 
 Roughly, `mutual_information` prefers high frequent terms even if they occur also frequently in the background. For example, in an analysis of natural language text this might lead to selection of stop words. `mutual_information` is unlikely to select very rare terms like misspellings. `gnd` prefers terms with a high co-occurrence and avoids selection of stopwords. It might be better suited for synonym detection. However, `gnd` has a tendency to select very rare terms that are, for example, a result of misspelling. `chi_square` and `jlh` are somewhat in-between.
 
-It is hard to say which one of the different heuristics will be the best choice as it depends on what the significant terms are used for (see for example [Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997](http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf) for a study on using significant terms for feature selection for text classification).
+It is hard to say which one of the different heuristics will be the best choice as it depends on what the significant terms are used for (see for example http://courses.ischool.berkeley.edu/i256/f06/papers/yang97comparative.pdf[Yang and Pedersen, "A Comparative Study on Feature Selection in Text Categorization", 1997] for a study on using significant terms for feature selection for text classification).
 
 If none of the above measures suits your usecase than another option is to implement a custom significance measure:
 

+ 0 - 151
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml

@@ -1,151 +0,0 @@
----
-"Default index":
-
-  - do:
-      indices.create:
-          index:  goodbad
-          body:
-            settings:
-                number_of_shards: "1"
-            mappings:
-                properties:
-                    text:
-                        type: text
-                        fielddata: false
-                    class:
-                        type: keyword
-
-  - do:
-      index:
-          index:  goodbad
-          id:     1
-          body:   { text: "good", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     2
-          body:   { text: "good", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     3
-          body:   { text: "bad", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     4
-          body:   { text: "bad", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     5
-          body:   { text: "good bad", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     6
-          body:   { text: "good bad", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     7
-          body:   { text: "bad", class: "bad" }
-
-
-
-  - do:
-      indices.refresh:
-        index: [goodbad]
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-
-  - match: {hits.total: 7}
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-        body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text"}}}}}}
-
-  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"}
-  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"}
-
----
-"Dedup noise":
-
-  - do:
-      indices.create:
-          index:  goodbad
-          body:
-            settings:
-                number_of_shards: "1"
-            mappings:
-                properties:
-                    text:
-                        type: text
-                        fielddata: false
-                    class:
-                        type: keyword
-
-  - do:
-      index:
-          index:  goodbad
-          id:     1
-          body:   { text: "good noisewords1 g1 g2 g3 g4 g5 g6", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     2
-          body:   { text: "good  noisewords2 g1 g2 g3 g4 g5 g6", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     3
-          body:   { text: "bad noisewords3 b1 b2 b3 b4 b5 b6", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     4
-          body:   { text: "bad noisewords4 b1 b2 b3 b4 b5 b6", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     5
-          body:   { text: "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", class: "good" }
-  - do:
-      index:
-          index:  goodbad
-          id:     6
-          body:   { text: "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", class: "bad" }
-  - do:
-      index:
-          index:  goodbad
-          id:     7
-          body:   { text: "bad noisewords7 b1 b2 b3 b4 b5 b6", class: "bad" }
-
-
-
-  - do:
-      indices.refresh:
-        index: [goodbad]
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-
-  - match: {hits.total: 7}
-
-  - do:
-      search:
-        rest_total_hits_as_int: true
-        index: goodbad
-        body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text", "filter_duplicate_text": true}}}}}}
-
-  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"}
-  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
-  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"}
-  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }

+ 525 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.aggregation/90_significant_text.yml

@@ -0,0 +1,525 @@
+setup:
+  - do:
+      indices.create:
+        index:  goodbad
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+
+  - do:
+      bulk:
+        index: goodbad
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "text": "good", "class": "good" }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "text": "good", "class": "good" }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "text": "good bad", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "text": "good bad", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "text": "bad", "class": "bad" }'
+
+---
+simple:
+  - do:
+      search:
+        index: goodbad
+        body:
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+
+---
+"Dedup noise":
+  - do:
+      indices.create:
+        index:  noisy
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+
+  - do:
+      bulk:
+        index: noisy
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "text": "good noisewords1 g1 g2 g3 g4 g5 g6", "class": "good" }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "text": "good  noisewords2 g1 g2 g3 g4 g5 g6", "class": "good" }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "text": "bad noisewords3 b1 b2 b3 b4 b5 b6", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "text": "bad noisewords4 b1 b2 b3 b4 b5 b6", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "text": "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "text": "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "text": "bad noisewords7 b1 b2 b3 b4 b5 b6", "class": "bad" }'
+
+  - do:
+      search:
+        rest_total_hits_as_int: true
+        index: noisy
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    filter_duplicate_text: true
+  - match: {hits.total: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+
+---
+profile:
+  - skip:
+      version: " - 7.99.99"
+      reason: extra profiling added in 8.0.0 to be backported to 7.14.0
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          profile: true
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+  - match: { profile.shards.0.aggregations.0.description: class }
+  - match: { profile.shards.0.aggregations.0.children.0.type: MapStringTermsAggregator }
+  - match: { profile.shards.0.aggregations.0.children.0.description: sig_text }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.collection_strategy: analyze text from _source }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.result_strategy: significant_terms }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.total_buckets: 4 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.values_fetched: 7 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.chars_fetched: 33 }
+  - gt: { profile.shards.0.aggregations.0.children.0.debug.extract_ns: 0 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.extract_count: 7 }
+  - gt: { profile.shards.0.aggregations.0.children.0.debug.collect_analyzed_ns: 0 }
+  - match: { profile.shards.0.aggregations.0.children.0.debug.collect_analyzed_count: 9 }
+
+---
+include:
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    include: bad
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+exclude:
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    exclude: good
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+min_doc_count:
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    min_doc_count: 4
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+size:
+  - do:
+      bulk:
+        index: goodbad
+        refresh: true
+        body:
+          - '{ "index": {"_id": "101"} }'
+          - '{ "text": "caterpillar eat snacks", "class": "good" }'
+          - '{ "index": {"_id": "102"} }'
+          - '{ "text": "caterpillar sick", "class": "good" }'
+          - '{ "index": {"_id": "103"} }'
+          - '{ "text": "caterpillar eat leaf", "class": "good" }'
+          - '{ "index": {"_id": "104"} }'
+          - '{ "text": "caterpillar build cocoon", "class": "good" }'
+          - '{ "index": {"_id": "105"} }'
+          - '{ "text": "caterpillar fly away", "class": "good" }'
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+  - match: {hits.total.value: 12}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: good}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 2 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: caterpillar}
+  - match: {aggregations.class.buckets.0.sig_text.buckets.1.key: good}
+  - match: {aggregations.class.buckets.1.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: bad}
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    size: 1
+  - match: {hits.total.value: 12}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: good}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: caterpillar}
+  - match: {aggregations.class.buckets.1.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: bad}
+
+---
+shard_size:
+  # We can't perform a great test for shard_size without lots of control over
+  # routing here and it isn't worth being that brittle. So we'll just test that
+  # we parse it.
+  - do:
+      search:
+        index: goodbad
+        body:
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    size: 1
+                    shard_size: 1
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}
+
+---
+significance_heuristics:
+  - do:
+      bulk:
+        index: goodbad
+        refresh: true
+        body:
+          - '{ "index": {"_id": "101"} }'
+          - '{ "text": "caterpillar eat snacks", "class": "good" }'
+          - '{ "index": {"_id": "102"} }'
+          - '{ "text": "caterpillar sick", "class": "good" }'
+          - '{ "index": {"_id": "103"} }'
+          - '{ "text": "caterpillar eat leaf", "class": "good" }'
+          - '{ "index": {"_id": "104"} }'
+          - '{ "text": "caterpillar build cocoon", "class": "good" }'
+          - '{ "index": {"_id": "105"} }'
+          - '{ "text": "caterpillar fly away", "class": "good" }'
+          - '{ "index": {"_id": "107"} }'
+          - '{ "text": "caterpillar bad", "class": "bad" }'
+          - '{ "index": {"_id": "108"} }'
+          - '{ "text": "caterpillar very bad", "class": "bad" }'
+          - '{ "index": {"_id": "110"} }'
+          - '{ "text": "caterpillar will eat you", "class": "bad" }'
+          - '{ "index": {"_id": "110"} }'
+          - '{ "text": "caterpillar is the enemy", "class": "bad" }'
+          - '{ "index": {"_id": "113"} }'
+          - '{ "text": "good", "class": "good" }'
+          - '{ "index": {"_id": "114"} }'
+          - '{ "text": "good", "class": "good" }'
+
+  - do:
+      search:
+        index: goodbad
+        body:
+          query:
+            match:
+              class: good
+          size: 0
+          aggs:
+            sig_text:
+              significant_text:
+                field: text
+                gnd: {}
+  - match: {hits.total.value: 10}
+  - length: {aggregations.sig_text.buckets: 2}
+  - match: {aggregations.sig_text.buckets.0.key: good}
+  - match: {aggregations.sig_text.buckets.1.key: caterpillar}
+
+  # mutual_information doesn't think `caterpillar` is significant because
+  # it shows up so much in the background set.
+  - do:
+      search:
+        index: goodbad
+        body:
+          query:
+            match:
+              class: good
+          size: 0
+          aggs:
+            sig_text:
+              significant_text:
+                field: text
+                mutual_information: {}
+  - match: {hits.total.value: 10}
+  - length: {aggregations.sig_text.buckets: 1}
+  - match: {aggregations.sig_text.buckets.0.key: good}
+
+---
+background_filter:
+  - do:
+      indices.create:
+        index:  goodbadugly
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+              ugly:
+                type: boolean
+
+  - do:
+      bulk:
+        index: goodbadugly
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "text": "good", "class": "good", "ugly": true }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "text": "good", "class": "good", "ugly": true }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "text": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "text": "good bad", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "text": "good bad", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "text": "bad", "class": "bad" }'
+
+  - do:
+      search:
+        index: goodbadugly
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    # only use background frequency information from "ugly"
+                    # documents. All "ugly" documents have the "good" text so
+                    # "good" isn't significant at all!
+                    background_filter:
+                      match:
+                        ugly: true
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 0 }
+
+---
+copy_to:
+  # Tests the special configuration that `significant_text` needs in order to
+  # do sensible things with fields built with `copy_to`.
+  - do:
+      indices.create:
+        index:  has_copy_to
+        body:
+          settings:
+            number_of_shards: "1"
+          mappings:
+            properties:
+              text:
+                type: text
+                fielddata: false
+              class:
+                type: keyword
+              a:
+                type: keyword
+                index: false
+                doc_values: false
+                copy_to: a
+              b:
+                type: keyword
+                index: false
+                doc_values: false
+                copy_to: a
+
+  - do:
+      bulk:
+        index: has_copy_to
+        refresh: true
+        body:
+          - '{ "index": {"_id": "1"} }'
+          - '{ "a": "good", "class": "good" }'
+          - '{ "index": {"_id": "2"} }'
+          - '{ "b": "good", "class": "good" }'
+          - '{ "index": {"_id": "3"} }'
+          - '{ "a": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "4"} }'
+          - '{ "b": "bad", "class": "bad" }'
+          - '{ "index": {"_id": "5"} }'
+          - '{ "a": "good", "b": "bad", "class": "good" }'
+          - '{ "index": {"_id": "6"} }'
+          - '{ "b": "good bad", "class": "bad" }'
+          - '{ "index": {"_id": "7"} }'
+          - '{ "a": "bad", "b": "", "class": "bad" }'
+
+  - do:
+      search:
+        index: has_copy_to
+        body:
+          size: 0
+          aggs:
+            class:
+              terms:
+                field: class
+              aggs:
+                sig_text:
+                  significant_text:
+                    field: text
+                    source_fields: [a, b]
+  - match: {hits.total.value: 7}
+  - length: {aggregations.class.buckets: 2}
+  - match: {aggregations.class.buckets.0.key: bad}
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: bad}
+  - match: {aggregations.class.buckets.1.key: good}
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: good}

+ 1 - 6
server/src/internalClusterTest/java/org/elasticsearch/search/profile/aggregation/AggregationProfilerIT.java

@@ -224,12 +224,7 @@ public class AggregationProfilerIT extends ESIntegTestCase {
         assertThat(termsAggResult.getDebugInfo(), hasEntry(COLLECTION_STRAT, "remap using many bucket ords"));
         assertThat(termsAggResult.getDebugInfo(), hasEntry(RESULT_STRAT, "terms"));
         assertThat(termsAggResult.getDebugInfo(), hasEntry(HAS_FILTER, false));
-        // TODO we only index single valued docs but the ordinals ends up with multi valued sometimes
-        assertThat(
-            termsAggResult.getDebugInfo().toString(),
-            (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_SINGLE) + (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_MULTI),
-            greaterThan(0)
-        );
+        assertThat(termsAggResult.getDebugInfo().toString(), (int) termsAggResult.getDebugInfo().get(SEGMENTS_WITH_SINGLE), greaterThan(0));
     }
 
     public void testMultiLevelProfileBreadthFirst() {

+ 36 - 6
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java

@@ -30,6 +30,7 @@ import org.elasticsearch.search.aggregations.bucket.terms.SignificanceLookup.Bac
 import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
 import org.elasticsearch.search.aggregations.support.ValuesSource;
+import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -117,6 +118,8 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
     public void collectDebugInfo(BiConsumer<String, Object> add) {
         super.collectDebugInfo(add);
         add.accept("total_buckets", bucketOrds.size());
+        add.accept("collection_strategy", collectorSource.describe());
+        collectorSource.collectDebugInfo(add);
         add.accept("result_strategy", resultStrategy.describe());
     }
 
@@ -126,11 +129,30 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
     }
 
     /**
-     * Abstaction on top of building collectors to fetch values.
+     * Abstraction on top of building collectors to fetch values so {@code terms},
+     * {@code significant_terms}, and {@code significant_text} can share a bunch of
+     * aggregation code.
      */
     public interface CollectorSource extends Releasable {
+        /**
+         * A description of the strategy to include in profile results.
+         */
+        String describe();
+
+        /**
+         * Collect debug information to add to the profiling results. This will
+         * only be called if the aggregation is being profiled.
+         */
+        void collectDebugInfo(BiConsumer<String, Object> add);
+
+        /**
+         * Does this {@link CollectorSource} need queries to calculate the score?
+         */
         boolean needsScores();
 
+        /**
+         * Build the collector.
+         */
         LeafBucketCollector getLeafCollector(
             IncludeExclude.StringFilter includeExclude,
             LeafReaderContext ctx,
@@ -148,15 +170,23 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
      * Fetch values from a {@link ValuesSource}.
      */
     public static class ValuesSourceCollectorSource implements CollectorSource {
-        private final ValuesSource valuesSource;
+        private final ValuesSourceConfig valuesSourceConfig;
 
-        public ValuesSourceCollectorSource(ValuesSource valuesSource) {
-            this.valuesSource = valuesSource;
+        public ValuesSourceCollectorSource(ValuesSourceConfig valuesSourceConfig) {
+            this.valuesSourceConfig = valuesSourceConfig;
+        }
+
+        @Override
+        public String describe() {
+            return "from " + valuesSourceConfig.getDescription();
         }
 
+        @Override
+        public void collectDebugInfo(BiConsumer<String, Object> add) {}
+
         @Override
         public boolean needsScores() {
-            return valuesSource.needsScores();
+            return valuesSourceConfig.getValuesSource().needsScores();
         }
 
         @Override
@@ -167,7 +197,7 @@ public class MapStringTermsAggregator extends AbstractStringTermsAggregator {
             LongConsumer addRequestCircuitBreakerBytes,
             CollectConsumer consumer
         ) throws IOException {
-            SortedBinaryDocValues values = valuesSource.bytesValues(ctx);
+            SortedBinaryDocValues values = valuesSourceConfig.getValuesSource().bytesValues(ctx);
             return new LeafBucketCollectorBase(sub, values) {
                 final BytesRefBuilder previous = new BytesRefBuilder();
 

+ 0 - 1
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregationBuilder.java

@@ -42,7 +42,6 @@ public class SignificantTermsAggregationBuilder extends ValuesSourceAggregationB
         new ValuesSourceRegistry.RegistryKey<>(NAME, SignificantTermsAggregatorSupplier.class);
 
     static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");
-    static final ParseField HEURISTIC = new ParseField("significance_heuristic");
 
     static final TermsAggregator.BucketCountThresholds DEFAULT_BUCKET_COUNT_THRESHOLDS = new TermsAggregator.BucketCountThresholds(
             3, 0, 10, -1);

+ 12 - 11
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorFactory.java

@@ -57,7 +57,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             public Aggregator build(String name,
                                     AggregatorFactories factories,
-                                    ValuesSource valuesSource,
+                                    ValuesSourceConfig valuesSourceConfig,
                                     DocValueFormat format,
                                     TermsAggregator.BucketCountThresholds bucketCountThresholds,
                                     IncludeExclude includeExclude,
@@ -73,7 +73,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                 if (executionHint != null) {
                     execution = ExecutionMode.fromString(executionHint, deprecationLogger);
                 }
-                if (valuesSource instanceof ValuesSource.Bytes.WithOrdinals == false) {
+                if (valuesSourceConfig.hasOrdinals() == false) {
                     execution = ExecutionMode.MAP;
                 }
                 if (execution == null) {
@@ -86,7 +86,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                         + "include/exclude clauses");
                 }
 
-                return execution.create(name, factories, valuesSource, format, bucketCountThresholds, includeExclude, context, parent,
+                return execution.create(name, factories, valuesSourceConfig, format, bucketCountThresholds, includeExclude, context, parent,
                     significanceHeuristic, lookup, cardinality, metadata);
             }
         };
@@ -101,7 +101,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             public Aggregator build(String name,
                                     AggregatorFactories factories,
-                                    ValuesSource valuesSource,
+                                    ValuesSourceConfig valuesSourceConfig,
                                     DocValueFormat format,
                                     TermsAggregator.BucketCountThresholds bucketCountThresholds,
                                     IncludeExclude includeExclude,
@@ -119,7 +119,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                         "values for include/exclude clauses used to filter numeric fields");
                 }
 
-                ValuesSource.Numeric numericValuesSource = (ValuesSource.Numeric) valuesSource;
+                ValuesSource.Numeric numericValuesSource = (ValuesSource.Numeric) valuesSourceConfig.getValuesSource();
                 if (numericValuesSource.isFloatingPoint()) {
                     throw new UnsupportedOperationException("No support for examining floating point numerics");
                 }
@@ -218,7 +218,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
         return aggregatorSupplier.build(
             name,
             factories,
-            config.getValuesSource(),
+            config,
             config.format(),
             bucketCountThresholds,
             includeExclude,
@@ -239,7 +239,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             Aggregator create(String name,
                               AggregatorFactories factories,
-                              ValuesSource valuesSource,
+                              ValuesSourceConfig valuesSourceConfig,
                               DocValueFormat format,
                               TermsAggregator.BucketCountThresholds bucketCountThresholds,
                               IncludeExclude includeExclude,
@@ -254,7 +254,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                 return new MapStringTermsAggregator(
                     name,
                     factories,
-                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSource),
+                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig),
                     a -> a.new SignificantTermsResults(lookup, significanceHeuristic, cardinality),
                     null,
                     format,
@@ -276,7 +276,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
             @Override
             Aggregator create(String name,
                               AggregatorFactories factories,
-                              ValuesSource valuesSource,
+                              ValuesSourceConfig valuesSourceConfig,
                               DocValueFormat format,
                               TermsAggregator.BucketCountThresholds bucketCountThresholds,
                               IncludeExclude includeExclude,
@@ -298,7 +298,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                     remapGlobalOrd = false;
                 }
 
-                ValuesSource.Bytes.WithOrdinals.FieldData ordinalsValuesSource = (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource;
+                ValuesSource.Bytes.WithOrdinals.FieldData ordinalsValuesSource =
+                    (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSourceConfig.getValuesSource();
                 SortedSetDocValues values = TermsAggregatorFactory.globalOrdsValues(context, ordinalsValuesSource); 
                 return new GlobalOrdinalsStringTermsAggregator(
                     name,
@@ -342,7 +343,7 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
 
         abstract Aggregator create(String name,
                                    AggregatorFactories factories,
-                                   ValuesSource valuesSource,
+                                   ValuesSourceConfig valuesSourceConfig,
                                    DocValueFormat format,
                                    TermsAggregator.BucketCountThresholds bucketCountThresholds,
                                    IncludeExclude includeExclude,

+ 2 - 2
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTermsAggregatorSupplier.java

@@ -13,7 +13,7 @@ import org.elasticsearch.search.aggregations.AggregatorFactories;
 import org.elasticsearch.search.aggregations.CardinalityUpperBound;
 import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
-import org.elasticsearch.search.aggregations.support.ValuesSource;
+import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
 
 import java.io.IOException;
 import java.util.Map;
@@ -21,7 +21,7 @@ import java.util.Map;
 interface SignificantTermsAggregatorSupplier {
     Aggregator build(String name,
                      AggregatorFactories factories,
-                     ValuesSource valuesSource,
+                     ValuesSourceConfig valuesSourceConfig,
                      DocValueFormat format,
                      TermsAggregator.BucketCountThresholds bucketCountThresholds,
                      IncludeExclude includeExclude,

+ 211 - 64
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/SignificantTextAggregatorFactory.java

@@ -36,14 +36,18 @@ import org.elasticsearch.search.aggregations.NonCollectingAggregator;
 import org.elasticsearch.search.aggregations.bucket.BucketUtils;
 import org.elasticsearch.search.aggregations.bucket.terms.IncludeExclude.StringFilter;
 import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectConsumer;
+import org.elasticsearch.search.aggregations.bucket.terms.MapStringTermsAggregator.CollectorSource;
 import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregator.BucketCountThresholds;
 import org.elasticsearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
 import org.elasticsearch.search.aggregations.support.AggregationContext;
 import org.elasticsearch.search.lookup.SourceLookup;
+import org.elasticsearch.search.profile.Timer;
 
 import java.io.IOException;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
+import java.util.function.BiConsumer;
 import java.util.function.LongConsumer;
 
 public class SignificantTextAggregatorFactory extends AggregatorFactory {
@@ -66,7 +70,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
                                                 AggregatorFactory parent,
                                                 AggregatorFactories.Builder subFactoriesBuilder,
                                                 String fieldName,
-                                                String [] sourceFieldNames,
+                                                String[] sourceFieldNames,
                                                 boolean filterDuplicateText,
                                                 Map<String, Object> metadata) throws IOException {
         super(name, context, parent, subFactoriesBuilder, metadata);
@@ -76,7 +80,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             if (supportsAgg(fieldType) == false) {
                 throw new IllegalArgumentException("Field [" + fieldType.name() + "] has no analyzer, but SignificantText " +
                     "requires an analyzed field");
-            }            
+            }
             String indexedFieldName = fieldType.name();
             this.sourceFieldNames = sourceFieldNames == null ? new String[] {indexedFieldName} : sourceFieldNames;
         } else {
@@ -89,7 +93,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
         this.bucketCountThresholds = bucketCountThresholds;
         this.significanceHeuristic = significanceHeuristic;
     }
-    
+
     protected Aggregator createUnmapped(Aggregator parent, Map<String, Object> metadata) throws IOException {
         final InternalAggregation aggregation = new UnmappedSignificantTerms(name, bucketCountThresholds.getRequiredSize(),
                 bucketCountThresholds.getMinDocCount(), metadata);
@@ -99,7 +103,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
                 return aggregation;
             }
         };
-    }    
+    }
 
     private static boolean supportsAgg(MappedFieldType ft) {
         return ft.getTextSearchInfo() != TextSearchInfo.NONE
@@ -109,11 +113,11 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
     @Override
     protected Aggregator createInternal(Aggregator parent, CardinalityUpperBound cardinality, Map<String, Object> metadata)
         throws IOException {
-        
+
         if (fieldType == null) {
             return createUnmapped(parent, metadata);
         }
-        
+
         BucketCountThresholds bucketCountThresholds = new BucketCountThresholds(this.bucketCountThresholds);
         if (bucketCountThresholds.getShardSize() == SignificantTextAggregationBuilder.DEFAULT_BUCKET_COUNT_THRESHOLDS.getShardSize()) {
             // The user has not made a shardSize selection.
@@ -133,21 +137,12 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
         IncludeExclude.StringFilter incExcFilter = includeExclude == null ? null:
             includeExclude.convertToStringFilter(DocValueFormat.RAW);
 
-        MapStringTermsAggregator.CollectorSource collectorSource = new SignificantTextCollectorSource(
-            context.lookup().source(),
-            context.bigArrays(),
-            fieldType,
-            context.getIndexAnalyzer(f -> {
-                throw new IllegalArgumentException("No analyzer configured for field " + f);
-            }),
-            sourceFieldNames,
-            filterDuplicateText
-        );
+
         SignificanceLookup lookup = new SignificanceLookup(context, fieldType, DocValueFormat.RAW, backgroundFilter);
         return new MapStringTermsAggregator(
             name,
             factories,
-            collectorSource,
+            createCollectorSource(),
             a -> a.new SignificantTermsResults(lookup, significanceHeuristic, cardinality),
             null,
             DocValueFormat.RAW,
@@ -162,12 +157,58 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
         );
     }
 
+    /**
+     * Create the {@link CollectorSource}, gathering some timing information
+     * if we're profiling.
+     * <p>
+     * When profiling aggregations we time the {@link LeafBucketCollector#collect(int, long)}
+     * method out of the box, but our implementation of that method does three things that are
+     * useful to time individually:
+     * <ul>
+     * <li>Fetch field values from {@code _source}
+     * <li>Analyze the field
+     * <li>Do all the normal {@code terms} agg stuff with its terms
+     * </ul>
+     * <p>
+     * The most convenient way to measure all of these is to time the fetch and all
+     * the normal {@code terms} agg stuff. You can then subtract those timings from
+     * the overall collect time to get the analyze time. You can also get the total
+     * number of terms that we analyzed by looking at the invocation count on the
+     * {@code terms} agg stuff.
+     * <p>
+     * While we're at it we count the number of values we fetch from source.
+     */
+    private CollectorSource createCollectorSource() {
+        Analyzer analyzer = context.getIndexAnalyzer(f -> {
+            throw new IllegalArgumentException("No analyzer configured for field " + f);
+        });
+        if (context.profiling()) {
+            return new ProfilingSignificantTextCollectorSource(
+                context.lookup().source(),
+                context.bigArrays(),
+                fieldType,
+                analyzer,
+                sourceFieldNames,
+                filterDuplicateText
+            );
+        }
+        return new SignificantTextCollectorSource(
+            context.lookup().source(),
+            context.bigArrays(),
+            fieldType,
+            analyzer,
+            sourceFieldNames,
+            filterDuplicateText
+        );
+    }
+
     private static class SignificantTextCollectorSource implements MapStringTermsAggregator.CollectorSource {
         private final SourceLookup sourceLookup;
         private final BigArrays bigArrays;
         private final MappedFieldType fieldType;
         private final Analyzer analyzer;
         private final String[] sourceFieldNames;
+        private final BytesRefBuilder scratch = new BytesRefBuilder();
         private ObjectArray<DuplicateByteSequenceSpotter> dupSequenceSpotters;
 
         SignificantTextCollectorSource(
@@ -186,6 +227,15 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             dupSequenceSpotters = filterDuplicateText ? bigArrays.newObjectArray(1) : null;
         }
 
+        @Override
+        public String describe() {
+            return "analyze " + fieldType.name() + " from _source";
+        }
+
+        @Override
+        public void collectDebugInfo(BiConsumer<String, Object> add) {
+        }
+
         @Override
         public boolean needsScores() {
             return false;
@@ -200,8 +250,6 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             CollectConsumer consumer
         ) throws IOException {
             return new LeafBucketCollectorBase(sub, null) {
-                private final BytesRefBuilder scratch = new BytesRefBuilder();
-
                 @Override
                 public void collect(int doc, long owningBucketOrd) throws IOException {
                     if (dupSequenceSpotters == null) {
@@ -224,7 +272,7 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
 
                     try {
                         for (String sourceField : sourceFieldNames) {
-                            Iterator<String> itr = sourceLookup.extractRawValues(sourceField).stream()
+                            Iterator<String> itr = extractRawValues(sourceField).stream()
                                 .map(obj -> {
                                     if (obj == null) {
                                         return null;
@@ -236,63 +284,87 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
                                 })
                                 .iterator();
                             while (itr.hasNext()) {
-                                TokenStream ts = analyzer.tokenStream(fieldType.name(), itr.next());
-                                processTokenStream(doc, owningBucketOrd, ts, inDocTerms, spotter);
+                                String text = itr.next();
+                                TokenStream ts = analyzer.tokenStream(fieldType.name(), text);
+                                processTokenStream(
+                                    includeExclude,
+                                    doc,
+                                    owningBucketOrd,
+                                    text,
+                                    ts,
+                                    inDocTerms,
+                                    spotter,
+                                    addRequestCircuitBreakerBytes,
+                                    sub,
+                                    consumer
+                                );
                             }
                         }
                     } finally {
                         Releasables.close(inDocTerms);
                     }
                 }
+            };
+        }
 
-                private void processTokenStream(
-                    int doc,
-                    long owningBucketOrd,
-                    TokenStream ts,
-                    BytesRefHash inDocTerms,
-                    DuplicateByteSequenceSpotter spotter
-                ) throws IOException {
-                    long lastTrieSize = 0;
+        protected void processTokenStream(
+            StringFilter includeExclude,
+            int doc,
+            long owningBucketOrd,
+            String text,
+            TokenStream ts,
+            BytesRefHash inDocTerms,
+            DuplicateByteSequenceSpotter spotter,
+            LongConsumer addRequestCircuitBreakerBytes,
+            LeafBucketCollector sub,
+            CollectConsumer consumer
+        ) throws IOException {
+            long lastTrieSize = 0;
+            if (spotter != null) {
+                lastTrieSize = spotter.getEstimatedSizeInBytes();
+                ts = new DeDuplicatingTokenFilter(ts, spotter);
+            }
+            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+            ts.reset();
+            try {
+                while (ts.incrementToken()) {
                     if (spotter != null) {
-                        lastTrieSize = spotter.getEstimatedSizeInBytes();
-                        ts = new DeDuplicatingTokenFilter(ts, spotter);
+                        long newTrieSize = spotter.getEstimatedSizeInBytes();
+                        long growth = newTrieSize - lastTrieSize;
+                        // Only update the circuit breaker after the trie has grown by at least
+                        // the reporting interval, to keep accounting overhead low
+                        if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
+                            addRequestCircuitBreakerBytes.accept(growth);
+                            lastTrieSize = newTrieSize;
+                        }
                     }
-                    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
-                    ts.reset();
-                    try {
-                        while (ts.incrementToken()) {
-                            if (spotter != null) {
-                                long newTrieSize = spotter.getEstimatedSizeInBytes();
-                                long growth = newTrieSize - lastTrieSize;
-                                // Only update the circuitbreaker after
-                                if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) {
-                                    addRequestCircuitBreakerBytes.accept(growth);
-                                    lastTrieSize = newTrieSize;
-                                }
-                            }
 
-                            scratch.clear();
-                            scratch.copyChars(termAtt);
-                            BytesRef bytes = scratch.get();
-                            if (includeExclude != null && false == includeExclude.accept(bytes)) {
-                                continue;
-                            }
-                            if (inDocTerms.add(bytes) < 0) {
-                                continue;
-                            }
-                            consumer.accept(sub, doc, owningBucketOrd, bytes);
-                        }
-                    } finally {
-                        ts.close();
+                    scratch.clear();
+                    scratch.copyChars(termAtt);
+                    BytesRef bytes = scratch.get();
+                    if (includeExclude != null && false == includeExclude.accept(bytes)) {
+                        continue;
                     }
-                    if (spotter != null) {
-                        long growth = spotter.getEstimatedSizeInBytes() - lastTrieSize;
-                        if (growth > 0) {
-                            addRequestCircuitBreakerBytes.accept(growth);
-                        }
+                    if (inDocTerms.add(bytes) < 0) {
+                        continue;
                     }
+                    consumer.accept(sub, doc, owningBucketOrd, bytes);
                 }
-            };
+            } finally {
+                ts.close();
+            }
+            if (spotter != null) {
+                long growth = spotter.getEstimatedSizeInBytes() - lastTrieSize;
+                if (growth > 0) {
+                    addRequestCircuitBreakerBytes.accept(growth);
+                }
+            }
+        }
+
+        /**
+         * Extract values from {@code _source}.
+         */
+        protected List<Object> extractRawValues(String field) {
+            return sourceLookup.extractRawValues(field);
         }
 
         @Override
@@ -300,4 +372,79 @@ public class SignificantTextAggregatorFactory extends AggregatorFactory {
             Releasables.close(dupSequenceSpotters);
         }
     }
+
+    private static class ProfilingSignificantTextCollectorSource extends SignificantTextCollectorSource {
+        private final Timer extract = new Timer();
+        private final Timer collectAnalyzed = new Timer();
+        private long valuesFetched;
+        private long charsFetched;
+
+        private ProfilingSignificantTextCollectorSource(
+            SourceLookup sourceLookup,
+            BigArrays bigArrays,
+            MappedFieldType fieldType,
+            Analyzer analyzer,
+            String[] sourceFieldNames,
+            boolean filterDuplicateText
+        ) {
+            super(sourceLookup, bigArrays, fieldType, analyzer, sourceFieldNames, filterDuplicateText);
+        }
+
+        @Override
+        protected void processTokenStream(
+            StringFilter includeExclude,
+            int doc,
+            long owningBucketOrd,
+            String text,
+            TokenStream ts,
+            BytesRefHash inDocTerms,
+            DuplicateByteSequenceSpotter spotter,
+            LongConsumer addRequestCircuitBreakerBytes,
+            LeafBucketCollector sub,
+            CollectConsumer consumer
+        ) throws IOException {
+            valuesFetched++;
+            charsFetched += text.length();
+            super.processTokenStream(
+                includeExclude,
+                doc,
+                owningBucketOrd,
+                text,
+                ts,
+                inDocTerms,
+                spotter,
+                addRequestCircuitBreakerBytes,
+                sub,
+                (subCollector, d, o, bytes) -> {
+                    collectAnalyzed.start();
+                    try {
+                        consumer.accept(subCollector, d, o, bytes);
+                    } finally {
+                        collectAnalyzed.stop();
+                    }
+                }
+            );
+        }
+
+        @Override
+        protected List<Object> extractRawValues(String field) {
+            extract.start();
+            try {
+                return super.extractRawValues(field);
+            } finally {
+                extract.stop();
+            }
+        }
+
+        @Override
+        public void collectDebugInfo(BiConsumer<String, Object> add) {
+            super.collectDebugInfo(add);
+            add.accept("extract_ns", extract.getApproximateTiming());
+            add.accept("extract_count", extract.getCount());
+            add.accept("collect_analyzed_ns", collectAnalyzed.getApproximateTiming());
+            add.accept("collect_analyzed_count", collectAnalyzed.getCount());
+            add.accept("values_fetched", valuesFetched);
+            add.accept("chars_fetched", charsFetched);
+        }
+    }
 }

+ 1 - 1
server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java

@@ -325,7 +325,7 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory {
                 return new MapStringTermsAggregator(
                     name,
                     factories,
-                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig.getValuesSource()),
+                    new MapStringTermsAggregator.ValuesSourceCollectorSource(valuesSourceConfig),
                     a -> a.new StandardTermsResults(valuesSourceConfig.getValuesSource()),
                     order,
                     valuesSourceConfig.format(),