bucket-count-ks-test-aggregation.asciidoc 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. [role="xpack"]
  2. [testenv="basic"]
  3. [[search-aggregations-bucket-count-ks-test-aggregation]]
  4. === Bucket count K-S test correlation aggregation
  5. ++++
  6. <titleabbrev>Bucket count K-S test</titleabbrev>
  7. ++++
  8. experimental::[]
  9. A sibling pipeline aggregation which executes a two sample Kolmogorov–Smirnov test
  10. (referred to as a "K-S test" from now on) against a provided distribution, and the
  11. distribution implied by the document counts in the configured sibling aggregation.
  12. Specifically, for some metric, assuming that the percentile intervals of the metric are
  13. known beforehand or have been computed by an aggregation, then one would use a range
  14. aggregation for the sibling to compute the p-value of the distribution difference between
  15. the metric and the restriction of that metric to a subset of the documents. A natural use
  16. case is if the sibling aggregation is a range aggregation nested in a terms aggregation, in
  17. which case one compares the overall distribution of the metric to its restriction to each term.
  18. [[bucket-count-ks-test-agg-syntax]]
  19. ==== Parameters
  20. `buckets_path`::
  21. (Required, string)
  22. Path to the buckets that contain one set of values to correlate. Must be a `_count` path.
  23. For syntax, see <<buckets-path-syntax>>.
  24. `alternative`::
  25. (Optional, list)
  26. A list of string values indicating which K-S test alternative to calculate.
  27. The valid values are: "greater", "less", "two_sided". This parameter is key for
  28. determining the K-S statistic used when calculating the K-S test. Default value is
  29. all possible alternative hypotheses.
  30. `fractions`::
  31. (Optional, list)
  32. A list of doubles indicating the distribution of the samples with which to compare the
  33. `buckets_path` results. In typical usage this is the overall proportion of documents in
  34. each bucket, which is compared with the actual document proportions in each bucket
  35. from the sibling aggregation counts. The default is to assume that overall documents
  36. are uniformly distributed on these buckets, which they would be if one used equal
  37. percentiles of a metric to define the bucket end points.
  38. `sampling_method`::
  39. (Optional, string)
  40. Indicates the sampling methodology when calculating the K-S test. Note, this is sampling
  41. of the returned values. This determines the cumulative distribution function (CDF) points
  42. used when comparing the two samples. Default is `upper_tail`, which emphasizes the upper
  43. end of the CDF points. Valid options are: `upper_tail`, `uniform`, and `lower_tail`.
  44. ==== Syntax
  45. A `bucket_count_ks_test` aggregation looks like this in isolation:
  46. [source,js]
  47. --------------------------------------------------
  48. {
  49. "bucket_count_ks_test": {
  50. "buckets_path": "range_values>_count", <1>
  51. "alternative": ["less", "greater", "two_sided"], <2>
  52. "sampling_method": "upper_tail" <3>
  53. }
  54. }
  55. --------------------------------------------------
  56. // NOTCONSOLE
  57. <1> The buckets containing the values to test against.
  58. <2> The alternatives to calculate.
  59. <3> The sampling method for the K-S statistic.
  60. [[bucket-count-ks-test-agg-example]]
  61. ==== Example
  62. The following snippet runs the `bucket_count_ks_test` on the individual terms in the field `version` against a uniform distribution.
  63. The uniform distribution reflects the `latency` percentile buckets. Not shown is the pre-calculation of the `latency` indicator values,
  64. which was done utilizing the
  65. <<search-aggregations-metrics-percentile-aggregation,percentiles>> aggregation.
  66. This example is only using the deciles of `latency`.
  67. [source,console]
  68. -------------------------------------------------
  69. POST correlate_latency/_search?size=0&filter_path=aggregations
  70. {
  71. "aggs": {
  72. "buckets": {
  73. "terms": { <1>
  74. "field": "version",
  75. "size": 2
  76. },
  77. "aggs": {
  78. "latency_ranges": {
  79. "range": { <2>
  80. "field": "latency",
  81. "ranges": [
  82. { "to": 0 },
  83. { "from": 0, "to": 105 },
  84. { "from": 105, "to": 225 },
  85. { "from": 225, "to": 445 },
  86. { "from": 445, "to": 665 },
  87. { "from": 665, "to": 885 },
  88. { "from": 885, "to": 1115 },
  89. { "from": 1115, "to": 1335 },
  90. { "from": 1335, "to": 1555 },
  91. { "from": 1555, "to": 1775 },
  92. { "from": 1775 }
  93. ]
  94. }
  95. },
  96. "ks_test": { <3>
  97. "bucket_count_ks_test": {
  98. "buckets_path": "latency_ranges>_count",
  99. "alternative": ["less", "greater", "two_sided"]
  100. }
  101. }
  102. }
  103. }
  104. }
  105. }
  106. -------------------------------------------------
  107. // TEST[setup:correlate_latency]
  108. <1> The term buckets containing a range aggregation and the bucket count K-S test aggregation. Both are utilized to compare
  109. the latency distribution of each term value with the overall latency distribution.
  110. <2> The range aggregation on the latency field. The ranges were created referencing the percentiles of the latency field.
  111. <3> The bucket count K-S test aggregation that tests if the bucket counts come from the same distribution as `fractions`;
  112. where `fractions` is a uniform distribution.
  113. And the following may be the response:
  114. [source,console-result]
  115. ----
  116. {
  117. "aggregations" : {
  118. "buckets" : {
  119. "doc_count_error_upper_bound" : 0,
  120. "sum_other_doc_count" : 0,
  121. "buckets" : [
  122. {
  123. "key" : "1.0",
  124. "doc_count" : 100,
  125. "latency_ranges" : {
  126. "buckets" : [
  127. {
  128. "key" : "*-0.0",
  129. "to" : 0.0,
  130. "doc_count" : 0
  131. },
  132. {
  133. "key" : "0.0-105.0",
  134. "from" : 0.0,
  135. "to" : 105.0,
  136. "doc_count" : 1
  137. },
  138. {
  139. "key" : "105.0-225.0",
  140. "from" : 105.0,
  141. "to" : 225.0,
  142. "doc_count" : 9
  143. },
  144. {
  145. "key" : "225.0-445.0",
  146. "from" : 225.0,
  147. "to" : 445.0,
  148. "doc_count" : 0
  149. },
  150. {
  151. "key" : "445.0-665.0",
  152. "from" : 445.0,
  153. "to" : 665.0,
  154. "doc_count" : 0
  155. },
  156. {
  157. "key" : "665.0-885.0",
  158. "from" : 665.0,
  159. "to" : 885.0,
  160. "doc_count" : 0
  161. },
  162. {
  163. "key" : "885.0-1115.0",
  164. "from" : 885.0,
  165. "to" : 1115.0,
  166. "doc_count" : 10
  167. },
  168. {
  169. "key" : "1115.0-1335.0",
  170. "from" : 1115.0,
  171. "to" : 1335.0,
  172. "doc_count" : 20
  173. },
  174. {
  175. "key" : "1335.0-1555.0",
  176. "from" : 1335.0,
  177. "to" : 1555.0,
  178. "doc_count" : 20
  179. },
  180. {
  181. "key" : "1555.0-1775.0",
  182. "from" : 1555.0,
  183. "to" : 1775.0,
  184. "doc_count" : 20
  185. },
  186. {
  187. "key" : "1775.0-*",
  188. "from" : 1775.0,
  189. "doc_count" : 20
  190. }
  191. ]
  192. },
  193. "ks_test" : {
  194. "less" : 2.248673241788478E-4,
  195. "greater" : 1.0,
  196. "two_sided" : 5.791639181800257E-4
  197. }
  198. },
  199. {
  200. "key" : "2.0",
  201. "doc_count" : 100,
  202. "latency_ranges" : {
  203. "buckets" : [
  204. {
  205. "key" : "*-0.0",
  206. "to" : 0.0,
  207. "doc_count" : 0
  208. },
  209. {
  210. "key" : "0.0-105.0",
  211. "from" : 0.0,
  212. "to" : 105.0,
  213. "doc_count" : 19
  214. },
  215. {
  216. "key" : "105.0-225.0",
  217. "from" : 105.0,
  218. "to" : 225.0,
  219. "doc_count" : 11
  220. },
  221. {
  222. "key" : "225.0-445.0",
  223. "from" : 225.0,
  224. "to" : 445.0,
  225. "doc_count" : 20
  226. },
  227. {
  228. "key" : "445.0-665.0",
  229. "from" : 445.0,
  230. "to" : 665.0,
  231. "doc_count" : 20
  232. },
  233. {
  234. "key" : "665.0-885.0",
  235. "from" : 665.0,
  236. "to" : 885.0,
  237. "doc_count" : 20
  238. },
  239. {
  240. "key" : "885.0-1115.0",
  241. "from" : 885.0,
  242. "to" : 1115.0,
  243. "doc_count" : 10
  244. },
  245. {
  246. "key" : "1115.0-1335.0",
  247. "from" : 1115.0,
  248. "to" : 1335.0,
  249. "doc_count" : 0
  250. },
  251. {
  252. "key" : "1335.0-1555.0",
  253. "from" : 1335.0,
  254. "to" : 1555.0,
  255. "doc_count" : 0
  256. },
  257. {
  258. "key" : "1555.0-1775.0",
  259. "from" : 1555.0,
  260. "to" : 1775.0,
  261. "doc_count" : 0
  262. },
  263. {
  264. "key" : "1775.0-*",
  265. "from" : 1775.0,
  266. "doc_count" : 0
  267. }
  268. ]
  269. },
  270. "ks_test" : {
  271. "less" : 0.9642895789647244,
  272. "greater" : 4.58718174664754E-9,
  273. "two_sided" : 5.916656831139733E-9
  274. }
  275. }
  276. ]
  277. }
  278. }
  279. }
  280. ----