  1. [role="xpack"]
  2. [[ml-configuring-transform]]
  3. = Altering data in your {dfeed} with runtime fields
  4. If you use {dfeeds}, you can use runtime fields to alter your data before it
  5. is analyzed. You can add an optional `runtime_mappings` property to your
  6. {dfeeds}, where you can specify field types and scripts that evaluate custom
  7. expressions without affecting the indices that you're retrieving the data from.
  8. If your {dfeed} defines runtime fields, you can use those fields in your
  9. {anomaly-job}. For example, you can use the runtime fields in the analysis
  10. functions in one or more detectors. Runtime fields can impact search performance
  11. based on the computation defined in the runtime script.
  12. * <<ml-configuring-transform1>>
  13. * <<ml-configuring-transform2>>
  14. * <<ml-configuring-transform3>>
  15. * <<ml-configuring-transform4>>
  16. * <<ml-configuring-transform5>>
  17. * <<ml-configuring-transform6>>
  18. * <<ml-configuring-transform7>>
  19. * <<ml-configuring-transform8>>
  20. // * <<ml-configuring-transform9>>
  21. The following index APIs create and add content to an index that is used in
  22. subsequent examples:
  23. [source,console]
  24. ----------------------------------
  25. PUT /my-index-000001
  26. {
  27. "mappings":{
  28. "properties": {
  29. "@timestamp": {
  30. "type": "date"
  31. },
  32. "aborted_count": {
  33. "type": "long"
  34. },
  35. "another_field": {
  36. "type": "keyword" <1>
  37. },
  38. "clientip": {
  39. "type": "keyword"
  40. },
  41. "coords": {
  42. "properties": {
  43. "lat": {
  44. "type": "keyword"
  45. },
  46. "lon": {
  47. "type": "keyword"
  48. }
  49. }
  50. },
  51. "error_count": {
  52. "type": "long"
  53. },
  54. "query": {
  55. "type": "keyword"
  56. },
  57. "some_field": {
  58. "type": "keyword"
  59. },
  60. "tokenstring1":{
  61. "type":"keyword"
  62. },
  63. "tokenstring2":{
  64. "type":"keyword"
  65. },
  66. "tokenstring3":{
  67. "type":"keyword"
  68. }
  69. }
  70. }
  71. }
  72. PUT /my-index-000001/_doc/1
  73. {
  74. "@timestamp":"2017-03-23T13:00:00",
  75. "error_count":36320,
  76. "aborted_count":4156,
  77. "some_field":"JOE",
  78. "another_field":"SMITH ",
  79. "tokenstring1":"foo-bar-baz",
  80. "tokenstring2":"foo bar baz",
  81. "tokenstring3":"foo-bar-19",
  82. "query":"www.ml.elastic.co",
  83. "clientip":"123.456.78.900",
  84. "coords": {
  85. "lat" : 41.44,
  86. "lon":90.5
  87. }
  88. }
  89. ----------------------------------
  90. // TEST[skip:SETUP]
  91. <1> In this example, string fields are mapped as `keyword` fields to support
  92. aggregation. If you want both a full text (`text`) and a keyword (`keyword`)
  93. version of the same field, use multi-fields. For more information, see
  94. {ref}/multi-fields.html[fields].
  95. [[ml-configuring-transform1]]
  96. .Example 1: Adding two numerical fields
  97. [source,console]
  98. ----------------------------------
  99. PUT _ml/anomaly_detectors/test1
  100. {
  101. "analysis_config":{
  102. "bucket_span": "10m",
  103. "detectors":[
  104. {
  105. "function":"mean",
  106. "field_name": "total_error_count", <1>
  107. "detector_description": "Custom script field transformation"
  108. }
  109. ]
  110. },
  111. "data_description": {
  112. "time_field":"@timestamp",
  113. "time_format":"epoch_ms"
  114. }
  115. }
  116. PUT _ml/datafeeds/datafeed-test1
  117. {
  118. "job_id": "test1",
  119. "indices": [
  120. "my-index-000001"
  121. ],
  122. "query": {
  123. "match_all": {
  124. "boost": 1
  125. }
  126. },
  127. "runtime_mappings": {
  128. "total_error_count": { <2>
  129. "type": "long",
  130. "script": {
  131. "source": "emit(doc['error_count'].value + doc['aborted_count'].value)"
  132. }
  133. }
  134. }
  135. }
  136. ----------------------------------
  137. // TEST[skip:needs-licence]
  138. <1> A runtime field named `total_error_count` is referenced in the detector
  139. within the job.
  140. <2> The runtime field is defined in the {dfeed}.
  141. This `test1` {anomaly-job} contains a detector that uses a runtime field in a
  142. mean analysis function. The `datafeed-test1` {dfeed} defines the runtime field.
  143. It contains a script that adds two fields in the document to produce a "total"
  144. error count.
  145. The syntax for the `runtime_mappings` property is identical to that used by
  146. {es}. For more information, see {ref}/runtime.html[Runtime fields].
  147. You can preview the contents of the {dfeed} by using the following API:
  148. [source,console]
  149. ----------------------------------
  150. GET _ml/datafeeds/datafeed-test1/_preview
  151. ----------------------------------
  152. // TEST[skip:continued]
  153. In this example, the API returns the following results, which contain a sum of
  154. the `error_count` and `aborted_count` values:
  155. [source,js]
  156. ----------------------------------
  157. [
  158. {
  159. "@timestamp": 1490274000000,
  160. "total_error_count": 40476
  161. }
  162. ]
  163. ----------------------------------
  164. NOTE: This example demonstrates how to use runtime fields, but it contains
  165. insufficient data to generate meaningful results.
  166. //For a full demonstration of
  167. //how to create jobs with sample data, see <<ml-getting-started>>.
  168. You can alternatively use {kib} to create an advanced {anomaly-job} that uses
  169. runtime fields. To add the `runtime_mappings` property to your {dfeed}, you must
  170. use the **Edit JSON** tab. For example:
  171. [role="screenshot"]
  172. image::images/ml-runtimefields.jpg[Using runtime_mappings in {dfeed} config via {kib}]
  173. [[ml-configuring-transform-examples]]
  174. == Common runtime field examples
  175. While the possibilities are limitless, there are a number of common scenarios
  176. where you might use runtime fields in your {dfeeds}.
  177. [NOTE]
  178. ===============================
  179. Some of these examples use regular expressions. By default, regular
  180. expressions are disabled because they circumvent the protection that Painless
  181. provides against long running and memory hungry scripts. For more information,
  182. see {ref}/modules-scripting-painless.html[Painless scripting language].
  183. {ml-cap} analysis is case sensitive. For example, "John" is considered to be
  184. different from "john". This is one reason you might consider using scripts that
  185. convert your strings to upper or lowercase letters.
  186. ===============================
  187. [[ml-configuring-transform2]]
  188. .Example 2: Concatenating strings
  189. [source,console]
  190. --------------------------------------------------
  191. PUT _ml/anomaly_detectors/test2
  192. {
  193. "analysis_config":{
  194. "bucket_span": "10m",
  195. "detectors":[
  196. {
  197. "function":"low_info_content",
  198. "field_name":"my_runtime_field", <1>
  199. "detector_description": "Custom script field transformation"
  200. }
  201. ]
  202. },
  203. "data_description": {
  204. "time_field":"@timestamp",
  205. "time_format":"epoch_ms"
  206. }
  207. }
  208. PUT _ml/datafeeds/datafeed-test2
  209. {
  210. "job_id": "test2",
  211. "indices": ["my-index-000001"],
  212. "query": {
  213. "match_all": {
  214. "boost": 1
  215. }
  216. },
  217. "runtime_mappings": {
  218. "my_runtime_field": {
  219. "type": "keyword",
  220. "script": {
  221. "source": "emit(doc['some_field'].value + '_' + doc['another_field'].value)" <2>
  222. }
  223. }
  224. }
  225. }
  226. GET _ml/datafeeds/datafeed-test2/_preview
  227. --------------------------------------------------
  228. // TEST[skip:needs-licence]
  229. <1> The runtime field has a generic name in this case, since it is used for
  230. various tests in the examples.
  231. <2> The runtime field uses the plus (+) operator to concatenate strings.
  232. The preview {dfeed} API returns the following results, which show that "JOE"
  233. and "SMITH " have been concatenated and an underscore was added:
  234. [source,js]
  235. ----------------------------------
  236. [
  237. {
  238. "@timestamp": 1490274000000,
  239. "my_runtime_field": "JOE_SMITH "
  240. }
  241. ]
  242. ----------------------------------
  243. [[ml-configuring-transform3]]
  244. .Example 3: Trimming strings
  245. [source,console]
  246. --------------------------------------------------
  247. POST _ml/datafeeds/datafeed-test2/_update
  248. {
  249. "runtime_mappings": {
  250. "my_runtime_field": {
  251. "type": "keyword",
  252. "script": {
  253. "source": "emit(doc['another_field'].value.trim())" <1>
  254. }
  255. }
  256. }
  257. }
  258. GET _ml/datafeeds/datafeed-test2/_preview
  259. --------------------------------------------------
  260. // TEST[skip:continued]
  261. <1> This runtime field uses the `trim()` function to trim extra white space from
  262. a string.
  263. The preview {dfeed} API returns the following results, which show that "SMITH "
  264. has been trimmed to "SMITH":
  265. [source,js]
  266. ----------------------------------
  267. [
  268. {
  269. "@timestamp": 1490274000000,
  270. "my_runtime_field": "SMITH"
  271. }
  272. ]
  273. ----------------------------------
  274. [[ml-configuring-transform4]]
  275. .Example 4: Converting strings to lowercase
  276. [source,console]
  277. --------------------------------------------------
  278. POST _ml/datafeeds/datafeed-test2/_update
  279. {
  280. "runtime_mappings": {
  281. "my_runtime_field": {
  282. "type": "keyword",
  283. "script": {
  284. "source": "emit(doc['some_field'].value.toLowerCase())" <1>
  285. }
  286. }
  287. }
  288. }
  289. GET _ml/datafeeds/datafeed-test2/_preview
  290. --------------------------------------------------
  291. // TEST[skip:continued]
  292. <1> This runtime field uses the `toLowerCase` function to convert a string to
  293. all lowercase letters. Likewise, you can use the `toUpperCase()` function to
  294. convert a string to uppercase letters.
  295. The preview {dfeed} API returns the following results, which show that "JOE"
  296. has been converted to "joe":
  297. [source,js]
  298. ----------------------------------
  299. [
  300. {
  301. "@timestamp": 1490274000000,
  302. "my_runtime_field": "joe"
  303. }
  304. ]
  305. ----------------------------------
  306. [[ml-configuring-transform5]]
  307. .Example 5: Converting strings to mixed case formats
  308. [source,console]
  309. --------------------------------------------------
  310. POST _ml/datafeeds/datafeed-test2/_update
  311. {
  312. "runtime_mappings": {
  313. "my_runtime_field": {
  314. "type": "keyword",
  315. "script": {
  316. "source": "emit(doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase())" <1>
  317. }
  318. }
  319. }
  320. }
  321. GET _ml/datafeeds/datafeed-test2/_preview
  322. --------------------------------------------------
  323. // TEST[skip:continued]
  324. <1> This runtime field is a more complicated example of case manipulation. It
  325. uses the `substring()` function to capitalize the first letter of a string and
  326. converts the remaining characters to lowercase.
  327. The preview {dfeed} API returns the following results, which show that "JOE" has
  328. been converted to "Joe":
  329. [source,js]
  330. ----------------------------------
  331. [
  332. {
  333. "@timestamp": 1490274000000,
  334. "my_runtime_field": "Joe"
  335. }
  336. ]
  337. ----------------------------------
  338. [[ml-configuring-transform6]]
  339. .Example 6: Replacing tokens
  340. [source,console]
  341. --------------------------------------------------
  342. POST _ml/datafeeds/datafeed-test2/_update
  343. {
  344. "runtime_mappings": {
  345. "my_runtime_field": {
  346. "type": "keyword",
  347. "script": {
  348. "source": "emit(/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_'))" <1>
  349. }
  350. }
  351. }
  352. }
  353. GET _ml/datafeeds/datafeed-test2/_preview
  354. --------------------------------------------------
  355. // TEST[skip:continued]
  356. <1> This script uses regular expressions to replace white space with
  357. underscores.
  358. The preview {dfeed} API returns the following results, which show that "foo bar
  359. baz" has been converted to "foo_bar_baz":
  360. [source,js]
  361. ----------------------------------
  362. [
  363. {
  364. "@timestamp": 1490274000000,
  365. "my_runtime_field": "foo_bar_baz"
  366. }
  367. ]
  368. ----------------------------------
  369. [[ml-configuring-transform7]]
  370. .Example 7: Regular expression matching and concatenation
  371. [source,console]
  372. --------------------------------------------------
  373. POST _ml/datafeeds/datafeed-test2/_update
  374. {
  375. "runtime_mappings": {
  376. "my_runtime_field": {
  377. "type": "keyword",
  378. "script": {
  379. "source": "emit(def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); return m.find() ? m.group(1) + '_' + m.group(2) : '';)" <1>
  380. }
  381. }
  382. }
  383. }
  384. GET _ml/datafeeds/datafeed-test2/_preview
  385. --------------------------------------------------
  386. // TEST[skip:continued]
  387. <1> This script looks for a specific regular expression pattern and emits the
  388. matched groups as a concatenated string. If no match is found, it emits an empty
  389. string.
  390. The preview {dfeed} API returns the following results, which show that
  391. "foo-bar-19" has been converted to "foo_19":
  392. [source,js]
  393. ----------------------------------
  394. [
  395. {
  396. "@timestamp": 1490274000000,
  397. "my_runtime_field": "foo_19"
  398. }
  399. ]
  400. ----------------------------------
  401. [[ml-configuring-transform8]]
  402. .Example 8: Transforming geo_point data
  403. [source,console]
  404. --------------------------------------------------
  405. PUT _ml/anomaly_detectors/test4
  406. {
  407. "analysis_config":{
  408. "bucket_span": "10m",
  409. "detectors":[
  410. {
  411. "function":"lat_long",
  412. "field_name": "my_coordinates"
  413. }
  414. ]
  415. },
  416. "data_description": {
  417. "time_field":"@timestamp",
  418. "time_format":"epoch_ms"
  419. }
  420. }
  421. PUT _ml/datafeeds/datafeed-test4
  422. {
  423. "job_id": "test4",
  424. "indices": ["my-index-000001"],
  425. "query": {
  426. "match_all": {
  427. "boost": 1
  428. }
  429. },
  430. "runtime_mappings": {
  431. "my_coordinates": {
  432. "type": "keyword",
  433. "script": {
  434. "source": "emit(doc['coords.lat'].value + ',' + doc['coords.lon'].value)"
  435. }
  436. }
  437. }
  438. }
  439. GET _ml/datafeeds/datafeed-test4/_preview
  440. --------------------------------------------------
  441. // TEST[skip:needs-licence]
  442. In {es}, location data can be stored in `geo_point` fields but this data type is
  443. not supported natively in {ml} analytics. This example of a runtime field
  444. transforms the data into an appropriate format. For more information,
  445. see <<ml-geo-functions>>.
  446. The preview {dfeed} API returns the following results, which show that
  447. `41.44` and `90.5` have been combined into "41.44,90.5":
  448. [source,js]
  449. ----------------------------------
  450. [
  451. {
  452. "@timestamp": 1490274000000,
  453. "my_coordinates": "41.44,90.5"
  454. }
  455. ]
  456. ----------------------------------
  457. ////
  458. [[ml-configuring-transform9]]
  459. .Example 9: Splitting strings by domain name
  460. [source,console]
  461. --------------------------------------------------
  462. PUT _ml/anomaly_detectors/test3
  463. {
  464. "description":"DNS tunneling",
  465. "analysis_config":{
  466. "bucket_span": "30m",
  467. "influencers": ["clientip","hrd"],
  468. "detectors":[
  469. {
  470. "function":"high_info_content",
  471. "field_name": "sub",
  472. "over_field_name": "hrd",
  473. "exclude_frequent":"all"
  474. }
  475. ]
  476. },
  477. "data_description": {
  478. "time_field":"@timestamp",
  479. "time_format":"epoch_ms"
  480. }
  481. }
  482. PUT _ml/datafeeds/datafeed-test3
  483. {
  484. "job_id": "test3",
  485. "indices": ["my-index-000001"],
  486. "query": {
  487. "match_all": {
  488. "boost": 1
  489. }
  490. },
  491. "script_fields":{
  492. "sub":{
  493. "script":"return domainSplit(doc['query'].value).get(0);"
  494. },
  495. "hrd":{
  496. "script":"return domainSplit(doc['query'].value).get(1);"
  497. }
  498. }
  499. }
  500. GET _ml/datafeeds/datafeed-test3/_preview
  501. --------------------------------------------------
  502. // TEST[skip:needs-licence]
  503. If you have a single field that contains a well-formed DNS domain name, you can
  504. use the `domainSplit()` function to split the string into its highest registered
  505. domain and the sub-domain, which is everything to the left of the highest
  506. registered domain. For example, the highest registered domain of
  507. `www.ml.elastic.co` is `elastic.co` and the sub-domain is `www.ml`. The
  508. `domainSplit()` function returns an array of two values: the first value is the
  509. subdomain; the second value is the highest registered domain.
  510. The preview {dfeed} API returns the following results, which show that
  511. "www.ml.elastic.co" has been split into "elastic.co" and "www.ml":
  512. [source,js]
  513. ----------------------------------
  514. [
  515. {
  516. "@timestamp": 1490274000000,
  517. "clientip.keyword": "123.456.78.900",
  518. "hrd": "elastic.co",
  519. "sub": "www.ml"
  520. }
  521. ]
  522. ----------------------------------
  523. ////