transforms.asciidoc 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. [role="xpack"]
  2. [[ml-configuring-transform]]
  3. === Transforming data with script fields
  4. If you use {dfeeds}, you can add scripts to transform your data before
  5. it is analyzed. {dfeeds-cap} contain an optional `script_fields` property, where
  6. you can specify scripts that evaluate custom expressions and return script
  7. fields.
  8. If your {dfeed} defines script fields, you can use those fields in your job.
  9. For example, you can use the script fields in the analysis functions in one or
  10. more detectors.
  11. * <<ml-configuring-transform1>>
  12. * <<ml-configuring-transform2>>
  13. * <<ml-configuring-transform3>>
  14. * <<ml-configuring-transform4>>
  15. * <<ml-configuring-transform5>>
  16. * <<ml-configuring-transform6>>
  17. * <<ml-configuring-transform7>>
  18. * <<ml-configuring-transform8>>
  19. * <<ml-configuring-transform9>>
  20. The following indices APIs create and add content to an index that is used in
  21. subsequent examples:
  22. [source,js]
  23. ----------------------------------
  24. PUT /my_index
  25. {
  26. "mappings":{
  27. "_doc":{
  28. "properties": {
  29. "@timestamp": {
  30. "type": "date"
  31. },
  32. "aborted_count": {
  33. "type": "long"
  34. },
  35. "another_field": {
  36. "type": "keyword" <1>
  37. },
  38. "clientip": {
  39. "type": "keyword"
  40. },
  41. "coords": {
  42. "properties": {
  43. "lat": {
  44. "type": "keyword"
  45. },
  46. "lon": {
  47. "type": "keyword"
  48. }
  49. }
  50. },
  51. "error_count": {
  52. "type": "long"
  53. },
  54. "query": {
  55. "type": "keyword"
  56. },
  57. "some_field": {
  58. "type": "keyword"
  59. },
  60. "tokenstring1":{
  61. "type":"keyword"
  62. },
  63. "tokenstring2":{
  64. "type":"keyword"
  65. },
  66. "tokenstring3":{
  67. "type":"keyword"
  68. }
  69. }
  70. }
  71. }
  72. }
  73. PUT /my_index/_doc/1
  74. {
  75. "@timestamp":"2017-03-23T13:00:00",
  76. "error_count":36320,
  77. "aborted_count":4156,
  78. "some_field":"JOE",
  79. "another_field":"SMITH ",
  80. "tokenstring1":"foo-bar-baz",
  81. "tokenstring2":"foo bar baz",
  82. "tokenstring3":"foo-bar-19",
  83. "query":"www.ml.elastic.co",
  84. "clientip":"123.456.78.900",
  85. "coords": {
  86. "lat" : 41.44,
  87. "lon":90.5
  88. }
  89. }
  90. ----------------------------------
  91. // CONSOLE
  92. // TEST[skip:SETUP]
  93. <1> In this example, string fields are mapped as `keyword` fields to support
  94. aggregation. If you want both a full text (`text`) and a keyword (`keyword`)
  95. version of the same field, use multi-fields. For more information, see
  96. {ref}/multi-fields.html[multi-fields].
  97. [[ml-configuring-transform1]]
  98. .Example 1: Adding two numerical fields
  99. [source,js]
  100. ----------------------------------
  101. PUT _ml/anomaly_detectors/test1
  102. {
  103. "analysis_config":{
  104. "bucket_span": "10m",
  105. "detectors":[
  106. {
  107. "function":"mean",
  108. "field_name": "total_error_count", <1>
  109. "detector_description": "Custom script field transformation"
  110. }
  111. ]
  112. },
  113. "data_description": {
  114. "time_field":"@timestamp",
  115. "time_format":"epoch_ms"
  116. }
  117. }
  118. PUT _ml/datafeeds/datafeed-test1
  119. {
  120. "job_id": "test1",
  121. "indices": ["my_index"],
  122. "query": {
  123. "match_all": {
  124. "boost": 1
  125. }
  126. },
  127. "script_fields": {
  128. "total_error_count": { <2>
  129. "script": {
  130. "lang": "expression",
  131. "inline": "doc['error_count'].value + doc['aborted_count'].value"
  132. }
  133. }
  134. }
  135. }
  136. ----------------------------------
  137. // CONSOLE
  138. // TEST[skip:needs-licence]
  139. <1> A script field named `total_error_count` is referenced in the detector
  140. within the job.
  141. <2> The script field is defined in the {dfeed}.
  142. This `test1` job contains a detector that uses a script field in a mean analysis
  143. function. The `datafeed-test1` {dfeed} defines the script field. It contains a
  144. script that adds two fields in the document to produce a "total" error count.
  145. The syntax for the `script_fields` property is identical to that used by {es}.
  146. For more information, see {ref}/search-request-script-fields.html[Script Fields].
  147. You can preview the contents of the {dfeed} by using the following API:
  148. [source,js]
  149. ----------------------------------
  150. GET _ml/datafeeds/datafeed-test1/_preview
  151. ----------------------------------
  152. // CONSOLE
  153. // TEST[skip:continued]
  154. In this example, the API returns the following results, which contain a sum of
  155. the `error_count` and `aborted_count` values:
  156. [source,js]
  157. ----------------------------------
  158. [
  159. {
  160. "@timestamp": 1490274000000,
  161. "total_error_count": 40476
  162. }
  163. ]
  164. ----------------------------------
  165. NOTE: This example demonstrates how to use script fields, but it contains
  166. insufficient data to generate meaningful results. For a full demonstration of
  167. how to create jobs with sample data, see <<ml-getting-started>>.
  168. You can alternatively use {kib} to create an advanced job that uses script
  169. fields. To add the `script_fields` property to your {dfeed}, you must use the
  170. **Edit JSON** tab. For example:
  171. [role="screenshot"]
  172. image::images/ml-scriptfields.jpg[Adding script fields to a {dfeed} in {kib}]
  173. [[ml-configuring-transform-examples]]
  174. ==== Common Script Field Examples
  175. While the possibilities are limitless, there are a number of common scenarios
  176. where you might use script fields in your {dfeeds}.
  177. [NOTE]
  178. ===============================
  179. Some of these examples use regular expressions. By default, regular
  180. expressions are disabled because they circumvent the protection that Painless
  181. provides against long-running and memory-hungry scripts. For more information,
  182. see {ref}/modules-scripting-painless.html[Painless Scripting Language].
  183. Machine learning analysis is case sensitive. For example, "John" is considered
  184. to be different from "john". This is one reason you might consider using scripts
  185. that convert your strings to upper or lowercase letters.
  186. ===============================
  187. [[ml-configuring-transform2]]
  188. .Example 2: Concatenating strings
  189. [source,js]
  190. --------------------------------------------------
  191. PUT _ml/anomaly_detectors/test2
  192. {
  193. "analysis_config":{
  194. "bucket_span": "10m",
  195. "detectors":[
  196. {
  197. "function":"low_info_content",
  198. "field_name":"my_script_field", <1>
  199. "detector_description": "Custom script field transformation"
  200. }
  201. ]
  202. },
  203. "data_description": {
  204. "time_field":"@timestamp",
  205. "time_format":"epoch_ms"
  206. }
  207. }
  208. PUT _ml/datafeeds/datafeed-test2
  209. {
  210. "job_id": "test2",
  211. "indices": ["my_index"],
  212. "query": {
  213. "match_all": {
  214. "boost": 1
  215. }
  216. },
  217. "script_fields": {
  218. "my_script_field": {
  219. "script": {
  220. "lang": "painless",
  221. "inline": "doc['some_field'].value + '_' + doc['another_field'].value" <2>
  222. }
  223. }
  224. }
  225. }
  226. GET _ml/datafeeds/datafeed-test2/_preview
  227. --------------------------------------------------
  228. // CONSOLE
  229. // TEST[skip:needs-licence]
  230. <1> The script field has a rather generic name in this case, since it will
  231. be used for various tests in the subsequent examples.
  232. <2> The script field uses the plus (+) operator to concatenate strings.
  233. The preview {dfeed} API returns the following results, which show that "JOE"
  234. and "SMITH " have been concatenated with an underscore between them:
  235. [source,js]
  236. ----------------------------------
  237. [
  238. {
  239. "@timestamp": 1490274000000,
  240. "my_script_field": "JOE_SMITH "
  241. }
  242. ]
  243. ----------------------------------
  244. [[ml-configuring-transform3]]
  245. .Example 3: Trimming strings
  246. [source,js]
  247. --------------------------------------------------
  248. POST _ml/datafeeds/datafeed-test2/_update
  249. {
  250. "script_fields": {
  251. "my_script_field": {
  252. "script": {
  253. "lang": "painless",
  254. "inline": "doc['another_field'].value.trim()" <1>
  255. }
  256. }
  257. }
  258. }
  259. GET _ml/datafeeds/datafeed-test2/_preview
  260. --------------------------------------------------
  261. // CONSOLE
  262. // TEST[skip:continued]
  263. <1> This script field uses the `trim()` function to trim extra white space from a
  264. string.
  265. The preview {dfeed} API returns the following results, which show that "SMITH "
  266. has been trimmed to "SMITH":
  267. [source,js]
  268. ----------------------------------
  269. [
  270. {
  271. "@timestamp": 1490274000000,
  272. "my_script_field": "SMITH"
  273. }
  274. ]
  275. ----------------------------------
  276. [[ml-configuring-transform4]]
  277. .Example 4: Converting strings to lowercase
  278. [source,js]
  279. --------------------------------------------------
  280. POST _ml/datafeeds/datafeed-test2/_update
  281. {
  282. "script_fields": {
  283. "my_script_field": {
  284. "script": {
  285. "lang": "painless",
  286. "inline": "doc['some_field'].value.toLowerCase()" <1>
  287. }
  288. }
  289. }
  290. }
  291. GET _ml/datafeeds/datafeed-test2/_preview
  292. --------------------------------------------------
  293. // CONSOLE
  294. // TEST[skip:continued]
  295. <1> This script field uses the `toLowerCase()` function to convert a string to all
  296. lowercase letters. Likewise, you can use the `toUpperCase()` function to convert
  297. a string to uppercase letters.
  298. The preview {dfeed} API returns the following results, which show that "JOE"
  299. has been converted to "joe":
  300. [source,js]
  301. ----------------------------------
  302. [
  303. {
  304. "@timestamp": 1490274000000,
  305. "my_script_field": "joe"
  306. }
  307. ]
  308. ----------------------------------
  309. [[ml-configuring-transform5]]
  310. .Example 5: Converting strings to mixed case formats
  311. [source,js]
  312. --------------------------------------------------
  313. POST _ml/datafeeds/datafeed-test2/_update
  314. {
  315. "script_fields": {
  316. "my_script_field": {
  317. "script": {
  318. "lang": "painless",
  319. "inline": "doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase()" <1>
  320. }
  321. }
  322. }
  323. }
  324. GET _ml/datafeeds/datafeed-test2/_preview
  325. --------------------------------------------------
  326. // CONSOLE
  327. // TEST[skip:continued]
  328. <1> This script field is a more complicated example of case manipulation. It uses
  329. the `substring()` function to isolate the first letter of a string, converts that
  330. letter to uppercase, and converts the remaining characters to lowercase.
  331. The preview {dfeed} API returns the following results, which show that "JOE"
  332. has been converted to "Joe":
  333. [source,js]
  334. ----------------------------------
  335. [
  336. {
  337. "@timestamp": 1490274000000,
  338. "my_script_field": "Joe"
  339. }
  340. ]
  341. ----------------------------------
  342. [[ml-configuring-transform6]]
  343. .Example 6: Replacing tokens
  344. [source,js]
  345. --------------------------------------------------
  346. POST _ml/datafeeds/datafeed-test2/_update
  347. {
  348. "script_fields": {
  349. "my_script_field": {
  350. "script": {
  351. "lang": "painless",
  352. "inline": "/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_')" <1>
  353. }
  354. }
  355. }
  356. }
  357. GET _ml/datafeeds/datafeed-test2/_preview
  358. --------------------------------------------------
  359. // CONSOLE
  360. // TEST[skip:continued]
  361. <1> This script field uses regular expressions to replace white
  362. space with underscores.
  363. The preview {dfeed} API returns the following results, which show that
  364. "foo bar baz" has been converted to "foo_bar_baz":
  365. [source,js]
  366. ----------------------------------
  367. [
  368. {
  369. "@timestamp": 1490274000000,
  370. "my_script_field": "foo_bar_baz"
  371. }
  372. ]
  373. ----------------------------------
  374. [[ml-configuring-transform7]]
  375. .Example 7: Regular expression matching and concatenation
  376. [source,js]
  377. --------------------------------------------------
  378. POST _ml/datafeeds/datafeed-test2/_update
  379. {
  380. "script_fields": {
  381. "my_script_field": {
  382. "script": {
  383. "lang": "painless",
  384. "inline": "def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); return m.find() ? m.group(1) + '_' + m.group(2) : '';" <1>
  385. }
  386. }
  387. }
  388. }
  389. GET _ml/datafeeds/datafeed-test2/_preview
  390. --------------------------------------------------
  391. // CONSOLE
  392. // TEST[skip:continued]
  393. <1> This script field looks for a specific regular expression pattern and emits the
  394. matched groups as a concatenated string. If no match is found, it emits an empty
  395. string.
  396. The preview {dfeed} API returns the following results, which show that
  397. "foo-bar-19" has been converted to "foo_19":
  398. [source,js]
  399. ----------------------------------
  400. [
  401. {
  402. "@timestamp": 1490274000000,
  403. "my_script_field": "foo_19"
  404. }
  405. ]
  406. ----------------------------------
  407. [[ml-configuring-transform8]]
  408. .Example 8: Splitting strings by domain name
  409. [source,js]
  410. --------------------------------------------------
  411. PUT _ml/anomaly_detectors/test3
  412. {
  413. "description":"DNS tunneling",
  414. "analysis_config":{
  415. "bucket_span": "30m",
  416. "influencers": ["clientip","hrd"],
  417. "detectors":[
  418. {
  419. "function":"high_info_content",
  420. "field_name": "sub",
  421. "over_field_name": "hrd",
  422. "exclude_frequent":"all"
  423. }
  424. ]
  425. },
  426. "data_description": {
  427. "time_field":"@timestamp",
  428. "time_format":"epoch_ms"
  429. }
  430. }
  431. PUT _ml/datafeeds/datafeed-test3
  432. {
  433. "job_id": "test3",
  434. "indices": ["my_index"],
  435. "query": {
  436. "match_all": {
  437. "boost": 1
  438. }
  439. },
  440. "script_fields":{
  441. "sub":{
  442. "script":"return domainSplit(doc['query'].value).get(0);"
  443. },
  444. "hrd":{
  445. "script":"return domainSplit(doc['query'].value).get(1);"
  446. }
  447. }
  448. }
  449. GET _ml/datafeeds/datafeed-test3/_preview
  450. --------------------------------------------------
  451. // CONSOLE
  452. // TEST[skip:needs-licence]
  453. If you have a single field that contains a well-formed DNS domain name, you can
  454. use the `domainSplit()` function to split the string into its highest registered
  455. domain and the subdomain, which is everything to the left of the highest
  456. registered domain. For example, the highest registered domain of
  457. `www.ml.elastic.co` is `elastic.co` and the subdomain is `www.ml`. The
  458. `domainSplit()` function returns an array of two values: the first value is the
  459. subdomain; the second value is the highest registered domain.
  460. The preview {dfeed} API returns the following results, which show that
  461. "www.ml.elastic.co" has been split into "elastic.co" and "www.ml":
  462. [source,js]
  463. ----------------------------------
  464. [
  465. {
  466. "@timestamp": 1490274000000,
  467. "clientip.keyword": "123.456.78.900",
  468. "hrd": "elastic.co",
  469. "sub": "www.ml"
  470. }
  471. ]
  472. ----------------------------------
  473. [[ml-configuring-transform9]]
  474. .Example 9: Transforming geo_point data
  475. [source,js]
  476. --------------------------------------------------
  477. PUT _ml/anomaly_detectors/test4
  478. {
  479. "analysis_config":{
  480. "bucket_span": "10m",
  481. "detectors":[
  482. {
  483. "function":"lat_long",
  484. "field_name": "my_coordinates"
  485. }
  486. ]
  487. },
  488. "data_description": {
  489. "time_field":"@timestamp",
  490. "time_format":"epoch_ms"
  491. }
  492. }
  493. PUT _ml/datafeeds/datafeed-test4
  494. {
  495. "job_id": "test4",
  496. "indices": ["my_index"],
  497. "query": {
  498. "match_all": {
  499. "boost": 1
  500. }
  501. },
  502. "script_fields": {
  503. "my_coordinates": {
  504. "script": {
  505. "inline": "doc['coords.lat'].value + ',' + doc['coords.lon'].value",
  506. "lang": "painless"
  507. }
  508. }
  509. }
  510. }
  511. GET _ml/datafeeds/datafeed-test4/_preview
  512. --------------------------------------------------
  513. // CONSOLE
  514. // TEST[skip:needs-licence]
  515. In {es}, location data can be stored in `geo_point` fields, but this data type is
  516. not supported natively in {ml} analytics. This example of a script field
  517. transforms the data into an appropriate format. For more information,
  518. see <<ml-geo-functions>>.
  519. The preview {dfeed} API returns the following results, which show that
  520. `41.44` and `90.5` have been combined into "41.44,90.5":
  521. [source,js]
  522. ----------------------------------
  523. [
  524. {
  525. "@timestamp": 1490274000000,
  526. "my_coordinates": "41.44,90.5"
  527. }
  528. ]
  529. ----------------------------------