[role="xpack"]
[[ml-configuring-transform]]
=== Transforming data with script fields

If you use {dfeeds}, you can add scripts to transform your data before
it is analyzed. {dfeeds-cap} contain an optional `script_fields` property, where
you can specify scripts that evaluate custom expressions and return script
fields.

If your {dfeed} defines script fields, you can use those fields in your job.
For example, you can use the script fields in the analysis functions in one or
more detectors.

* <<ml-configuring-transform1>>
* <<ml-configuring-transform2>>
* <<ml-configuring-transform3>>
* <<ml-configuring-transform4>>
* <<ml-configuring-transform5>>
* <<ml-configuring-transform6>>
* <<ml-configuring-transform7>>
* <<ml-configuring-transform8>>
* <<ml-configuring-transform9>>

The following indices APIs create and add content to an index that is used in
subsequent examples:

[source,js]
----------------------------------
PUT /my_index
{
  "mappings":{
    "properties": {
      "@timestamp": {
        "type": "date"
      },
      "aborted_count": {
        "type": "long"
      },
      "another_field": {
        "type": "keyword" <1>
      },
      "clientip": {
        "type": "keyword"
      },
      "coords": {
        "properties": {
          "lat": {
            "type": "keyword"
          },
          "lon": {
            "type": "keyword"
          }
        }
      },
      "error_count": {
        "type": "long"
      },
      "query": {
        "type": "keyword"
      },
      "some_field": {
        "type": "keyword"
      },
      "tokenstring1":{
        "type":"keyword"
      },
      "tokenstring2":{
        "type":"keyword"
      },
      "tokenstring3":{
        "type":"keyword"
      }
    }
  }
}

PUT /my_index/_doc/1
{
  "@timestamp":"2017-03-23T13:00:00",
  "error_count":36320,
  "aborted_count":4156,
  "some_field":"JOE",
  "another_field":"SMITH ",
  "tokenstring1":"foo-bar-baz",
  "tokenstring2":"foo bar baz",
  "tokenstring3":"foo-bar-19",
  "query":"www.ml.elastic.co",
  "clientip":"123.456.78.900",
  "coords": {
    "lat" : 41.44,
    "lon" : 90.5
  }
}
----------------------------------
// CONSOLE
// TEST[skip:SETUP]
<1> In this example, string fields are mapped as `keyword` fields to support
aggregation. If you want both a full text (`text`) and a keyword (`keyword`)
version of the same field, use multi-fields. For more information, see
{ref}/multi-fields.html[fields].
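
For example, a multi-field mapping for a hypothetical `message` field might
look like the following sketch. The index and field names here are
illustrative only and are not used elsewhere on this page; the point is that
the same value is indexed both as analyzed text and as an exact keyword:

[source,js]
----------------------------------
PUT /my_other_index
{
  "mappings": {
    "properties": {
      "message": {
        "type": "text", <1>
        "fields": {
          "keyword": {
            "type": "keyword" <2>
          }
        }
      }
    }
  }
}
----------------------------------
<1> The top-level `message` field is analyzed for full-text search.
<2> The `message.keyword` sub-field keeps the original string as a single
token, which is the form that aggregations and script fields can work with.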

[[ml-configuring-transform1]]
.Example 1: Adding two numerical fields
[source,js]
----------------------------------
PUT _ml/anomaly_detectors/test1
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"mean",
        "field_name": "total_error_count", <1>
        "detector_description": "Custom script field transformation"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test1
{
  "job_id": "test1",
  "indices": ["my_index"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "total_error_count": { <2>
      "script": {
        "lang": "expression",
        "source": "doc['error_count'].value + doc['aborted_count'].value"
      }
    }
  }
}
----------------------------------
// CONSOLE
// TEST[skip:needs-licence]
<1> A script field named `total_error_count` is referenced in the detector
within the job.
<2> The script field is defined in the {dfeed}.

This `test1` job contains a detector that uses a script field in a mean analysis
function. The `datafeed-test1` {dfeed} defines the script field. It contains a
script that adds two fields in the document to produce a "total" error count.

The syntax for the `script_fields` property is identical to that used by {es}.
For more information, see
{ref}/search-request-body.html#request-body-search-script-fields[Script Fields].

You can preview the contents of the {dfeed} by using the following API:

[source,js]
----------------------------------
GET _ml/datafeeds/datafeed-test1/_preview
----------------------------------
// CONSOLE
// TEST[skip:continued]

In this example, the API returns the following results, which contain a sum of
the `error_count` and `aborted_count` values:

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "total_error_count": 40476
  }
]
----------------------------------

NOTE: This example demonstrates how to use script fields, but it contains
insufficient data to generate meaningful results.

//For a full demonstration of
//how to create jobs with sample data, see <<ml-getting-started>>.

You can alternatively use {kib} to create an advanced job that uses script
fields. To add the `script_fields` property to your {dfeed}, you must use the
**Edit JSON** tab. For example:

[role="screenshot"]
image::images/ml-scriptfields.jpg[Adding script fields to a {dfeed} in {kib}]

[[ml-configuring-transform-examples]]
==== Common script field examples

While the possibilities are limitless, there are a number of common scenarios
where you might use script fields in your {dfeeds}.

[NOTE]
===============================
Some of these examples use regular expressions. By default, regular
expressions are disabled because they circumvent the protection that Painless
provides against long-running and memory-hungry scripts. For more information,
see {ref}/modules-scripting-painless.html[Painless Scripting Language]. To run
the examples that use regular expressions, you must first enable them, as
shown in the sketch after this note.

Machine learning analysis is case sensitive. For example, "John" is considered
to be different from "john". This is one reason you might consider using scripts
that convert your strings to uppercase or lowercase letters.
===============================
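
One way to enable regular expressions is the node-level Painless setting
sketched below. This is a minimal sketch: it assumes a self-managed cluster
where you can edit `elasticsearch.yml` on each node and restart it; check the
Painless documentation linked above for the options available in your version.

[source,yaml]
--------------------------------------------------
# In elasticsearch.yml on each node (static setting, restart required):
script.painless.regex.enabled: true
--------------------------------------------------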

[[ml-configuring-transform2]]
.Example 2: Concatenating strings
[source,js]
--------------------------------------------------
PUT _ml/anomaly_detectors/test2
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"low_info_content",
        "field_name":"my_script_field", <1>
        "detector_description": "Custom script field transformation"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test2
{
  "job_id": "test2",
  "indices": ["my_index"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['some_field'].value + '_' + doc['another_field'].value" <2>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:needs-licence]
<1> The script field has a deliberately generic name, because it is reused for
various tests in the subsequent examples.
<2> The script field uses the plus (+) operator to concatenate strings.

The preview {dfeed} API returns the following results, which show that "JOE"
and "SMITH " have been concatenated and an underscore was added:

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "JOE_SMITH "
  }
]
----------------------------------

[[ml-configuring-transform3]]
.Example 3: Trimming strings
[source,js]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['another_field'].value.trim()" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field uses the `trim()` function to trim extra white space from a
string.

The preview {dfeed} API returns the following results, which show that "SMITH "
has been trimmed to "SMITH":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "SMITH"
  }
]
----------------------------------

[[ml-configuring-transform4]]
.Example 4: Converting strings to lowercase
[source,js]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['some_field'].value.toLowerCase()" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field uses the `toLowerCase()` function to convert a string to all
lowercase letters. Likewise, you can use the `toUpperCase()` function to convert
a string to all uppercase letters.

The preview {dfeed} API returns the following results, which show that "JOE"
has been converted to "joe":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "joe"
  }
]
----------------------------------

[[ml-configuring-transform5]]
.Example 5: Converting strings to mixed case formats
[source,js]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase()" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field is a more complicated example of case manipulation. It uses
the `substring()` function to capitalize the first letter of a string and
converts the remaining characters to lowercase.

The preview {dfeed} API returns the following results, which show that "JOE"
has been converted to "Joe":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "Joe"
  }
]
----------------------------------

[[ml-configuring-transform6]]
.Example 6: Replacing tokens
[source,js]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_')" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field uses regular expressions to replace white
space with underscores.

The preview {dfeed} API returns the following results, which show that
"foo bar baz" has been converted to "foo_bar_baz":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "foo_bar_baz"
  }
]
----------------------------------

[[ml-configuring-transform7]]
.Example 7: Regular expression matching and concatenation
[source,js]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); return m.find() ? m.group(1) + '_' + m.group(2) : '';" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field looks for a specific regular expression pattern and emits the
matched groups as a concatenated string. If no match is found, it emits an empty
string.

The preview {dfeed} API returns the following results, which show that
"foo-bar-19" has been converted to "foo_19":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "foo_19"
  }
]
----------------------------------

[[ml-configuring-transform8]]
.Example 8: Splitting strings by domain name
[source,js]
--------------------------------------------------
PUT _ml/anomaly_detectors/test3
{
  "description":"DNS tunneling",
  "analysis_config":{
    "bucket_span": "30m",
    "influencers": ["clientip","hrd"],
    "detectors":[
      {
        "function":"high_info_content",
        "field_name": "sub",
        "over_field_name": "hrd",
        "exclude_frequent":"all"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test3
{
  "job_id": "test3",
  "indices": ["my_index"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields":{
    "sub":{
      "script":"return domainSplit(doc['query'].value).get(0);"
    },
    "hrd":{
      "script":"return domainSplit(doc['query'].value).get(1);"
    }
  }
}

GET _ml/datafeeds/datafeed-test3/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:needs-licence]

If you have a single field that contains a well-formed DNS domain name, you can
use the `domainSplit()` function to split the string into its highest registered
domain and the subdomain, which is everything to the left of the highest
registered domain. For example, the highest registered domain of
`www.ml.elastic.co` is `elastic.co` and the subdomain is `www.ml`. The
`domainSplit()` function returns an array of two values: the first value is the
subdomain; the second value is the highest registered domain.

The preview {dfeed} API returns the following results, which show that
"www.ml.elastic.co" has been split into "elastic.co" and "www.ml":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "clientip.keyword": "123.456.78.900",
    "hrd": "elastic.co",
    "sub": "www.ml"
  }
]
----------------------------------

[[ml-configuring-transform9]]
.Example 9: Transforming geo_point data
[source,js]
--------------------------------------------------
PUT _ml/anomaly_detectors/test4
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"lat_long",
        "field_name": "my_coordinates"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test4
{
  "job_id": "test4",
  "indices": ["my_index"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "my_coordinates": {
      "script": {
        "source": "doc['coords.lat'].value + ',' + doc['coords.lon'].value",
        "lang": "painless"
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test4/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:needs-licence]

In {es}, location data can be stored in `geo_point` fields, but this data type is
not supported natively in {ml} analytics. This example uses a script field to
transform the data into an appropriate format. For more information,
see <<ml-geo-functions>>.

The preview {dfeed} API returns the following results, which show that
`41.44` and `90.5` have been combined into "41.44,90.5":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_coordinates": "41.44,90.5"
  }
]
----------------------------------