[role="xpack"]
[[ml-configuring-transform]]
=== Transforming data with script fields

If you use {dfeeds}, you can add scripts to transform your data before
it is analyzed. {dfeeds-cap} contain an optional `script_fields` property, where
you can specify scripts that evaluate custom expressions and return script
fields.

If your {dfeed} defines script fields, you can use those fields in your job.
For example, you can use the script fields in the analysis functions in one or
more detectors.

* <<ml-configuring-transform1>>
* <<ml-configuring-transform2>>
* <<ml-configuring-transform3>>
* <<ml-configuring-transform4>>
* <<ml-configuring-transform5>>
* <<ml-configuring-transform6>>
* <<ml-configuring-transform7>>
* <<ml-configuring-transform8>>
* <<ml-configuring-transform9>>

The following indices APIs create and add content to an index that is used in
subsequent examples:

[source,js]
----------------------------------
PUT /my_index
{
  "mappings":{
    "_doc":{
      "properties": {
        "@timestamp": {
          "type": "date"
        },
        "aborted_count": {
          "type": "long"
        },
        "another_field": {
          "type": "keyword" <1>
        },
        "clientip": {
          "type": "keyword"
        },
        "coords": {
          "properties": {
            "lat": {
              "type": "keyword"
            },
            "lon": {
              "type": "keyword"
            }
          }
        },
        "error_count": {
          "type": "long"
        },
        "query": {
          "type": "keyword"
        },
        "some_field": {
          "type": "keyword"
        },
        "tokenstring1":{
          "type":"keyword"
        },
        "tokenstring2":{
          "type":"keyword"
        },
        "tokenstring3":{
          "type":"keyword"
        }
      }
    }
  }
}

PUT /my_index/_doc/1
{
  "@timestamp":"2017-03-23T13:00:00",
  "error_count":36320,
  "aborted_count":4156,
  "some_field":"JOE",
  "another_field":"SMITH ",
  "tokenstring1":"foo-bar-baz",
  "tokenstring2":"foo bar baz",
  "tokenstring3":"foo-bar-19",
  "query":"www.ml.elastic.co",
  "clientip":"123.456.78.900",
  "coords": {
    "lat" : 41.44,
    "lon":90.5
  }
}
----------------------------------
// CONSOLE
// TEST[skip:SETUP]
<1> In this example, string fields are mapped as `keyword` fields to support
aggregation. If you want both a full text (`text`) and a keyword (`keyword`)
version of the same field, use multi-fields. For more information, see
{ref}/multi-fields.html[fields].
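
For reference, a multi-field version of one of these mappings might look like
the following sketch. The index name `my_index_with_multi_fields` is
hypothetical and is not used by the examples in this section; with this mapping,
aggregations and script fields reference `another_field.keyword`, while full
text queries use `another_field`:

[source,js]
----------------------------------
PUT /my_index_with_multi_fields
{
  "mappings": {
    "_doc": {
      "properties": {
        "another_field": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        }
      }
    }
  }
}
----------------------------------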

[[ml-configuring-transform1]]
.Example 1: Adding two numerical fields
[source,js]
----------------------------------
PUT _xpack/ml/anomaly_detectors/test1
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"mean",
        "field_name": "total_error_count", <1>
        "detector_description": "Custom script field transformation"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _xpack/ml/datafeeds/datafeed-test1
{
  "job_id": "test1",
  "indices": ["my_index"],
  "types": ["_doc"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "total_error_count": { <2>
      "script": {
        "lang": "expression",
        "inline": "doc['error_count'].value + doc['aborted_count'].value"
      }
    }
  }
}
----------------------------------
// CONSOLE
// TEST[skip:needs-licence]
<1> A script field named `total_error_count` is referenced in the detector
within the job.
<2> The script field is defined in the {dfeed}.

This `test1` job contains a detector that uses a script field in a mean analysis
function. The `datafeed-test1` {dfeed} defines the script field. It contains a
script that adds two fields in the document to produce a "total" error count.

The syntax for the `script_fields` property is identical to that used by {es}.
For more information, see {ref}/search-request-script-fields.html[Script Fields].
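
Because the syntax is shared, you can try out a script field in an ordinary
search request before adding it to a {dfeed}. The following sketch evaluates the
same `total_error_count` script against `my_index` directly; it is shown for
comparison only and is not required by the examples:

[source,js]
----------------------------------
GET /my_index/_search
{
  "query": {
    "match_all": {}
  },
  "script_fields": {
    "total_error_count": {
      "script": {
        "lang": "expression",
        "inline": "doc['error_count'].value + doc['aborted_count'].value"
      }
    }
  }
}
----------------------------------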

You can preview the contents of the {dfeed} by using the following API:

[source,js]
----------------------------------
GET _xpack/ml/datafeeds/datafeed-test1/_preview
----------------------------------
// CONSOLE
// TEST[skip:continued]

In this example, the API returns the following results, which contain a sum of
the `error_count` and `aborted_count` values:

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "total_error_count": 40476
  }
]
----------------------------------

NOTE: This example demonstrates how to use script fields, but it contains
insufficient data to generate meaningful results. For a full demonstration of
how to create jobs with sample data, see <<ml-getting-started>>.

You can alternatively use {kib} to create an advanced job that uses script
fields. To add the `script_fields` property to your {dfeed}, you must use the
**Edit JSON** tab. For example:

[role="screenshot"]
image::images/ml-scriptfields.jpg[Adding script fields to a {dfeed} in {kib}]

[[ml-configuring-transform-examples]]
==== Common script field examples

While the possibilities are limitless, there are a number of common scenarios
where you might use script fields in your {dfeeds}.

[NOTE]
===============================
Some of these examples use regular expressions. By default, regular
expressions are disabled because they circumvent the protection that Painless
provides against long running and memory hungry scripts. To use them, you must
explicitly enable regular expressions in Painless by setting
`script.painless.regex.enabled: true` in `elasticsearch.yml`. For more
information, see {ref}/modules-scripting-painless.html[Painless Scripting Language].

Machine learning analysis is case sensitive. For example, "John" is considered
to be different from "john". This is one reason you might consider using scripts
that convert your strings to upper or lowercase letters.
===============================

[[ml-configuring-transform2]]
.Example 2: Concatenating strings
[source,js]
--------------------------------------------------
PUT _xpack/ml/anomaly_detectors/test2
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"low_info_content",
        "field_name":"my_script_field", <1>
        "detector_description": "Custom script field transformation"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _xpack/ml/datafeeds/datafeed-test2
{
  "job_id": "test2",
  "indices": ["my_index"],
  "types": ["_doc"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "inline": "doc['some_field'].value + '_' + doc['another_field'].value" <2>
      }
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:needs-licence]
<1> The script field has a deliberately generic name because it is reused in
the subsequent examples.
<2> The script field uses the plus (+) operator to concatenate strings.

The preview {dfeed} API returns the following results, which show that "JOE"
and "SMITH " have been concatenated and an underscore was added:

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "JOE_SMITH "
  }
]
----------------------------------

[[ml-configuring-transform3]]
.Example 3: Trimming strings
[source,js]
--------------------------------------------------
POST _xpack/ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "inline": "doc['another_field'].value.trim()" <1>
      }
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field uses the `trim()` function to remove leading and trailing
white space from a string.

The preview {dfeed} API returns the following results, which show that "SMITH "
has been trimmed to "SMITH":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "SMITH"
  }
]
----------------------------------

[[ml-configuring-transform4]]
.Example 4: Converting strings to lowercase
[source,js]
--------------------------------------------------
POST _xpack/ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "inline": "doc['some_field'].value.toLowerCase()" <1>
      }
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field uses the `toLowerCase()` function to convert a string to
all lowercase letters. Likewise, you can use the `toUpperCase()` function to
convert a string to uppercase letters (see the sketch after the preview
results).

The preview {dfeed} API returns the following results, which show that "JOE"
has been converted to "joe":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "joe"
  }
]
----------------------------------
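
The uppercase variant is a one-line change to the same update. This sketch is
illustrative only and is not required by the later examples, each of which
replaces the script again:

[source,js]
--------------------------------------------------
POST _xpack/ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "inline": "doc['some_field'].value.toUpperCase()"
      }
    }
  }
}
--------------------------------------------------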

[[ml-configuring-transform5]]
.Example 5: Converting strings to mixed case formats
[source,js]
--------------------------------------------------
POST _xpack/ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "inline": "doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase()" <1>
      }
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field is a more complicated example of case manipulation. It
uses the `substring()` function to extract the first letter of the string,
capitalizes it, and converts the remaining characters to lowercase.

The preview {dfeed} API returns the following results, which show that "JOE"
has been converted to "Joe":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "Joe"
  }
]
----------------------------------

[[ml-configuring-transform6]]
.Example 6: Replacing tokens
[source,js]
--------------------------------------------------
POST _xpack/ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "inline": "/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_')" <1>
      }
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field uses regular expressions to replace white space with
underscores.

The preview {dfeed} API returns the following results, which show that
"foo bar baz" has been converted to "foo_bar_baz":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "foo_bar_baz"
  }
]
----------------------------------

[[ml-configuring-transform7]]
.Example 7: Regular expression matching and concatenation
[source,js]
--------------------------------------------------
POST _xpack/ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "inline": "def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); return m.find() ? m.group(1) + '_' + m.group(2) : '';" <1>
      }
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:continued]
<1> This script field looks for a specific regular expression pattern and emits
the matched groups as a concatenated string. If no match is found, it emits an
empty string.

The preview {dfeed} API returns the following results, which show that
"foo-bar-19" has been converted to "foo_19":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "foo_19"
  }
]
----------------------------------

[[ml-configuring-transform8]]
.Example 8: Splitting strings by domain name
[source,js]
--------------------------------------------------
PUT _xpack/ml/anomaly_detectors/test3
{
  "description":"DNS tunneling",
  "analysis_config":{
    "bucket_span": "30m",
    "influencers": ["clientip","hrd"],
    "detectors":[
      {
        "function":"high_info_content",
        "field_name": "sub",
        "over_field_name": "hrd",
        "exclude_frequent":"all"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _xpack/ml/datafeeds/datafeed-test3
{
  "job_id": "test3",
  "indices": ["my_index"],
  "types": ["_doc"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields":{
    "sub":{
      "script":"return domainSplit(doc['query'].value).get(0);"
    },
    "hrd":{
      "script":"return domainSplit(doc['query'].value).get(1);"
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test3/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:needs-licence]

If you have a single field that contains a well-formed DNS domain name, you can
use the `domainSplit()` function to split the string into its highest registered
domain and the subdomain, which is everything to the left of the highest
registered domain. For example, the highest registered domain of
`www.ml.elastic.co` is `elastic.co` and the subdomain is `www.ml`. The
`domainSplit()` function returns an array of two values: the first value is the
subdomain; the second value is the highest registered domain.

The preview {dfeed} API returns the following results, which show that
"www.ml.elastic.co" has been split into "elastic.co" and "www.ml":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "clientip.keyword": "123.456.78.900",
    "hrd": "elastic.co",
    "sub": "www.ml"
  }
]
----------------------------------

[[ml-configuring-transform9]]
.Example 9: Transforming geo_point data
[source,js]
--------------------------------------------------
PUT _xpack/ml/anomaly_detectors/test4
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"lat_long",
        "field_name": "my_coordinates"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _xpack/ml/datafeeds/datafeed-test4
{
  "job_id": "test4",
  "indices": ["my_index"],
  "types": ["_doc"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "my_coordinates": {
      "script": {
        "inline": "doc['coords.lat'].value + ',' + doc['coords.lon'].value",
        "lang": "painless"
      }
    }
  }
}

GET _xpack/ml/datafeeds/datafeed-test4/_preview
--------------------------------------------------
// CONSOLE
// TEST[skip:needs-licence]

In {es}, location data can be stored in `geo_point` fields but this data type is
not supported natively in {xpackml} analytics. In this example, a script field
concatenates the latitude and longitude into the comma-delimited string format
that the `lat_long` function expects. For more information,
see <<ml-geo-functions>>.

The preview {dfeed} API returns the following results, which show that
`41.44` and `90.5` have been combined into "41.44,90.5":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_coordinates": "41.44,90.5"
  }
]
----------------------------------
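
In this index, the latitude and longitude are stored as separate `keyword`
fields. If your location data is instead mapped as a single `geo_point` field,
you can usually build the same comma-delimited string from the point's doc-value
accessors. The following fragment is a sketch only: the `my_geo_field` name is
hypothetical, it would replace the `script_fields` section of `datafeed-test4`,
and the exact accessors available to Painless can vary by {es} version:

[source,js]
----------------------------------
"script_fields": {
  "my_coordinates": {
    "script": {
      "inline": "doc['my_geo_field'].lat + ',' + doc['my_geo_field'].lon",
      "lang": "painless"
    }
  }
}
----------------------------------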