transforms.asciidoc 15 KB

  1. [role="xpack"]
  2. [[ml-configuring-transform]]
  3. === Transforming data with script fields
  4. If you use {dfeeds}, you can add scripts to transform your data before
  5. it is analyzed. {dfeeds-cap} contain an optional `script_fields` property, where
  6. you can specify scripts that evaluate custom expressions and return script
  7. fields.
  8. If your {dfeed} defines script fields, you can use those fields in your
  9. {anomaly-job}. For example, you can use the script fields in the analysis
  10. functions in one or more detectors.
  11. * <<ml-configuring-transform1>>
  12. * <<ml-configuring-transform2>>
  13. * <<ml-configuring-transform3>>
  14. * <<ml-configuring-transform4>>
  15. * <<ml-configuring-transform5>>
  16. * <<ml-configuring-transform6>>
  17. * <<ml-configuring-transform7>>
  18. * <<ml-configuring-transform8>>
  19. * <<ml-configuring-transform9>>
  20. The following index APIs create and add content to an index that is used in
  21. subsequent examples:
  22. [source,js]
  23. ----------------------------------
  24. PUT /my_index
  25. {
  26. "mappings":{
  27. "properties": {
  28. "@timestamp": {
  29. "type": "date"
  30. },
  31. "aborted_count": {
  32. "type": "long"
  33. },
  34. "another_field": {
  35. "type": "keyword" <1>
  36. },
  37. "clientip": {
  38. "type": "keyword"
  39. },
  40. "coords": {
  41. "properties": {
  42. "lat": {
  43. "type": "keyword"
  44. },
  45. "lon": {
  46. "type": "keyword"
  47. }
  48. }
  49. },
  50. "error_count": {
  51. "type": "long"
  52. },
  53. "query": {
  54. "type": "keyword"
  55. },
  56. "some_field": {
  57. "type": "keyword"
  58. },
  59. "tokenstring1":{
  60. "type":"keyword"
  61. },
  62. "tokenstring2":{
  63. "type":"keyword"
  64. },
  65. "tokenstring3":{
  66. "type":"keyword"
  67. }
  68. }
  69. }
  70. }
  71. PUT /my_index/_doc/1
  72. {
  73. "@timestamp":"2017-03-23T13:00:00",
  74. "error_count":36320,
  75. "aborted_count":4156,
  76. "some_field":"JOE",
  77. "another_field":"SMITH ",
  78. "tokenstring1":"foo-bar-baz",
  79. "tokenstring2":"foo bar baz",
  80. "tokenstring3":"foo-bar-19",
  81. "query":"www.ml.elastic.co",
  82. "clientip":"123.456.78.900",
  83. "coords": {
  84. "lat" : 41.44,
  85. "lon":90.5
  86. }
  87. }
  88. ----------------------------------
  89. // CONSOLE
  90. // TEST[skip:SETUP]
  91. <1> In this example, string fields are mapped as `keyword` fields to support
  92. aggregation. If you want both a full text (`text`) and a keyword (`keyword`)
  93. version of the same field, use multi-fields. For more information, see
  94. {ref}/multi-fields.html[fields].
  95. [[ml-configuring-transform1]]
  96. .Example 1: Adding two numerical fields
  97. [source,js]
  98. ----------------------------------
  99. PUT _ml/anomaly_detectors/test1
  100. {
  101. "analysis_config":{
  102. "bucket_span": "10m",
  103. "detectors":[
  104. {
  105. "function":"mean",
  106. "field_name": "total_error_count", <1>
  107. "detector_description": "Custom script field transformation"
  108. }
  109. ]
  110. },
  111. "data_description": {
  112. "time_field":"@timestamp",
  113. "time_format":"epoch_ms"
  114. }
  115. }
  116. PUT _ml/datafeeds/datafeed-test1
  117. {
  118. "job_id": "test1",
  119. "indices": ["my_index"],
  120. "query": {
  121. "match_all": {
  122. "boost": 1
  123. }
  124. },
  125. "script_fields": {
  126. "total_error_count": { <2>
  127. "script": {
  128. "lang": "expression",
  129. "source": "doc['error_count'].value + doc['aborted_count'].value"
  130. }
  131. }
  132. }
  133. }
  134. ----------------------------------
  135. // CONSOLE
  136. // TEST[skip:needs-licence]
  137. <1> A script field named `total_error_count` is referenced in the detector
  138. within the job.
  139. <2> The script field is defined in the {dfeed}.
  140. This `test1` {anomaly-job} contains a detector that uses a script field in a
  141. mean analysis function. The `datafeed-test1` {dfeed} defines the script field.
  142. It contains a script that adds two fields in the document to produce a "total"
  143. error count.
  144. The syntax for the `script_fields` property is identical to that used by {es}.
  145. For more information, see
  146. {ref}/search-request-body.html#request-body-search-script-fields[Script fields].
  147. You can preview the contents of the {dfeed} by using the following API:
  148. [source,js]
  149. ----------------------------------
  150. GET _ml/datafeeds/datafeed-test1/_preview
  151. ----------------------------------
  152. // CONSOLE
  153. // TEST[skip:continued]
  154. In this example, the API returns the following results, which contain a sum of
  155. the `error_count` and `aborted_count` values:
  156. [source,js]
  157. ----------------------------------
  158. [
  159. {
  160. "@timestamp": 1490274000000,
  161. "total_error_count": 40476
  162. }
  163. ]
  164. ----------------------------------
  165. NOTE: This example demonstrates how to use script fields, but it contains
  166. insufficient data to generate meaningful results.
  167. //For a full demonstration of
  168. //how to create jobs with sample data, see <<ml-getting-started>>.
  169. You can alternatively use {kib} to create an advanced {anomaly-job} that uses
  170. script fields. To add the `script_fields` property to your {dfeed}, you must use
  171. the **Edit JSON** tab. For example:
  172. [role="screenshot"]
  173. image::images/ml-scriptfields.jpg[Adding script fields to a {dfeed} in {kib}]
  174. [[ml-configuring-transform-examples]]
  175. ==== Common script field examples
  176. While the possibilities are limitless, there are a number of common scenarios
  177. where you might use script fields in your {dfeeds}.
  178. [NOTE]
  179. ===============================
  180. Some of these examples use regular expressions. By default, regular
  181. expressions are disabled because they circumvent the protection that Painless
  182. provides against long running and memory hungry scripts. For more information,
  183. see {ref}/modules-scripting-painless.html[Painless scripting language].
  184. Machine learning analysis is case sensitive. For example, "John" is considered
  185. to be different from "john". This is one reason you might consider using scripts
  186. that convert your strings to uppercase or lowercase letters.
  187. ===============================
  188. [[ml-configuring-transform2]]
  189. .Example 2: Concatenating strings
  190. [source,js]
  191. --------------------------------------------------
  192. PUT _ml/anomaly_detectors/test2
  193. {
  194. "analysis_config":{
  195. "bucket_span": "10m",
  196. "detectors":[
  197. {
  198. "function":"low_info_content",
  199. "field_name":"my_script_field", <1>
  200. "detector_description": "Custom script field transformation"
  201. }
  202. ]
  203. },
  204. "data_description": {
  205. "time_field":"@timestamp",
  206. "time_format":"epoch_ms"
  207. }
  208. }
  209. PUT _ml/datafeeds/datafeed-test2
  210. {
  211. "job_id": "test2",
  212. "indices": ["my_index"],
  213. "query": {
  214. "match_all": {
  215. "boost": 1
  216. }
  217. },
  218. "script_fields": {
  219. "my_script_field": {
  220. "script": {
  221. "lang": "painless",
  222. "source": "doc['some_field'].value + '_' + doc['another_field'].value" <2>
  223. }
  224. }
  225. }
  226. }
  227. GET _ml/datafeeds/datafeed-test2/_preview
  228. --------------------------------------------------
  229. // CONSOLE
  230. // TEST[skip:needs-licence]
  231. <1> The script field has a rather generic name in this case, since it will
  232. be used for various tests in the subsequent examples.
  233. <2> The script field uses the plus (+) operator to concatenate strings.
  234. The preview {dfeed} API returns the following results, which show that "JOE"
  235. and "SMITH " have been concatenated and an underscore was added:
  236. [source,js]
  237. ----------------------------------
  238. [
  239. {
  240. "@timestamp": 1490274000000,
  241. "my_script_field": "JOE_SMITH "
  242. }
  243. ]
  244. ----------------------------------
  245. [[ml-configuring-transform3]]
  246. .Example 3: Trimming strings
  247. [source,js]
  248. --------------------------------------------------
  249. POST _ml/datafeeds/datafeed-test2/_update
  250. {
  251. "script_fields": {
  252. "my_script_field": {
  253. "script": {
  254. "lang": "painless",
  255. "source": "doc['another_field'].value.trim()" <1>
  256. }
  257. }
  258. }
  259. }
  260. GET _ml/datafeeds/datafeed-test2/_preview
  261. --------------------------------------------------
  262. // CONSOLE
  263. // TEST[skip:continued]
  264. <1> This script field uses the `trim()` function to trim extra white space from a
  265. string.
  266. The preview {dfeed} API returns the following results, which show that "SMITH "
  267. has been trimmed to "SMITH":
  268. [source,js]
  269. ----------------------------------
  270. [
  271. {
  272. "@timestamp": 1490274000000,
  273. "my_script_field": "SMITH"
  274. }
  275. ]
  276. ----------------------------------
  277. [[ml-configuring-transform4]]
  278. .Example 4: Converting strings to lowercase
  279. [source,js]
  280. --------------------------------------------------
  281. POST _ml/datafeeds/datafeed-test2/_update
  282. {
  283. "script_fields": {
  284. "my_script_field": {
  285. "script": {
  286. "lang": "painless",
  287. "source": "doc['some_field'].value.toLowerCase()" <1>
  288. }
  289. }
  290. }
  291. }
  292. GET _ml/datafeeds/datafeed-test2/_preview
  293. --------------------------------------------------
  294. // CONSOLE
  295. // TEST[skip:continued]
  296. <1> This script field uses the `toLowerCase` function to convert a string to all
  297. lowercase letters. Likewise, you can use the `toUpperCase()` function to convert
  298. a string to uppercase letters.
  299. The preview {dfeed} API returns the following results, which show that "JOE"
  300. has been converted to "joe":
  301. [source,js]
  302. ----------------------------------
  303. [
  304. {
  305. "@timestamp": 1490274000000,
  306. "my_script_field": "joe"
  307. }
  308. ]
  309. ----------------------------------
  310. [[ml-configuring-transform5]]
  311. .Example 5: Converting strings to mixed case formats
  312. [source,js]
  313. --------------------------------------------------
  314. POST _ml/datafeeds/datafeed-test2/_update
  315. {
  316. "script_fields": {
  317. "my_script_field": {
  318. "script": {
  319. "lang": "painless",
  320. "source": "doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase()" <1>
  321. }
  322. }
  323. }
  324. }
  325. GET _ml/datafeeds/datafeed-test2/_preview
  326. --------------------------------------------------
  327. // CONSOLE
  328. // TEST[skip:continued]
  329. <1> This script field is a more complicated example of case manipulation. It uses
  330. the `substring()` function to capitalize the first letter of a string and
  331. convert the remaining characters to lowercase.
  332. The preview {dfeed} API returns the following results, which show that "JOE"
  333. has been converted to "Joe":
  334. [source,js]
  335. ----------------------------------
  336. [
  337. {
  338. "@timestamp": 1490274000000,
  339. "my_script_field": "Joe"
  340. }
  341. ]
  342. ----------------------------------
  343. [[ml-configuring-transform6]]
  344. .Example 6: Replacing tokens
  345. [source,js]
  346. --------------------------------------------------
  347. POST _ml/datafeeds/datafeed-test2/_update
  348. {
  349. "script_fields": {
  350. "my_script_field": {
  351. "script": {
  352. "lang": "painless",
  353. "source": "/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_')" <1>
  354. }
  355. }
  356. }
  357. }
  358. GET _ml/datafeeds/datafeed-test2/_preview
  359. --------------------------------------------------
  360. // CONSOLE
  361. // TEST[skip:continued]
  362. <1> This script field uses regular expressions to replace white
  363. space with underscores.
  364. The preview {dfeed} API returns the following results, which show that
  365. "foo bar baz" has been converted to "foo_bar_baz":
  366. [source,js]
  367. ----------------------------------
  368. [
  369. {
  370. "@timestamp": 1490274000000,
  371. "my_script_field": "foo_bar_baz"
  372. }
  373. ]
  374. ----------------------------------
  375. [[ml-configuring-transform7]]
  376. .Example 7: Regular expression matching and concatenation
  377. [source,js]
  378. --------------------------------------------------
  379. POST _ml/datafeeds/datafeed-test2/_update
  380. {
  381. "script_fields": {
  382. "my_script_field": {
  383. "script": {
  384. "lang": "painless",
  385. "source": "def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); return m.find() ? m.group(1) + '_' + m.group(2) : '';" <1>
  386. }
  387. }
  388. }
  389. }
  390. GET _ml/datafeeds/datafeed-test2/_preview
  391. --------------------------------------------------
  392. // CONSOLE
  393. // TEST[skip:continued]
  394. <1> This script field looks for a specific regular expression pattern and emits the
  395. matched groups as a concatenated string. If no match is found, it emits an empty
  396. string.
  397. The preview {dfeed} API returns the following results, which show that
  398. "foo-bar-19" has been converted to "foo_19":
  399. [source,js]
  400. ----------------------------------
  401. [
  402. {
  403. "@timestamp": 1490274000000,
  404. "my_script_field": "foo_19"
  405. }
  406. ]
  407. ----------------------------------
  408. [[ml-configuring-transform8]]
  409. .Example 8: Splitting strings by domain name
  410. [source,js]
  411. --------------------------------------------------
  412. PUT _ml/anomaly_detectors/test3
  413. {
  414. "description":"DNS tunneling",
  415. "analysis_config":{
  416. "bucket_span": "30m",
  417. "influencers": ["clientip","hrd"],
  418. "detectors":[
  419. {
  420. "function":"high_info_content",
  421. "field_name": "sub",
  422. "over_field_name": "hrd",
  423. "exclude_frequent":"all"
  424. }
  425. ]
  426. },
  427. "data_description": {
  428. "time_field":"@timestamp",
  429. "time_format":"epoch_ms"
  430. }
  431. }
  432. PUT _ml/datafeeds/datafeed-test3
  433. {
  434. "job_id": "test3",
  435. "indices": ["my_index"],
  436. "query": {
  437. "match_all": {
  438. "boost": 1
  439. }
  440. },
  441. "script_fields":{
  442. "sub":{
  443. "script":"return domainSplit(doc['query'].value).get(0);"
  444. },
  445. "hrd":{
  446. "script":"return domainSplit(doc['query'].value).get(1);"
  447. }
  448. }
  449. }
  450. GET _ml/datafeeds/datafeed-test3/_preview
  451. --------------------------------------------------
  452. // CONSOLE
  453. // TEST[skip:needs-licence]
  454. If you have a single field that contains a well-formed DNS domain name, you can
  455. use the `domainSplit()` function to split the string into its highest registered
  456. domain and the sub-domain, which is everything to the left of the highest
  457. registered domain. For example, the highest registered domain of
  458. `www.ml.elastic.co` is `elastic.co` and the sub-domain is `www.ml`. The
  459. `domainSplit()` function returns an array of two values: the first value is the
  460. subdomain; the second value is the highest registered domain.
  461. The preview {dfeed} API returns the following results, which show that
  462. "www.ml.elastic.co" has been split into "elastic.co" and "www.ml":
  463. [source,js]
  464. ----------------------------------
  465. [
  466. {
  467. "@timestamp": 1490274000000,
  468. "clientip.keyword": "123.456.78.900",
  469. "hrd": "elastic.co",
  470. "sub": "www.ml"
  471. }
  472. ]
  473. ----------------------------------
  474. [[ml-configuring-transform9]]
  475. .Example 9: Transforming geo_point data
  476. [source,js]
  477. --------------------------------------------------
  478. PUT _ml/anomaly_detectors/test4
  479. {
  480. "analysis_config":{
  481. "bucket_span": "10m",
  482. "detectors":[
  483. {
  484. "function":"lat_long",
  485. "field_name": "my_coordinates"
  486. }
  487. ]
  488. },
  489. "data_description": {
  490. "time_field":"@timestamp",
  491. "time_format":"epoch_ms"
  492. }
  493. }
  494. PUT _ml/datafeeds/datafeed-test4
  495. {
  496. "job_id": "test4",
  497. "indices": ["my_index"],
  498. "query": {
  499. "match_all": {
  500. "boost": 1
  501. }
  502. },
  503. "script_fields": {
  504. "my_coordinates": {
  505. "script": {
  506. "source": "doc['coords.lat'].value + ',' + doc['coords.lon'].value",
  507. "lang": "painless"
  508. }
  509. }
  510. }
  511. }
  512. GET _ml/datafeeds/datafeed-test4/_preview
  513. --------------------------------------------------
  514. // CONSOLE
  515. // TEST[skip:needs-licence]
  516. In {es}, location data can be stored in `geo_point` fields but this data type is
  517. not supported natively in {ml} analytics. This example of a script field
  518. transforms the data into an appropriate format. For more information,
  519. see <<ml-geo-functions>>.
  520. The preview {dfeed} API returns the following results, which show that
  521. `41.44` and `90.5` have been combined into "41.44,90.5":
  522. [source,js]
  523. ----------------------------------
  524. [
  525. {
  526. "@timestamp": 1490274000000,
  527. "my_coordinates": "41.44,90.5"
  528. }
  529. ]
  530. ----------------------------------