[role="xpack"]
[[ml-configuring-transform]]
= Transforming data with script fields

If you use {dfeeds}, you can add scripts to transform your data before
it is analyzed. {dfeeds-cap} contain an optional `script_fields` property, where
you can specify scripts that evaluate custom expressions and return script
fields.

If your {dfeed} defines script fields, you can use those fields in your
{anomaly-job}. For example, you can use the script fields in the analysis
functions in one or more detectors.

* <<ml-configuring-transform1>>
* <<ml-configuring-transform2>>
* <<ml-configuring-transform3>>
* <<ml-configuring-transform4>>
* <<ml-configuring-transform5>>
* <<ml-configuring-transform6>>
* <<ml-configuring-transform7>>
* <<ml-configuring-transform8>>
* <<ml-configuring-transform9>>

The following index APIs create and add content to an index that is used in
subsequent examples:
[source,console]
----------------------------------
PUT /my-index-000001
{
  "mappings":{
    "properties": {
      "@timestamp": {
        "type": "date"
      },
      "aborted_count": {
        "type": "long"
      },
      "another_field": {
        "type": "keyword" <1>
      },
      "clientip": {
        "type": "keyword"
      },
      "coords": {
        "properties": {
          "lat": {
            "type": "keyword"
          },
          "lon": {
            "type": "keyword"
          }
        }
      },
      "error_count": {
        "type": "long"
      },
      "query": {
        "type": "keyword"
      },
      "some_field": {
        "type": "keyword"
      },
      "tokenstring1":{
        "type":"keyword"
      },
      "tokenstring2":{
        "type":"keyword"
      },
      "tokenstring3":{
        "type":"keyword"
      }
    }
  }
}

PUT /my-index-000001/_doc/1
{
  "@timestamp":"2017-03-23T13:00:00",
  "error_count":36320,
  "aborted_count":4156,
  "some_field":"JOE",
  "another_field":"SMITH ",
  "tokenstring1":"foo-bar-baz",
  "tokenstring2":"foo bar baz",
  "tokenstring3":"foo-bar-19",
  "query":"www.ml.elastic.co",
  "clientip":"123.456.78.900",
  "coords": {
    "lat" : 41.44,
    "lon":90.5
  }
}
----------------------------------
// TEST[skip:SETUP]
<1> In this example, string fields are mapped as `keyword` fields to support
aggregation. If you want both a full text (`text`) and a keyword (`keyword`)
version of the same field, use multi-fields. For more information, see
{ref}/multi-fields.html[fields].
[[ml-configuring-transform1]]
.Example 1: Adding two numerical fields
[source,console]
----------------------------------
PUT _ml/anomaly_detectors/test1
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"mean",
        "field_name": "total_error_count", <1>
        "detector_description": "Custom script field transformation"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test1
{
  "job_id": "test1",
  "indices": ["my-index-000001"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "total_error_count": { <2>
      "script": {
        "lang": "expression",
        "source": "doc['error_count'].value + doc['aborted_count'].value"
      }
    }
  }
}
----------------------------------
// TEST[skip:needs-licence]
<1> A script field named `total_error_count` is referenced in the detector
within the job.
<2> The script field is defined in the {dfeed}.

This `test1` {anomaly-job} contains a detector that uses a script field in a
mean analysis function. The `datafeed-test1` {dfeed} defines the script field.
It contains a script that adds two fields in the document to produce a "total"
error count.

The syntax for the `script_fields` property is identical to that used by {es}.
For more information, see
{ref}/search-fields.html#script-fields[Script fields].

You can preview the contents of the {dfeed} by using the following API:

[source,console]
----------------------------------
GET _ml/datafeeds/datafeed-test1/_preview
----------------------------------
// TEST[skip:continued]

In this example, the API returns the following results, which contain a sum of
the `error_count` and `aborted_count` values:

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "total_error_count": 40476
  }
]
----------------------------------

NOTE: This example demonstrates how to use script fields, but it contains
insufficient data to generate meaningful results.

//For a full demonstration of
//how to create jobs with sample data, see <<ml-getting-started>>.

You can alternatively use {kib} to create an advanced {anomaly-job} that uses
script fields. To add the `script_fields` property to your {dfeed}, you must use
the **Edit JSON** tab. For example:

[role="screenshot"]
image::images/ml-scriptfields.jpg[Adding script fields to a {dfeed} in {kib}]
[[ml-configuring-transform-examples]]
== Common script field examples

While the possibilities are limitless, there are a number of common scenarios
where you might use script fields in your {dfeeds}.

[NOTE]
===============================
Some of these examples use regular expressions. By default, regular
expressions are disabled because they circumvent the protection that Painless
provides against long running and memory hungry scripts. For more information,
see {ref}/modules-scripting-painless.html[Painless scripting language].

Machine learning analysis is case sensitive. For example, "John" is considered
to be different than "john". This is one reason you might consider using scripts
that convert your strings to upper or lowercase letters.
===============================

[[ml-configuring-transform2]]
.Example 2: Concatenating strings
[source,console]
--------------------------------------------------
PUT _ml/anomaly_detectors/test2
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"low_info_content",
        "field_name":"my_script_field", <1>
        "detector_description": "Custom script field transformation"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test2
{
  "job_id": "test2",
  "indices": ["my-index-000001"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['some_field'].value + '_' + doc['another_field'].value" <2>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// TEST[skip:needs-licence]
<1> The script field has a rather generic name in this case, since it will
be used for various tests in the subsequent examples.
<2> The script field uses the plus (+) operator to concatenate strings.

The preview {dfeed} API returns the following results, which show that "JOE"
and "SMITH " have been concatenated and an underscore was added:

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "JOE_SMITH "
  }
]
----------------------------------
[[ml-configuring-transform3]]
.Example 3: Trimming strings
[source,console]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['another_field'].value.trim()" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// TEST[skip:continued]
<1> This script field uses the `trim()` function to trim extra white space from a
string.

The preview {dfeed} API returns the following results, which show that "SMITH "
has been trimmed to "SMITH":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "SMITH"
  }
]
----------------------------------
[[ml-configuring-transform4]]
.Example 4: Converting strings to lowercase
[source,console]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['some_field'].value.toLowerCase()" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// TEST[skip:continued]
<1> This script field uses the `toLowerCase()` function to convert a string to all
lowercase letters. Likewise, you can use the `toUpperCase()` function to convert
a string to uppercase letters.

The preview {dfeed} API returns the following results, which show that "JOE"
has been converted to "joe":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "joe"
  }
]
----------------------------------
[[ml-configuring-transform5]]
.Example 5: Converting strings to mixed case formats
[source,console]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "doc['some_field'].value.substring(0, 1).toUpperCase() + doc['some_field'].value.substring(1).toLowerCase()" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// TEST[skip:continued]
<1> This script field is a more complicated example of case manipulation. It uses
the `substring()` function to capitalize the first letter of a string and
converts the remaining characters to lowercase.

The preview {dfeed} API returns the following results, which show that "JOE"
has been converted to "Joe":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "Joe"
  }
]
----------------------------------
[[ml-configuring-transform6]]
.Example 6: Replacing tokens
[source,console]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "/\\s/.matcher(doc['tokenstring2'].value).replaceAll('_')" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// TEST[skip:continued]
<1> This script field uses regular expressions to replace white
space with underscores.

The preview {dfeed} API returns the following results, which show that
"foo bar baz" has been converted to "foo_bar_baz":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "foo_bar_baz"
  }
]
----------------------------------
[[ml-configuring-transform7]]
.Example 7: Regular expression matching and concatenation
[source,console]
--------------------------------------------------
POST _ml/datafeeds/datafeed-test2/_update
{
  "script_fields": {
    "my_script_field": {
      "script": {
        "lang": "painless",
        "source": "def m = /(.*)-bar-([0-9][0-9])/.matcher(doc['tokenstring3'].value); return m.find() ? m.group(1) + '_' + m.group(2) : '';" <1>
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test2/_preview
--------------------------------------------------
// TEST[skip:continued]
<1> This script field looks for a specific regular expression pattern and emits the
matched groups as a concatenated string. If no match is found, it emits an empty
string.

The preview {dfeed} API returns the following results, which show that
"foo-bar-19" has been converted to "foo_19":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_script_field": "foo_19"
  }
]
----------------------------------
[[ml-configuring-transform8]]
.Example 8: Splitting strings by domain name
[source,console]
--------------------------------------------------
PUT _ml/anomaly_detectors/test3
{
  "description":"DNS tunneling",
  "analysis_config":{
    "bucket_span": "30m",
    "influencers": ["clientip","hrd"],
    "detectors":[
      {
        "function":"high_info_content",
        "field_name": "sub",
        "over_field_name": "hrd",
        "exclude_frequent":"all"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test3
{
  "job_id": "test3",
  "indices": ["my-index-000001"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields":{
    "sub":{
      "script":"return domainSplit(doc['query'].value).get(0);"
    },
    "hrd":{
      "script":"return domainSplit(doc['query'].value).get(1);"
    }
  }
}

GET _ml/datafeeds/datafeed-test3/_preview
--------------------------------------------------
// TEST[skip:needs-licence]

If you have a single field that contains a well-formed DNS domain name, you can
use the `domainSplit()` function to split the string into its highest registered
domain and the sub-domain, which is everything to the left of the highest
registered domain. For example, the highest registered domain of
`www.ml.elastic.co` is `elastic.co` and the sub-domain is `www.ml`. The
`domainSplit()` function returns an array of two values: the first value is the
subdomain; the second value is the highest registered domain.

The preview {dfeed} API returns the following results, which show that
"www.ml.elastic.co" has been split into "elastic.co" and "www.ml":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "clientip.keyword": "123.456.78.900",
    "hrd": "elastic.co",
    "sub": "www.ml"
  }
]
----------------------------------
[[ml-configuring-transform9]]
.Example 9: Transforming geo_point data
[source,console]
--------------------------------------------------
PUT _ml/anomaly_detectors/test4
{
  "analysis_config":{
    "bucket_span": "10m",
    "detectors":[
      {
        "function":"lat_long",
        "field_name": "my_coordinates"
      }
    ]
  },
  "data_description": {
    "time_field":"@timestamp",
    "time_format":"epoch_ms"
  }
}

PUT _ml/datafeeds/datafeed-test4
{
  "job_id": "test4",
  "indices": ["my-index-000001"],
  "query": {
    "match_all": {
      "boost": 1
    }
  },
  "script_fields": {
    "my_coordinates": {
      "script": {
        "source": "doc['coords.lat'].value + ',' + doc['coords.lon'].value",
        "lang": "painless"
      }
    }
  }
}

GET _ml/datafeeds/datafeed-test4/_preview
--------------------------------------------------
// TEST[skip:needs-licence]

In {es}, location data can be stored in `geo_point` fields but this data type is
not supported natively in {ml} analytics. This example of a script field
transforms the data into an appropriate format. For more information,
see <<ml-geo-functions>>.

The preview {dfeed} API returns the following results, which show that
`41.44` and `90.5` have been combined into "41.44,90.5":

[source,js]
----------------------------------
[
  {
    "@timestamp": 1490274000000,
    "my_coordinates": "41.44,90.5"
  }
]
----------------------------------