@@ -0,0 +1,424 @@
+[[common-script-uses]]
+=== Common scripting use cases
+You can write a script to do almost anything, and sometimes, that's
+the trouble. It's challenging to know what's possible with scripts,
+so the following examples address common use cases where scripts are
+really helpful.
+
+[[scripting-field-extraction]]
+==== Field extraction
+The goal of field extraction is simple: you have fields in your data with a bunch of
+information, but you only want to extract pieces and parts of it.
+
+There are two options at your disposal:
+
+* <<grok-basics,Grok>> is a regular expression dialect that supports aliased
+expressions that you can reuse. Because grok sits on top of regular expressions
+(regex), any regular expressions are valid in grok as well.
+* <<dissect-processor,Dissect>> extracts structured fields out of text, using
+delimiters to define the matching pattern. Unlike grok, dissect doesn't use regular
+expressions.
+
+Regex is incredibly powerful, but it can be complicated. If you don't need the
+power of regular expressions, use dissect patterns, which are simple and
+often faster than grok patterns. Paying special attention to the parts of the string
+you want to discard will help you build successful dissect patterns.
+
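+To make the difference concrete, here is a sketch of both approaches extracting the
+client IP from a single Apache log line, written as runtime field scripts like the
+ones later in this section. The dissect pattern spells out every delimiter, while grok
+leans on the prebuilt `%{COMMONAPACHELOG}` alias. This is an illustration only, not
+a required step.
+
+[source,painless]
+----
+// Grok: reuse the aliased %{COMMONAPACHELOG} pattern (regular expressions underneath).
+String ip = grok('%{COMMONAPACHELOG}').extract(doc["message"].value)?.clientip;
+
+// Dissect: describe the whole line with delimiters and keys; no regular expressions.
+String ip2 = dissect('%{clientip} %{ident} %{auth} [%{@timestamp}] "%{verb} %{request} HTTP/%{httpversion}" %{response} %{size}').extract(doc["message"].value)?.clientip;
+----
+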
+Let's start with a simple example by adding the `@timestamp` and `message`
+fields to the `my-index` mapping as indexed fields. To remain flexible, use
+`wildcard` as the field type for `message`:
+
+[source,console]
+----
+PUT /my-index/
+{
+  "mappings": {
+    "properties": {
+      "@timestamp": {
+        "format": "strict_date_optional_time||epoch_second",
+        "type": "date"
+      },
+      "message": {
+        "type": "wildcard"
+      }
+    }
+  }
+}
+----
+
+After mapping the fields you want to retrieve, index a few records from
+your log data into {es}. The following request uses the <<docs-bulk,bulk API>>
+to index raw log data into `my-index`. Instead of indexing all of your log
+data, you can use a small sample to experiment with runtime fields.
+
+[source,console]
+----
+POST /my-index/_bulk?refresh
+{"index":{}}
+{"timestamp":"2020-04-30T14:30:17-05:00","message":"40.135.0.0 - - [30/Apr/2020:14:30:17 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
+{"index":{}}
+{"timestamp":"2020-04-30T14:30:53-05:00","message":"232.0.0.0 - - [30/Apr/2020:14:30:53 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
+{"index":{}}
+{"timestamp":"2020-04-30T14:31:12-05:00","message":"26.1.0.0 - - [30/Apr/2020:14:31:12 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
+{"index":{}}
+{"timestamp":"2020-04-30T14:31:19-05:00","message":"247.37.0.0 - - [30/Apr/2020:14:31:19 -0500] \"GET /french/splash_inet.html HTTP/1.0\" 200 3781"}
+{"index":{}}
+{"timestamp":"2020-04-30T14:31:22-05:00","message":"247.37.0.0 - - [30/Apr/2020:14:31:22 -0500] \"GET /images/hm_nbg.jpg HTTP/1.0\" 304 0"}
+{"index":{}}
+{"timestamp":"2020-04-30T14:31:27-05:00","message":"252.0.0.0 - - [30/Apr/2020:14:31:27 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"}
+{"index":{}}
+{"timestamp":"2020-04-30T14:31:28-05:00","message":"not a valid apache log"}
+----
+// TEST[continued]
+
+[discrete]
+[[field-extraction-ip]]
+===== Extract an IP address from a log message (Grok)
+If you want to retrieve results that include `clientip`, you can add that
+field as a runtime field in the mapping. The following runtime script defines a
+grok pattern that extracts structured fields out of the `message` field.
+
+The script matches on the `%{COMMONAPACHELOG}` log pattern, which understands
+the structure of Apache logs. If the pattern matches (`clientip != null`), the
+script emits the value of the matching IP address. If the pattern doesn't match,
+the script simply emits nothing instead of failing.
+
+[source,console]
+----
+PUT my-index/_mappings
+{
+  "runtime": {
+    "http.clientip": {
+      "type": "ip",
+      "script": """
+        String clientip=grok('%{COMMONAPACHELOG}').extract(doc["message"].value)?.clientip;
+        if (clientip != null) emit(clientip); <1>
+      """
+    }
+  }
+}
+----
+// TEST[continued]
+<1> This condition ensures that the script emits nothing, rather than failing,
+when the pattern of the message doesn't match.
+
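+If the null-safe `?.` call looks terse, the following sketch is an equivalent,
+long-form version of the same logic. The `parsed` variable name is illustrative
+only; this script is not part of the documented example above.
+
+[source,painless]
+----
+// Long-form equivalent of the null-safe extraction above (sketch only).
+def parsed = grok('%{COMMONAPACHELOG}').extract(doc["message"].value);
+if (parsed != null && parsed.clientip != null) {
+  String clientip = parsed.clientip;
+  emit(clientip);
+}
+----
+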
+You can define a simple query to run a search for a specific IP address and
+return all related fields. Use the `fields` parameter of the search API to
+retrieve the `http.clientip` runtime field.
+
+[source,console]
+----
+GET my-index/_search
+{
+  "query": {
+    "match": {
+      "http.clientip": "40.135.0.0"
+    }
+  },
+  "fields" : ["http.clientip"]
+}
+----
+// TEST[continued]
+// TEST[s/_search/_search\?filter_path=hits/]
+
+The response includes documents where the value for `http.clientip` matches
+`40.135.0.0`.
+
+[source,console-result]
+----
+{
+  "hits" : {
+    "total" : {
+      "value" : 1,
+      "relation" : "eq"
+    },
+    "max_score" : 1.0,
+    "hits" : [
+      {
+        "_index" : "my-index",
+        "_id" : "Rq-ex3gBA_A0V6dYGLQ7",
+        "_score" : 1.0,
+        "_source" : {
+          "timestamp" : "2020-04-30T14:30:17-05:00",
+          "message" : "40.135.0.0 - - [30/Apr/2020:14:30:17 -0500] \"GET /images/hm_bg.jpg HTTP/1.0\" 200 24736"
+        },
+        "fields" : {
+          "http.clientip" : [
+            "40.135.0.0"
+          ]
+        }
+      }
+    ]
+  }
+}
+----
+// TESTRESPONSE[s/"_id" : "Rq-ex3gBA_A0V6dYGLQ7"/"_id": $body.hits.hits.0._id/]
+
+[discrete]
+[[field-extraction-parse]]
+===== Parse a string to extract part of a field (Dissect)
+Instead of matching on a log pattern like in the <<field-extraction-ip,previous example>>,
+you can define a dissect pattern that includes the parts of the string
+that you want to discard.
+
+For example, the log data at the start of this section includes a `message`
+field. This field contains several pieces of data:
+
+[source,js]
+----
+"message" : "247.37.0.0 - - [30/Apr/2020:14:31:22 -0500] \"GET /images/hm_nbg.jpg HTTP/1.0\" 304 0"
+----
+// NOTCONSOLE
+
+You can define a dissect pattern in a runtime field to extract the https://developer.mozilla.org/en-US/docs/Web/HTTP/Status[HTTP response code], which is
+`304` in the previous example.
+
+[source,console]
+----
+PUT my-index/_mappings
+{
+  "runtime": {
+    "http.response": {
+      "type": "long",
+      "script": """
+        String response=dissect('%{clientip} %{ident} %{auth} [%{@timestamp}] "%{verb} %{request} HTTP/%{httpversion}" %{response} %{size}').extract(doc["message"].value)?.response;
+        // dissect captures the status code as a String, so parse it before emitting it into the long field
+        if (response != null) emit(Integer.parseInt(response));
+      """
+    }
+  }
+}
+----
+// TEST[continued]
+
+You can then run a query to retrieve a specific HTTP response using the
+`http.response` runtime field:
+
+[source,console]
+----
+GET my-index/_search
+{
+  "query": {
+    "match": {
+      "http.response": "304"
+    }
+  },
+  "fields" : ["http.response"]
+}
+----
+// TEST[continued]
+// TEST[s/_search/_search\?filter_path=hits/]
+
+The response includes a single document where the HTTP response is `304`:
+
+[source,console-result]
+----
+{
+  "hits" : {
+    "total" : {
+      "value" : 1,
+      "relation" : "eq"
+    },
+    "max_score" : 1.0,
+    "hits" : [
+      {
+        "_index" : "my-index",
+        "_id" : "Sq-ex3gBA_A0V6dYGLQ7",
+        "_score" : 1.0,
+        "_source" : {
+          "timestamp" : "2020-04-30T14:31:22-05:00",
+          "message" : "247.37.0.0 - - [30/Apr/2020:14:31:22 -0500] \"GET /images/hm_nbg.jpg HTTP/1.0\" 304 0"
+        },
+        "fields" : {
+          "http.response" : [
+            304
+          ]
+        }
+      }
+    ]
+  }
+}
+----
+// TESTRESPONSE[s/"_id" : "Sq-ex3gBA_A0V6dYGLQ7"/"_id": $body.hits.hits.0._id/]
+
+[discrete]
+[[field-extraction-split]]
+===== Split values in a field by a separator (Dissect)
+Let's say you want to extract part of a field like in the previous example, but you
+want to split on specific values. You can use a dissect pattern to extract only the
+information that you want, and also return that data in a specific format.
+
+For example, let's say you have a bunch of garbage collection (gc) log data from {es}
+in this format:
+
+[source,txt]
+----
+[2021-04-27T16:16:34.699+0000][82460][gc,heap,exit] class space used 266K, capacity 384K, committed 384K, reserved 1048576K
+----
+// NOTCONSOLE
+
+You only want to extract the `used`, `capacity`, and `committed` data, along with
+the associated values. Let's index a few documents containing log data to use as
+an example:
+
+[source,console]
+----
+POST /my-index/_bulk?refresh
+{"index":{}}
+{"gc": "[2021-04-27T16:16:34.699+0000][82460][gc,heap,exit] class space used 266K, capacity 384K, committed 384K, reserved 1048576K"}
+{"index":{}}
+{"gc": "[2021-03-24T20:27:24.184+0000][90239][gc,heap,exit] class space used 15255K, capacity 16726K, committed 16844K, reserved 1048576K"}
+{"index":{}}
+{"gc": "[2021-03-24T20:27:24.184+0000][90239][gc,heap,exit] Metaspace used 115409K, capacity 119541K, committed 120248K, reserved 1153024K"}
+{"index":{}}
+{"gc": "[2021-04-19T15:03:21.735+0000][84408][gc,heap,exit] class space used 14503K, capacity 15894K, committed 15948K, reserved 1048576K"}
+{"index":{}}
+{"gc": "[2021-04-19T15:03:21.735+0000][84408][gc,heap,exit] Metaspace used 107719K, capacity 111775K, committed 112724K, reserved 1146880K"}
+{"index":{}}
+{"gc": "[2021-04-27T16:16:34.699+0000][82460][gc,heap,exit] class space used 266K, capacity 367K, committed 384K, reserved 1048576K"}
+----
+
+Looking at the data again, there's a timestamp, some other data that you're not
+interested in, and then the `used`, `capacity`, and `committed` data:
+
+[source,txt]
+----
+[2021-04-27T16:16:34.699+0000][82460][gc,heap,exit] class space used 266K, capacity 384K, committed 384K, reserved 1048576K
+----
+
+You can assign variables to each part of the data in the `gc` field, and then return
+only the parts that you want. Anything inside `%{}` is considered a variable.
+For example, the variables `[%{@timestamp}][%{code}][%{desc}]` will match the first
+three chunks of data, all of which are in square brackets `[]`.
+
+[source,txt]
+----
+[%{@timestamp}][%{code}][%{desc}] %{ident} used %{usize}, capacity %{csize}, committed %{comsize}, reserved %{rsize}
+----
+
+Your dissect pattern can include the terms `used`, `capacity`, and `committed` as
+literal text instead of variables, because those exact terms appear in the data. You also assign
+variables to the values you want to return, such as `%{usize}`, `%{csize}`, and
+`%{comsize}`. The separator in the log data is a comma, so your dissect pattern also
+needs to use that separator.
+
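+To check what the pattern captures, you can apply it to one of the sample documents.
+The following sketch (the `parts` variable and the hard-coded log line are for
+illustration only) shows which value each key picks up:
+
+[source,painless]
+----
+// Apply the dissect pattern to the first sample gc message (sketch only).
+def parts = dissect('[%{@timestamp}][%{code}][%{desc}] %{ident} used %{usize}, capacity %{csize}, committed %{comsize}, reserved %{rsize}').extract('[2021-04-27T16:16:34.699+0000][82460][gc,heap,exit] class space used 266K, capacity 384K, committed 384K, reserved 1048576K');
+// parts.ident   == 'class space'
+// parts.usize   == '266K'
+// parts.csize   == '384K'
+// parts.comsize == '384K'
+// parts.rsize   == '1048576K'
+----
+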
+Now that you have a dissect pattern, you can include it in a Painless script as part
+of a runtime field. The script uses your dissect pattern to split apart the `gc`
+field, and then returns exactly the information that you want as defined by the
+`emit` method. Because dissect uses simple syntax, you just need to tell it exactly
+what you want.
+
+The following `emit` pattern returns the term `used`, a blank space, the value
+from `gc.usize`, and a comma. This pattern repeats for the other data that you
+want to retrieve. While this pattern might not be as useful in production, it provides
+a lot of flexibility to experiment with and manipulate your data. In a production
+setting, you might just want to use `emit(gc.usize)` and then aggregate on that value
+or use it in computations.
+
+[source,painless]
+----
+emit("used" + ' ' + gc.usize + ', ' + "capacity" + ' ' + gc.csize + ', ' + "committed" + ' ' + gc.comsize)
+----
+
+Putting it all together, you can create a runtime field named `gc_size` in a search
+request. Using the <<search-fields-param,`fields` option>>, you can retrieve all values
+for the `gc_size` runtime field. This query also includes a bucket aggregation to group
+your data.
+
+[source,console]
+----
+GET my-index/_search
+{
+  "runtime_mappings": {
+    "gc_size": {
+      "type": "keyword",
+      "script": """
+        Map gc=dissect('[%{@timestamp}][%{code}][%{desc}] %{ident} used %{usize}, capacity %{csize}, committed %{comsize}, reserved %{rsize}').extract(doc["gc.keyword"].value);
+        if (gc != null) emit("used" + ' ' + gc.usize + ', ' + "capacity" + ' ' + gc.csize + ', ' + "committed" + ' ' + gc.comsize);
+      """
+    }
+  },
+  "size": 1,
+  "aggs": {
+    "sizes": {
+      "terms": {
+        "field": "gc_size",
+        "size": 10
+      }
+    }
+  },
+  "fields" : ["gc_size"]
+}
+----
+// TEST[continued]
+
+The response includes the data from the `gc_size` field, formatted exactly as you
+defined it in your script's `emit` method!
+
+[source,console-result]
+----
+{
+  "took" : 2,
+  "timed_out" : false,
+  "_shards" : {
+    "total" : 1,
+    "successful" : 1,
+    "skipped" : 0,
+    "failed" : 0
+  },
+  "hits" : {
+    "total" : {
+      "value" : 6,
+      "relation" : "eq"
+    },
+    "max_score" : 1.0,
+    "hits" : [
+      {
+        "_index" : "my-index",
+        "_id" : "GXx3H3kBKGE42WRNlddJ",
+        "_score" : 1.0,
+        "_source" : {
+          "gc" : "[2021-04-27T16:16:34.699+0000][82460][gc,heap,exit] class space used 266K, capacity 384K, committed 384K, reserved 1048576K"
+        },
+        "fields" : {
+          "gc_size" : [
+            "used 266K, capacity 384K, committed 384K"
+          ]
+        }
+      }
+    ]
+  },
+  "aggregations" : {
+    "sizes" : {
+      "doc_count_error_upper_bound" : 0,
+      "sum_other_doc_count" : 0,
+      "buckets" : [
+        {
+          "key" : "used 107719K, capacity 111775K, committed 112724K",
+          "doc_count" : 1
+        },
+        {
+          "key" : "used 115409K, capacity 119541K, committed 120248K",
+          "doc_count" : 1
+        },
+        {
+          "key" : "used 14503K, capacity 15894K, committed 15948K",
+          "doc_count" : 1
+        },
+        {
+          "key" : "used 15255K, capacity 16726K, committed 16844K",
+          "doc_count" : 1
+        },
+        {
+          "key" : "used 266K, capacity 367K, committed 384K",
+          "doc_count" : 1
+        },
+        {
+          "key" : "used 266K, capacity 384K, committed 384K",
+          "doc_count" : 1
+        }
+      ]
+    }
+  }
+}
+----
+// TESTRESPONSE[s/"took" : 2/"took": "$body.took"/]
+// TESTRESPONSE[s/"_id" : "GXx3H3kBKGE42WRNlddJ"/"_id": $body.hits.hits.0._id/]
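+
+As noted above, in a production setting you might prefer to emit only the raw value
+and then aggregate on it or use it in computations. The following sketch shows that
+variant; it is an illustration only and not part of the example above.
+
+[source,painless]
+----
+// Emit only the used size (for example, '266K') instead of a formatted string (sketch only).
+Map gc=dissect('[%{@timestamp}][%{code}][%{desc}] %{ident} used %{usize}, capacity %{csize}, committed %{comsize}, reserved %{rsize}').extract(doc["gc.keyword"].value);
+if (gc != null) emit(gc.usize);
+----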