Browse Source

[DOCS] Adds painless transform examples (#53274)

Co-authored-by: Lisa Cawley <lcawley@elastic.co>
István Zoltán Szabó 5 years ago
parent
commit
108f9ca73f

+ 2 - 0
docs/reference/transform/index.asciidoc

@@ -16,6 +16,7 @@ your data.
 * <<transform-api-quickref>>
 * <<ecommerce-transforms>>
 * <<transform-examples>>
+* <<transform-painless-examples>>
 * <<transform-troubleshooting>>
 * <<transform-limitations>>
 
@@ -26,5 +27,6 @@ include::checkpoints.asciidoc[]
 include::api-quickref.asciidoc[]
 include::ecommerce-tutorial.asciidoc[]
 include::examples.asciidoc[]
+include::painless-examples.asciidoc[]
 include::troubleshooting.asciidoc[]
 include::limitations.asciidoc[]

+ 329 - 0
docs/reference/transform/painless-examples.asciidoc

@@ -0,0 +1,329 @@
+[role="xpack"]
+[testenv="basic"]
+[[transform-painless-examples]]
+=== Painless examples for {transforms}
+++++
+<titleabbrev>Painless examples for {transforms}</titleabbrev>
+++++
+
+These examples demonstrate how to use Painless in {transforms}. You can learn 
+more about the Painless scripting language in the 
+{painless}/painless-guide.html[Painless guide].
+
+* <<painless-top-hits>>
+* <<painless-time-features>>
+* <<painless-group-by>>
+* <<painless-bucket-script>>
+
+
+[discrete]
+[[painless-top-hits]]
+==== Getting top hits by using scripted metric
+
+This snippet shows how to find the latest document, in other words the document 
+with the most recent timestamp. From a technical perspective, it helps to achieve 
+the function of a <<search-aggregations-metrics-top-hits-aggregation>> by using a 
+scripted metric aggregation which provides a metric output.
+
+[source,js]
+--------------------------------------------------
+"latest_doc": { 
+  "scripted_metric": {
+    "init_script": "state.timestamp_latest = 0L; state.last_doc = ''", <1>
+    "map_script": """ <2>
+      def current_date = doc['@timestamp'].getValue().toInstant().toEpochMilli(); 
+      if (current_date > state.timestamp_latest) 
+      {state.timestamp_latest = current_date;
+      state.last_doc = new HashMap(params['_source']);}
+    """,
+    "combine_script": "return state", <3>
+    "reduce_script": """ <4>
+      def last_doc = '';
+      def timestamp_latest = 0L;
+      for (s in states) {if (s.timestamp_latest > (timestamp_latest))
+      {timestamp_latest = s.timestamp_latest; last_doc = s.last_doc;}} 
+      return last_doc
+    """
+  }
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+<1> The `init_script` creates a long type `timestamp_latest` and a string type 
+`last_doc` in the `state` object.
+<2> The `map_script` defines `current_date` based on the timestamp of the 
+document, then compares `current_date` with `state.timestamp_latest`, finally 
+returns `state.last_doc` from the shard. By using `new HashMap(...)` we copy the 
+source document, this is important whenever you want to pass the full source 
+object from one phase to the next.
+<3> The `combine_script` returns `state` from each shard.
+<4> The `reduce_script` iterates through the values of `s.timestamp_latest` 
+returned by each shard and returns the document with the latest timestamp 
+(`last_doc`). In the response, the top hit (in other words, the `latest_doc`) is 
+nested below the `latest_doc` field.
+
+Check the
+<<scripted-metric-aggregation-scope,scope of scripts>>
+for a detailed explanation of the respective scripts.
+
+You can retrieve the last value in a similar way: 
+
+[source,js]
+--------------------------------------------------
+"latest_value": {
+  "scripted_metric": {
+    "init_script": "state.timestamp_latest = 0L; state.last_value = ''",
+    "map_script": """
+      def current_date = doc['date'].getValue().toInstant().toEpochMilli(); 
+      if (current_date > state.timestamp_latest) 
+      {state.timestamp_latest = current_date;
+      state.last_value = params['_source']['value'];}
+    """,
+    "combine_script": "return state",
+    "reduce_script": """
+      def last_value = '';
+      def timestamp_latest = 0L; 
+      for (s in states) {if (s.timestamp_latest > (timestamp_latest)) 
+      {timestamp_latest = s.timestamp_latest; last_value = s.last_value;}} 
+      return last_value
+    """
+  }
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+
+[discrete]
+[[painless-time-features]]
+==== Getting time features as scripted fields
+
+This snippet shows how to extract time based features by using Painless. The 
+snippet uses an index where `@timestamp` is defined as a `date` type field.
+
+[source,js]
+--------------------------------------------------
+"script_fields": {
+    "hour_of_day": { <1>
+      "script": {
+        "lang": "painless",
+        "source": """
+          ZonedDateTime date =  doc['@timestamp'].value; <2>
+          return date.getHour(); <3>
+        """
+      }
+    },
+    "month_of_year": { <4>
+      "script": {
+        "lang": "painless",
+        "source": """
+          ZonedDateTime date =  doc['@timestamp'].value; <5>
+          return date.getMonthValue(); <6>
+        """
+      }
+    }
+  }
+--------------------------------------------------
+// NOTCONSOLE
+
+<1> Contains the Painless script that returns the hour of the day.
+<2> Sets `date` based on the timestamp of the document.
+<3> Returns the hour value from `date`.
+<4> Contains the Painless script that returns the month of the year.
+<5> Sets `date` based on the timestamp of the document.
+<6> Returns the month value from `date`.
+
+
+[discrete]
+[[painless-group-by]]
+==== Using Painless in `group_by`
+
+It is possible to base the `group_by` property of a {transform} on the output of 
+a script. The following example uses the {kib} sample web logs dataset. The goal 
+here is to make the {transform} output easier to understand by normalizing the 
+values of the fields that the data is grouped by.
+
+[source,console]
+--------------------------------------------------
+POST _transform/_preview
+{
+  "source": {
+    "index": [ <1>
+      "kibana_sample_data_logs"
+    ]
+  },
+  "pivot": {
+    "group_by": {
+      "agent": {
+        "terms": {
+          "script": { <2>
+            "source": """String agent = doc['agent.keyword'].value; 
+            if (agent.contains("MSIE")) { 
+              return "internet explorer";
+            } else if (agent.contains("AppleWebKit")) { 
+              return "safari"; 
+            } else if (agent.contains('Firefox')) { 
+              return "firefox";
+            } else { return agent }""",
+            "lang": "painless"
+          }
+        }
+      }
+    },
+    "aggregations": { <3>
+      "200": {
+        "filter": {
+          "term": {
+            "response": "200"
+          }
+        }
+      },
+      "404": {
+        "filter": {
+          "term": {
+            "response": "404"
+          }
+        }
+      },
+      "503": {
+        "filter": {
+          "term": {
+            "response": "503"
+          }
+        }
+      }
+    }
+  },
+  "dest": { <4>
+    "index": "pivot_logs"
+  }
+} 
+--------------------------------------------------
+// TEST[skip:setup kibana sample data]
+
+<1> Specifies the source index or indices.
+<2> The script defines an `agent` string based on the `agent` field of the 
+documents, then evaluates its value. If the `agent` field contains 
+"MSIE", then the script returns "internet explorer". If it contains 
+"AppleWebKit", it returns "safari". It returns "firefox" if the field value 
+contains "Firefox". Finally, in every other case, the value of the field is 
+returned.
+<3> The aggregations object contains filters that narrow down the results to 
+documents that contain `200`, `404`, or `503` values in the `response` field.
+<4> Specifies the destination index of the {transform}.
+
+The API returns the following result:
+
+[source,js]
+--------------------------------------------------
+{
+  "preview" : [
+    {
+      "agent" : "firefox",
+      "200" : 4931,
+      "404" : 259,
+      "503" : 172
+    },
+    {
+      "agent" : "internet explorer",
+      "200" : 3674,
+      "404" : 210,
+      "503" : 126
+    },
+    {
+      "agent" : "safari",
+      "200" : 4227,
+      "404" : 332,
+      "503" : 143
+    }
+  ],
+  "mappings" : {
+    "properties" : {
+      "200" : {
+        "type" : "long"
+      },
+      "agent" : {
+        "type" : "keyword"
+      },
+      "404" : {
+        "type" : "long"
+      },
+      "503" : {
+        "type" : "long"
+      }
+    }
+  }
+}
+--------------------------------------------------
+// NOTCONSOLE
+
+You can see that the `agent` values are simplified so it is easier to interpret 
+them. The table below shows how normalization modifies the output of the 
+{transform} in our example compared to the non-normalized values.
+
+[width="50%"]
+|===
+| Non-normalized `agent` value                                                 | Normalized `agent` value 
+
+| "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)" | "internet explorer"
+| "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24" | "safari"
+| "Mozilla/5.0 (X11; Linux x86_64; rv:6.0a1) Gecko/20110421 Firefox/6.0a1" | "firefox"
+|===
+
+
+[discrete]
+[[painless-bucket-script]]
+==== Getting duration by using bucket script
+
+This example shows you how to get the duration of a session by client IP from a 
+data log by using 
+{ref}/search-aggregations-pipeline-bucket-script-aggregation.html[bucket script]. 
+The example uses the {kib} sample web logs dataset.
+
+[source,console]
+--------------------------------------------------
+PUT _transform/data_log
+{
+  "source": {
+    "index": "kibana_sample_data_logs"
+  },
+  "dest": {
+    "index": "data-logs-by-client"
+  },
+  "pivot": {
+    "group_by": {
+      "machine.os": {"terms": {"field": "machine.os.keyword"}},
+      "machine.ip": {"terms": {"field": "clientip"}}
+    },
+    "aggregations": {
+      "time_frame.lte": {
+        "max": {
+          "field": "timestamp"
+        }
+      },
+      "time_frame.gte": {
+        "min": {
+          "field": "timestamp"
+        }
+      },
+      "time_length": { <1>
+        "bucket_script": {
+          "buckets_path": { <2>
+            "min": "time_frame.gte.value",
+            "max": "time_frame.lte.value"
+          },
+          "script": "params.max - params.min" <3>
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// TEST[skip:setup kibana sample data]
+
+<1> To define the length of the sessions, we use a bucket script.
+<2> The bucket path is a map of script variables and their associated path to 
+the buckets you want to use for the variable. In this particular case, `min` and 
+`max` are variables mapped to `time_frame.gte.value` and `time_frame.lte.value`.
+<3> Finally, the script subtracts the start date of the session from the end 
+date which results in the duration of the session.