[role="xpack"]
[testenv="basic"]
[[dataframe-examples]]
== {dataframe-transform-cap} examples
++++
<titleabbrev>Examples</titleabbrev>
++++

beta[]

These examples demonstrate how to use {dataframe-transforms} to derive useful
insights from your data. All the examples use one of the
{kibana-ref}/add-sample-data.html[{kib} sample datasets]. For a more detailed,
step-by-step example, see
<<ecommerce-dataframes,Transforming your data with {dataframes}>>.

* <<ecommerce-dataframes>>
* <<example-best-customers>>
* <<example-airline>>
* <<example-clientips>>

include::ecommerce-example.asciidoc[]

[[example-best-customers]]
=== Finding your best customers

In this example, we use the eCommerce orders sample dataset to find the
customers who spent the most in our hypothetical webshop. Let's transform the
data such that the destination index contains the number of orders, the total
price of the orders, the average price per order, the average number of unique
products per order, and the total number of unique products for each customer.

[source,console]
----------------------------------
POST _data_frame/transforms/_preview
{
  "source": {
    "index": "kibana_sample_data_ecommerce"
  },
  "dest" : { <1>
    "index" : "sample_ecommerce_orders_by_customer"
  },
  "pivot": {
    "group_by": { <2>
      "user": { "terms": { "field": "user" }},
      "customer_id": { "terms": { "field": "customer_id" }}
    },
    "aggregations": {
      "order_count": { "value_count": { "field": "order_id" }},
      "total_order_amt": { "sum": { "field": "taxful_total_price" }},
      "avg_amt_per_order": { "avg": { "field": "taxful_total_price" }},
      "avg_unique_products_per_order": { "avg": { "field": "total_unique_products" }},
      "total_unique_products": { "cardinality": { "field": "products.product_id" }}
    }
  }
}
----------------------------------
// TEST[skip:setup kibana sample data]

<1> This is the destination index for the {dataframe}. It is ignored by
`_preview`.
<2> Two `group_by` fields have been selected. This means the {dataframe} will
contain a unique row per `user` and `customer_id` combination. Within this
dataset, both of these fields are unique. Including both in the {dataframe}
gives more context to the final results.

NOTE: In the example above, condensed JSON formatting is used for better
readability of the pivot object.

The preview {dataframe-transforms} API enables you to see the layout of the
{dataframe} in advance, populated with some sample values. For example:

[source,js]
----------------------------------
{
  "preview" : [
    {
      "total_order_amt" : 3946.9765625,
      "order_count" : 59.0,
      "total_unique_products" : 116.0,
      "avg_unique_products_per_order" : 2.0,
      "customer_id" : "10",
      "user" : "recip",
      "avg_amt_per_order" : 66.89790783898304
    },
    ...
  ]
}
----------------------------------
// NOTCONSOLE
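
Once you are satisfied with the preview, you can create and start the
{dataframe-transform}. The following is a minimal sketch that reuses the
configuration shown above; the transform ID `ecommerce_customer_transform` is
an arbitrary example name, not part of the sample dataset:

[source,console]
----------------------------------
PUT _data_frame/transforms/ecommerce_customer_transform <1>
{
  "source": {
    "index": "kibana_sample_data_ecommerce"
  },
  "dest" : {
    "index" : "sample_ecommerce_orders_by_customer"
  },
  "pivot": {
    "group_by": {
      "user": { "terms": { "field": "user" }},
      "customer_id": { "terms": { "field": "customer_id" }}
    },
    "aggregations": {
      "order_count": { "value_count": { "field": "order_id" }},
      "total_order_amt": { "sum": { "field": "taxful_total_price" }},
      "avg_amt_per_order": { "avg": { "field": "taxful_total_price" }},
      "avg_unique_products_per_order": { "avg": { "field": "total_unique_products" }},
      "total_unique_products": { "cardinality": { "field": "products.product_id" }}
    }
  }
}
----------------------------------
// TEST[skip:setup kibana sample data]

<1> The transform ID is an arbitrary example name. Unlike `_preview`, the
`dest` index is used when the transform runs.

You can then start the transform:

[source,console]
----------------------------------
POST _data_frame/transforms/ecommerce_customer_transform/_start
----------------------------------
// TEST[skip:setup kibana sample data]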

This {dataframe} makes it easier to answer questions such as:

* Which customers spend the most?

* Which customers spend the most per order?

* Which customers order most often?

* Which customers ordered the least number of different products?

It's possible to answer these questions using aggregations alone; however,
{dataframes} allow us to persist this data as a customer-centric index. This
enables us to analyze data at scale and gives more flexibility to explore and
navigate data from a customer-centric perspective. In some cases, it can even
make creating visualizations much simpler.
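
For example, assuming the transform above has been created and started, a
search like the following sketch returns the five customers with the highest
total spend from the destination index:

[source,console]
----------------------------------
GET sample_ecommerce_orders_by_customer/_search
{
  "size": 5,
  "sort": [
    { "total_order_amt": "desc" }
  ]
}
----------------------------------
// TEST[skip:setup kibana sample data]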

[[example-airline]]
=== Finding air carriers with the most delays

In this example, we use the Flights sample dataset to find out which air carrier
had the most delays. First, we filter the source data such that it excludes all
the cancelled flights by using a query filter. Then we transform the data to
contain the number of flights, the sum of delayed minutes, and the sum of the
flight minutes by air carrier. Finally, we use a
{ref}/search-aggregations-pipeline-bucket-script-aggregation.html[`bucket_script`]
to determine what percentage of the flight time was taken up by delays.

[source,console]
----------------------------------
POST _data_frame/transforms/_preview
{
  "source": {
    "index": "kibana_sample_data_flights",
    "query": { <1>
      "bool": {
        "filter": [
          { "term": { "Cancelled": false } }
        ]
      }
    }
  },
  "dest" : { <2>
    "index" : "sample_flight_delays_by_carrier"
  },
  "pivot": {
    "group_by": { <3>
      "carrier": { "terms": { "field": "Carrier" }}
    },
    "aggregations": {
      "flights_count": { "value_count": { "field": "FlightNum" }},
      "delay_mins_total": { "sum": { "field": "FlightDelayMin" }},
      "flight_mins_total": { "sum": { "field": "FlightTimeMin" }},
      "delay_time_percentage": { <4>
        "bucket_script": {
          "buckets_path": {
            "delay_time": "delay_mins_total.value",
            "flight_time": "flight_mins_total.value"
          },
          "script": "(params.delay_time / params.flight_time) * 100"
        }
      }
    }
  }
}
----------------------------------
// TEST[skip:setup kibana sample data]

<1> Filter the source data to select only flights that were not cancelled.
<2> This is the destination index for the {dataframe}. It is ignored by
`_preview`.
<3> The data is grouped by the `Carrier` field which contains the airline name.
<4> This `bucket_script` performs calculations on the results that are returned
by the aggregation. In this particular example, it calculates what percentage of
travel time was taken up by delays.

The preview shows you that the new index would contain data like this for each
carrier:

[source,js]
----------------------------------
{
  "preview" : [
    {
      "carrier" : "ES-Air",
      "flights_count" : 2802.0,
      "flight_mins_total" : 1436927.5130677223,
      "delay_time_percentage" : 9.335543983955839,
      "delay_mins_total" : 134145.0
    },
    ...
  ]
}
----------------------------------
// NOTCONSOLE

This {dataframe} makes it easier to answer questions such as:

* Which air carrier has the most delays as a percentage of flight time?

NOTE: This data is fictional and does not reflect actual delays
or flight stats for any of the featured destination or origin airports.
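
Assuming the transform has been created and started, a sketch like this sorts
the destination index by delay percentage to surface the carriers with the
most delays:

[source,console]
----------------------------------
GET sample_flight_delays_by_carrier/_search
{
  "sort": [
    { "delay_time_percentage": "desc" }
  ]
}
----------------------------------
// TEST[skip:setup kibana sample data]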


[[example-clientips]]
=== Finding suspicious client IPs by using scripted metrics

With {dataframe-transforms}, you can use
{ref}/search-aggregations-metrics-scripted-metric-aggregation.html[scripted
metric aggregations] on your data. These aggregations are flexible and make
it possible to perform very complex processing. Let's use scripted metrics to
identify suspicious client IPs in the web log sample dataset.

We transform the data such that the new index contains the sum of bytes and the
number of distinct URLs, agents, source locations, and geographic destinations
for each client IP. We also use a scripted metric aggregation to count the
specific types of HTTP responses that each client IP receives. Ultimately, the
example below transforms web log data into an entity-centric index where the
entity is `clientip`.

[source,console]
----------------------------------
POST _data_frame/transforms/_preview
{
  "source": {
    "index": "kibana_sample_data_logs",
    "query": { <1>
      "range" : {
        "timestamp" : {
          "gte" : "now-30d/d"
        }
      }
    }
  },
  "dest" : { <2>
    "index" : "sample_weblogs_by_clientip"
  },
  "pivot": {
    "group_by": { <3>
      "clientip": { "terms": { "field": "clientip" } }
    },
    "aggregations": {
      "url_dc": { "cardinality": { "field": "url.keyword" }},
      "bytes_sum": { "sum": { "field": "bytes" }},
      "geo.src_dc": { "cardinality": { "field": "geo.src" }},
      "agent_dc": { "cardinality": { "field": "agent.keyword" }},
      "geo.dest_dc": { "cardinality": { "field": "geo.dest" }},
      "responses.total": { "value_count": { "field": "timestamp" }},
      "responses.counts": { <4>
        "scripted_metric": {
          "init_script": "state.responses = ['error':0L,'success':0L,'other':0L]",
          "map_script": """
            def code = doc['response.keyword'].value;
            if (code.startsWith('5') || code.startsWith('4')) {
              state.responses.error += 1;
            } else if (code.startsWith('2')) {
              state.responses.success += 1;
            } else {
              state.responses.other += 1;
            }
            """,
          "combine_script": "state.responses",
          "reduce_script": """
            def counts = ['error': 0L, 'success': 0L, 'other': 0L];
            for (responses in states) {
              counts.error += responses['error'];
              counts.success += responses['success'];
              counts.other += responses['other'];
            }
            return counts;
            """
        }
      },
      "timestamp.min": { "min": { "field": "timestamp" }},
      "timestamp.max": { "max": { "field": "timestamp" }},
      "timestamp.duration_ms": { <5>
        "bucket_script": {
          "buckets_path": {
            "min_time": "timestamp.min.value",
            "max_time": "timestamp.max.value"
          },
          "script": "(params.max_time - params.min_time)"
        }
      }
    }
  }
}
----------------------------------
// TEST[skip:setup kibana sample data]

<1> This range query limits the transform to documents that are within the last
30 days at the point in time the {dataframe-transform} checkpoint is processed.
For batch {dataframes} this occurs once.
<2> This is the destination index for the {dataframe}. It is ignored by
`_preview`.
<3> The data is grouped by the `clientip` field.
<4> This `scripted_metric` performs a distributed operation on the web log data
to count specific types of HTTP responses (error, success, and other).
<5> This `bucket_script` calculates the duration of the `clientip` access based
on the results of the aggregation.

The preview shows you that the new index would contain data like this for each
client IP:

[source,js]
----------------------------------
{
  "preview" : [
    {
      "geo" : {
        "src_dc" : 12.0,
        "dest_dc" : 9.0
      },
      "clientip" : "0.72.176.46",
      "agent_dc" : 3.0,
      "responses" : {
        "total" : 14.0,
        "counts" : {
          "other" : 0,
          "success" : 14,
          "error" : 0
        }
      },
      "bytes_sum" : 74808.0,
      "timestamp" : {
        "duration_ms" : 4.919943239E9,
        "min" : "2019-06-17T07:51:57.333Z",
        "max" : "2019-08-13T06:31:00.572Z"
      },
      "url_dc" : 11.0
    },
    ...
  ]
}
----------------------------------
// NOTCONSOLE

This {dataframe} makes it easier to answer questions such as:

* Which client IPs are transferring the most data?

* Which client IPs are interacting with a high number of different URLs?

* Which client IPs have high error rates?

* Which client IPs are interacting with a high number of destination countries?
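
For instance, assuming the transform has been created and started and the
destination index has numeric mappings for the response counts, the following
sketch finds the client IPs with the most error responses:

[source,console]
----------------------------------
GET sample_weblogs_by_clientip/_search
{
  "query": {
    "range": {
      "responses.counts.error": { "gt": 0 }
    }
  },
  "sort": [
    { "responses.counts.error": "desc" }
  ]
}
----------------------------------
// TEST[skip:setup kibana sample data]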