[DOCS] Reformat delimited payload token filter docs (#49380)

* Adds a title abbreviation
* Relocates the older name deprecation warning
* Updates the description and adds a Lucene link
* Adds a note to explain payloads and how to store them
* Adds analyze and custom analyzer snippets
* Adds a 'Return stored payloads' example
James Rodewig 5 years ago
commit 1471f34c54

+ 314 - 12
docs/reference/analysis/tokenfilters/delimited-payload-tokenfilter.asciidoc

@@ -1,21 +1,323 @@
 [[analysis-delimited-payload-tokenfilter]]
-=== Delimited Payload Token Filter
-
-Named `delimited_payload`. Splits tokens into tokens and payload whenever a delimiter character is found.
+=== Delimited payload token filter
+++++
+<titleabbrev>Delimited payload</titleabbrev>
+++++
 
 [WARNING]
-============================================
+====
+The older name `delimited_payload_filter` is deprecated and should not be used
+with new indices. Use `delimited_payload` instead.
+====
+
+Separates a token stream into tokens and payloads based on a specified
+delimiter.
+
+For example, you can use the `delimited_payload` filter with a `|` delimiter to
+split `the|1 quick|2 fox|3` into the tokens `the`, `quick`, and `fox`
+with respective payloads of `1`, `2`, and `3`.
+
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilter.html[DelimitedPayloadTokenFilter].
+
+[NOTE]
+.Payloads
+====
+A payload is user-defined binary data associated with a token position and
+stored as base64-encoded bytes.
+
+{es} does not store token payloads by default. To store payloads, you must:
+
+* Set the <<term-vector,`term_vector`>> mapping parameter to
+  `with_positions_payloads` or `with_positions_offsets_payloads` for any field
+  storing payloads.
+* Use an index analyzer that includes the `delimited_payload` filter.
+
+You can view stored payloads using the <<docs-termvectors,term vectors API>>.
+====
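+
+For example, with the default `float` encoding, the payload `3` is stored as
+the four big-endian IEEE 754 bytes `0x40400000` and returned by the term
+vectors API as the base64 string `QEAAAA==`. The following Python sketch
+decodes such a value. The byte layout here is an assumption based on Lucene's
+`PayloadHelper`; it is not part of the {es} API.
+
+[source,python]
+--------------------------------------------------
+import base64
+import struct
+
+def decode_float_payload(payload):
+    """Decode a base64 payload stored with the default `float` encoding
+    (assumed layout: one 4-byte big-endian IEEE 754 float)."""
+    return struct.unpack(">f", base64.b64decode(payload))[0]
+
+print(decode_float_payload("QEAAAA=="))  # 3.0
+--------------------------------------------------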
+
+[[analysis-delimited-payload-tokenfilter-analyze-ex]]
+==== Example
+
+The following <<indices-analyze,analyze API>> request uses the
+`delimited_payload` filter with the default `|` delimiter to split
+`the|0 brown|10 fox|5 is|0 quick|10` into tokens and payloads.
+
+[source,console]
+--------------------------------------------------
+GET _analyze
+{
+  "tokenizer": "whitespace",
+  "filter": ["delimited_payload"],
+  "text": "the|0 brown|10 fox|5 is|0 quick|10"
+}
+--------------------------------------------------
+
+The filter produces the following tokens:
+
+[source,text]
+--------------------------------------------------
+[ the, brown, fox, is, quick ]
+--------------------------------------------------
+
+Note that the analyze API does not return token payloads. For an example that
+retrieves stored payloads, see
+<<analysis-delimited-payload-tokenfilter-return-stored-payloads>>.
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "the",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "brown",
+      "start_offset": 6,
+      "end_offset": 14,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "fox",
+      "start_offset": 15,
+      "end_offset": 20,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "is",
+      "start_offset": 21,
+      "end_offset": 25,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "quick",
+      "start_offset": 26,
+      "end_offset": 34,
+      "type": "word",
+      "position": 4
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-delimited-payload-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+`delimited_payload` filter to configure a new <<analysis-custom-analyzer,custom
+analyzer>>.
+
+[source,console]
+--------------------------------------------------
+PUT delimited_payload
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "whitespace_delimited_payload": {
+          "tokenizer": "whitespace",
+          "filter": [ "delimited_payload" ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+[[analysis-delimited-payload-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+`delimiter`::
+(Optional, string)
+Character used to separate tokens from payloads. Defaults to `|`. 
+
+`encoding`::
++
+--
+(Optional, string)
+Datatype for the stored payload. Valid values are:
+
+`float`:::
+(Default) Float
+
+`identity`:::
+Characters
+
+`int`:::
+Integer
+--
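+
+The byte layout behind each encoding is not documented as part of the {es}
+API. Based on Lucene's `PayloadHelper` and `IdentityEncoder` classes, a
+reasonable sketch of how each `encoding` value serializes the payload `3` is:
+
+[source,python]
+--------------------------------------------------
+import base64
+import struct
+
+def encode_payload(value, encoding="float"):
+    """Serialize a payload the way each `encoding` setting is assumed to:
+    4-byte big-endian numbers, or raw UTF-8 bytes for `identity`."""
+    if encoding == "float":
+        raw = struct.pack(">f", float(value))
+    elif encoding == "int":
+        raw = struct.pack(">i", int(value))
+    elif encoding == "identity":
+        raw = str(value).encode("utf-8")
+    else:
+        raise ValueError(f"unknown encoding: {encoding}")
+    # The term vectors API returns payloads base64-encoded.
+    return base64.b64encode(raw).decode("ascii")
+
+print(encode_payload(3, "float"))     # QEAAAA==
+print(encode_payload(3, "int"))       # AAAAAw==
+print(encode_payload(3, "identity"))  # Mw==
+--------------------------------------------------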
+
+[[analysis-delimited-payload-tokenfilter-customize]]
+==== Customize and add to an analyzer
+
+To customize the `delimited_payload` filter, duplicate it to create the basis
+for a new custom token filter. You can modify the filter using its configurable
+parameters.
+
+For example, the following <<indices-create-index,create index API>> request
+uses a custom `delimited_payload` filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>. The custom `delimited_payload`
+filter uses the `+` delimiter to separate tokens from payloads. Payloads are
+encoded as integers.
+
+[source,console]
+--------------------------------------------------
+PUT delimited_payload_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "whitespace_plus_delimited": {
+          "tokenizer": "whitespace",
+          "filter": [ "plus_delimited" ]
+        }
+      },
+      "filter": {
+        "plus_delimited": {
+          "type": "delimited_payload",
+          "delimiter": "+",
+          "encoding": "int"
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+[[analysis-delimited-payload-tokenfilter-return-stored-payloads]]
+==== Return stored payloads
+
+Use the <<indices-create-index,create index API>> to create an index that:
+
+* Includes a field that stores term vectors with payloads.
+* Uses a <<analysis-custom-analyzer,custom index analyzer>> with the
+  `delimited_payload` filter.
+
+[source,console]
+--------------------------------------------------
+PUT text_payloads
+{
+  "mappings": {
+    "properties": {
+      "text": {
+        "type": "text",
+        "term_vector": "with_positions_payloads",
+        "analyzer": "payload_delimiter"
+      }
+    }
+  },
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "payload_delimiter": {
+          "tokenizer": "whitespace",
+          "filter": [ "delimited_payload" ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
 
-The older name `delimited_payload_filter` is deprecated and should not be used for new indices. Use `delimited_payload` instead.
+Add a document containing payloads to the index.
 
-============================================
+[source,console]
+--------------------------------------------------
+POST text_payloads/_doc/1
+{
+  "text": "the|0 brown|3 fox|4 is|0 quick|10"
+}
+--------------------------------------------------
+// TEST[continued]
 
-Example: "the|1 quick|2 fox|3" is split by default into tokens `the`, `quick`, and `fox` with payloads `1`, `2`, and `3` respectively.
+Use the <<docs-termvectors,term vectors API>> to return the document's tokens
+and base64-encoded payloads.
 
-Parameters:
+[source,console]
+--------------------------------------------------
+GET text_payloads/_termvectors/1
+{
+  "fields": [ "text" ],
+  "payloads": true
+}
+--------------------------------------------------
+// TEST[continued]
 
-`delimiter`:: 
-    Character used for splitting the tokens. Default is `|`. 
+The API returns the following response:
 
-`encoding`:: 
-    The type of the payload. `int` for integer, `float` for float and `identity` for characters. Default is `float`.
+[source,console-result]
+--------------------------------------------------
+{
+  "_index": "text_payloads",
+  "_id": "1",
+  "_version": 1,
+  "found": true,
+  "took": 8,
+  "term_vectors": {
+    "text": {
+      "field_statistics": {
+        "sum_doc_freq": 5,
+        "doc_count": 1,
+        "sum_ttf": 5
+      },
+      "terms": {
+        "brown": {
+          "term_freq": 1,
+          "tokens": [
+            {
+              "position": 1,
+              "payload": "QEAAAA=="
+            }
+          ]
+        },
+        "fox": {
+          "term_freq": 1,
+          "tokens": [
+            {
+              "position": 2,
+              "payload": "QIAAAA=="
+            }
+          ]
+        },
+        "is": {
+          "term_freq": 1,
+          "tokens": [
+            {
+              "position": 3,
+              "payload": "AAAAAA=="
+            }
+          ]
+        },
+        "quick": {
+          "term_freq": 1,
+          "tokens": [
+            {
+              "position": 4,
+              "payload": "QSAAAA=="
+            }
+          ]
+        },
+        "the": {
+          "term_freq": 1,
+          "tokens": [
+            {
+              "position": 0,
+              "payload": "AAAAAA=="
+            }
+          ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// TESTRESPONSE[s/"took": 8/"took": "$body.took"/]
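+
+With the default `float` encoding, these base64 payloads decode (for example,
+using the `decode_float_payload` sketch above) to `0.0`, `3.0`, `4.0`, and
+`10.0`, matching the payloads in the indexed text
+`the|0 brown|3 fox|4 is|0 quick|10`.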