5 years ago · 00ab16ff97
--- a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc
@@ -4,46 +4,507 @@
 
				 <titleabbrev>Shingle</titleabbrev>
			
 
				 ++++
			
 
				 
			
 
				-NOTE: Shingles are generally used to help speed up phrase queries.  Rather
			
 
				-than building filter chains by hand, you may find it easier to use the
			
 
				-<<index-phrases,`index-phrases`>> option on a text field.
			
 
				-
			
 
				-A token filter of type `shingle` that constructs shingles (token
			
 
				-n-grams) from a token stream. In other words, it creates combinations of
			
 
				-tokens as a single token. For example, the sentence "please divide this
			
 
				-sentence into shingles" might be tokenized into shingles "please
			
 
				-divide", "divide this", "this sentence", "sentence into", and "into
			
 
				-shingles".
			
 
				-
			
 
				-This filter handles position increments > 1 by inserting filler tokens
			
 
				-(tokens with termtext "_"). It does not handle a position increment of
			
 
				-0.
			
 
				-
			
 
				-The following are settings that can be set for a `shingle` token filter
			
 
				-type:
			
 
				-
			
 
				-[cols="<,<",options="header",]
			
 
				-|=======================================================================
			
 
				-|Setting |Description
			
 
				-|`max_shingle_size` |The maximum shingle size. Defaults to `2`.
			
 
				-
			
 
				-|`min_shingle_size` |The minimum shingle size. Defaults to `2`.
			
 
				-
			
 
				-|`output_unigrams` |If `true` the output will contain the input tokens
			
 
				-(unigrams) as well as the shingles. Defaults to `true`.
			
 
				-
			
 
				-|`output_unigrams_if_no_shingles` |If `output_unigrams` is `false` the
			
 
				-output will contain the input tokens (unigrams) if no shingles are
			
 
				-available. Note if `output_unigrams` is set to `true` this setting has
			
 
				-no effect. Defaults to `false`.
			
 
				-
			
 
				-|`token_separator` |The string to use when joining adjacent tokens to
			
 
				-form a shingle. Defaults to `" "`.
			
 
				-|`filler_token` | The string to use as a replacement for each position 
			
 
				-at which there is no actual token in the stream. For instance this string is
			
 
				-used if the position increment is greater than one when a `stop` filter is used
			
 
				-together with the `shingle` filter. Defaults to `"_"`
			
 
				-|=======================================================================
			
 
				-
			
 
				-The index level setting `index.max_shingle_diff` controls the maximum allowed
			
 
				-difference between `max_shingle_size` and `min_shingle_size`.
			
 
				+Add shingles, or word https://en.wikipedia.org/wiki/N-gram[n-grams], to a token
			
 
				+stream by concatenating adjacent tokens. By default, the `shingle` token filter
			
 
				+outputs two-word shingles and unigrams.
			
 
				+
			
 
				+For example, many tokenizers convert `the lazy dog` to `[ the, lazy, dog ]`. You
			
 
				+can use the `shingle` filter to add two-word shingles to this stream:
			
 
				+`[ the, the lazy, lazy, lazy dog, dog ]`.
			
 
				+
			
 
				+TIP: Shingles are often used to help speed up phrase queries, such as
			
 
				+<<query-dsl-match-query-phrase,`match_phrase`>>. Rather than creating shingles
			
 
				+using the `shingles` filter, we recommend you use the
			
 
				+<<index-phrases,`index-phrases`>> mapping parameter on the appropriate
			
 
				+<<text,text>> field instead.
			
 
				+
			
 
				+This filter uses Lucene's
			
 
				+{lucene-analysis-docs}/shingle/ShingleFilter.html[ShingleFilter].
			
 
				+
			
 
				+[[analysis-shingle-tokenfilter-analyze-ex]]
			
 
				+==== Example
			
 
				+
			
 
				+The following <<indices-analyze,analyze API>> request uses the `shingle`
			
 
				+filter to add two-word shingles to the token stream for `quick brown fox jumps`:
			
 
				+
			
 
				+[source,console]
			
 
				+----
			
 
				+GET /_analyze
			
 
				+{
			
 
				+  "tokenizer": "whitespace",
			
 
				+  "filter": [ "shingle" ],
			
 
				+  "text": "quick brown fox jumps"
			
 
				+}
			
 
				+----
			
 
				+
			
 
				+The filter produces the following tokens:
			
 
				+
			
 
				+[source,text]
			
 
				+----
			
 
				+[ quick, quick brown, brown, brown fox, fox, fox jumps, jumps ]
			
 
				+----
			
 
				+
			
 
				+////
			
 
				+[source,console-result]
			
 
				+----
			
 
				+{
			
 
				+  "tokens": [
			
 
				+    {
			
 
				+      "token": "quick",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 5,
			
 
				+      "type": "word",
			
 
				+      "position": 0
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "quick brown",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 11,
			
 
				+      "type": "shingle",
			
 
				+      "position": 0,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "brown",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 11,
			
 
				+      "type": "word",
			
 
				+      "position": 1
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "brown fox",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "shingle",
			
 
				+      "position": 1,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "fox",
			
 
				+      "start_offset": 12,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "word",
			
 
				+      "position": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "fox jumps",
			
 
				+      "start_offset": 12,
			
 
				+      "end_offset": 21,
			
 
				+      "type": "shingle",
			
 
				+      "position": 2,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "jumps",
			
 
				+      "start_offset": 16,
			
 
				+      "end_offset": 21,
			
 
				+      "type": "word",
			
 
				+      "position": 3
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
 
				+----
			
 
				+////
			
 
				+
			
 
				+To produce shingles of 2-3 words, add the following arguments to the analyze API
			
 
				+request:
			
 
				+
			
 
				+* `min_shingle_size`: `2`
			
 
				+* `max_shingle_size`: `3`
			
 
				+
			
 
				+[source,console]
			
 
				+----
			
 
				+GET /_analyze
			
 
				+{
			
 
				+  "tokenizer": "whitespace",
			
 
				+  "filter": [
			
 
				+    {
			
 
				+      "type": "shingle",
			
 
				+      "min_shingle_size": 2,
			
 
				+      "max_shingle_size": 3
			
 
				+    }
			
 
				+  ],
			
 
				+  "text": "quick brown fox jumps"
			
 
				+}
			
 
				+----
			
 
				+
			
 
				+The filter produces the following tokens:
			
 
				+
			
 
				+[source,text]
			
 
				+----
			
 
				+[ quick, quick brown, quick brown fox, brown, brown fox, brown fox jumps, fox, fox jumps, jumps ]
			
 
				+----
			
 
				+
			
 
				+////
			
 
				+[source, console-result]
			
 
				+----
			
 
				+{
			
 
				+  "tokens": [
			
 
				+    {
			
 
				+      "token": "quick",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 5,
			
 
				+      "type": "word",
			
 
				+      "position": 0
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "quick brown",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 11,
			
 
				+      "type": "shingle",
			
 
				+      "position": 0,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "quick brown fox",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "shingle",
			
 
				+      "position": 0,
			
 
				+      "positionLength": 3
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "brown",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 11,
			
 
				+      "type": "word",
			
 
				+      "position": 1
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "brown fox",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "shingle",
			
 
				+      "position": 1,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "brown fox jumps",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 21,
			
 
				+      "type": "shingle",
			
 
				+      "position": 1,
			
 
				+      "positionLength": 3
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "fox",
			
 
				+      "start_offset": 12,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "word",
			
 
				+      "position": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "fox jumps",
			
 
				+      "start_offset": 12,
			
 
				+      "end_offset": 21,
			
 
				+      "type": "shingle",
			
 
				+      "position": 2,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "jumps",
			
 
				+      "start_offset": 16,
			
 
				+      "end_offset": 21,
			
 
				+      "type": "word",
			
 
				+      "position": 3
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
 
				+----
			
 
				+////
			
 
				+
			
 
				+To only include shingles in the output, add an `output_unigrams` argument of
			
 
				+`false` to the request.
			
 
				+
			
 
				+[source,console]
			
 
				+----
			
 
				+GET /_analyze
			
 
				+{
			
 
				+  "tokenizer": "whitespace",
			
 
				+  "filter": [
			
 
				+    {
			
 
				+      "type": "shingle",
			
 
				+      "min_shingle_size": 2,
			
 
				+      "max_shingle_size": 3,
			
 
				+      "output_unigrams": false
			
 
				+    }
			
 
				+  ],
			
 
				+  "text": "quick brown fox jumps"
			
 
				+}
			
 
				+----
			
 
				+
			
 
				+The filter produces the following tokens:
			
 
				+
			
 
				+[source,text]
			
 
				+----
			
 
				+[ quick brown, quick brown fox, brown fox, brown fox jumps, fox jumps ]
			
 
				+----
			
 
				+
			
 
				+////
			
 
				+[source, console-result]
			
 
				+----
			
 
				+{
			
 
				+  "tokens": [
			
 
				+    {
			
 
				+      "token": "quick brown",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 11,
			
 
				+      "type": "shingle",
			
 
				+      "position": 0
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "quick brown fox",
			
 
				+      "start_offset": 0,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "shingle",
			
 
				+      "position": 0,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "brown fox",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 15,
			
 
				+      "type": "shingle",
			
 
				+      "position": 1
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "brown fox jumps",
			
 
				+      "start_offset": 6,
			
 
				+      "end_offset": 21,
			
 
				+      "type": "shingle",
			
 
				+      "position": 1,
			
 
				+      "positionLength": 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token": "fox jumps",
			
 
				+      "start_offset": 12,
			
 
				+      "end_offset": 21,
			
 
				+      "type": "shingle",
			
 
				+      "position": 2
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
 
				+----
			
 
				+////
			
 
				+
			
 
				+[[analysis-shingle-tokenfilter-analyzer-ex]]
			
 
				+==== Add to an analyzer
			
 
				+
			
 
				+The following <<indices-create-index,create index API>> request uses the
			
 
				+`shingle` filter to configure a new <<analysis-custom-analyzer,custom
			
 
				+analyzer>>.
			
 
				+
			
 
				+[source,console]
			
 
				+----
			
 
				+PUT /my_index
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "standard_shingle": {
			
 
				+          "tokenizer": "standard",
			
 
				+          "filter": [ "shingle" ]
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+----
			
 
				+
			
 
				+[[analysis-shingle-tokenfilter-configure-parms]]
			
 
				+==== Configurable parameters
			
 
				+
			
 
				+`max_shingle_size`::
			
 
				+(Optional, integer)
			
 
				+Maximum number of tokens to concatenate when creating shingles. Defaults to `2`.
			
 
				++
			
 
				+NOTE: This value cannot be lower than the `min_shingle_size` argument, which
			
 
				+defaults to `2`. The difference between this value and the `min_shingle_size`
			
 
				+argument cannot exceed the <<index-max-shingle-diff,`index.max_shingle_diff`>>
			
 
				+index-level setting, which defaults to `3`.
			
 
				+
			
 
				+`min_shingle_size`::
			
 
				+(Optional, integer)
			
 
				+Minimum number of tokens to concatenate when creating shingles. Defaults to `2`.
			
 
				++
			
 
				+NOTE: This value cannot exceed the `max_shingle_size` argument, which defaults
			
 
				+to `2`. The difference between the `max_shingle_size` argument and this value
			
 
				+cannot exceed the <<index-max-shingle-diff,`index.max_shingle_diff`>>
			
 
				+index-level setting, which defaults to `3`.
			
 
				+
			
 
				+`output_unigrams`::
			
 
				+(Optional, boolean)
			
 
				+If `true`, the output includes the original input tokens. If `false`, the output
			
 
				+only includes shingles; the original input tokens are removed. Defaults to
			
 
				+`true`.
			
 
				+
			
 
				+`output_unigrams_if_no_shingles`::
			
 
				+If `true`, the output includes the original input tokens only if no shingles are
			
 
				+produced; if shingles are produced, the output only includes shingles. Defaults
			
 
				+to `false`.
			
 
				++
			
 
				+IMPORTANT: If both this and the `output_unigrams` parameter are `true`, only the
			
 
				+`output_unigrams` argument is used.
			
 
				+
			
 
				+`token_separator`::
			
 
				+(Optional, string)
			
 
				+Separator used to concatenate adjacent tokens to form a shingle. Defaults to a
			
 
				+space (`" "`).
			
 
				+
			
 
				+`filler_token`::
			
 
				++
			
 
				+--
			
 
				+(Optional, string)
			
 
				+String used in shingles as a replacement for empty positions that do not contain
			
 
				+a token. This filler token is only used in shingles, not original unigrams.
			
 
				+Defaults to an underscore (`_`).
			
 
				+
			
 
				+Some token filters, such as the `stop` filter, create empty positions when
			
 
				+removing stop words with a position increment greater than one.
			
 
				+
			
 
				+.*Example*
			
 
				+[%collapsible]
			
 
				+====
			
 
				+In the following <<indices-analyze,analyze API>> request, the `stop` filter
			
 
				+removes the stop word `a` from `fox jumps a lazy dog`, creating an empty
			
 
				+position. The subsequent `shingle` filter replaces this empty position with a
			
 
				+plus sign (`+`) in shingles.
			
 
				+
			
 
				+[source,console]
			
 
				+----
			
 
				+GET /_analyze
			
 
				+{
			
 
				+  "tokenizer": "whitespace",
			
 
				+  "filter": [
			
 
				+    {
			
 
				+      "type": "stop",
			
 
				+      "stopwords": [ "a" ]
			
 
				+    },
			
 
				+    {
			
 
				+      "type": "shingle",
			
 
				+      "filler_token": "+"
			
 
				+    }
			
 
				+  ],
			
 
				+  "text": "fox jumps a lazy dog"
			
 
				+}
			
 
				+----
			
 
				+
			
 
				+The filter produces the following tokens:
			
 
				+
			
 
				+[source,text]
			
 
				+----
			
 
				+[ fox, fox jumps, jumps, jumps +, + lazy, lazy, lazy dog, dog ]
			
 
				+----
			
 
				+
			
 
				+////
			
 
				+[source, console-result]
			
 
				+----
			
 
				+{
			
 
				+  "tokens" : [
			
 
				+    {
			
 
				+      "token" : "fox",
			
 
				+      "start_offset" : 0,
			
 
				+      "end_offset" : 3,
			
 
				+      "type" : "word",
			
 
				+      "position" : 0
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "fox jumps",
			
 
				+      "start_offset" : 0,
			
 
				+      "end_offset" : 9,
			
 
				+      "type" : "shingle",
			
 
				+      "position" : 0,
			
 
				+      "positionLength" : 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "jumps",
			
 
				+      "start_offset" : 4,
			
 
				+      "end_offset" : 9,
			
 
				+      "type" : "word",
			
 
				+      "position" : 1
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "jumps +",
			
 
				+      "start_offset" : 4,
			
 
				+      "end_offset" : 12,
			
 
				+      "type" : "shingle",
			
 
				+      "position" : 1,
			
 
				+      "positionLength" : 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "+ lazy",
			
 
				+      "start_offset" : 12,
			
 
				+      "end_offset" : 16,
			
 
				+      "type" : "shingle",
			
 
				+      "position" : 2,
			
 
				+      "positionLength" : 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "lazy",
			
 
				+      "start_offset" : 12,
			
 
				+      "end_offset" : 16,
			
 
				+      "type" : "word",
			
 
				+      "position" : 3
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "lazy dog",
			
 
				+      "start_offset" : 12,
			
 
				+      "end_offset" : 20,
			
 
				+      "type" : "shingle",
			
 
				+      "position" : 3,
			
 
				+      "positionLength" : 2
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "dog",
			
 
				+      "start_offset" : 17,
			
 
				+      "end_offset" : 20,
			
 
				+      "type" : "word",
			
 
				+      "position" : 4
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
 
				+----
			
 
				+////
			
 
				+====
			
 
				+--
			
 
				+
			
 
				+[[analysis-shingle-tokenfilter-customize]]
			
 
				+==== Customize
			
 
				+
			
 
				+To customize the `shingle` filter, duplicate it to create the basis for a new
			
 
				+custom token filter. You can modify the filter using its configurable
			
 
				+parameters.
			
 
				+
			
 
				+For example, the following <<indices-create-index,create index API>> request
			
 
				+uses a custom `shingle` filter, `my_shingle_filter`, to configure a new
			
 
				+<<analysis-custom-analyzer,custom analyzer>>.
			
 
				+
			
 
				+The `my_shingle_filter` filter uses a `min_shingle_size` of `2` and a
			
 
				+`max_shingle_size` of `5`, meaning it produces shingles of 2-5 words.
			
 
				+The filter also includes a `output_unigrams` argument of `false`, meaning that
			
 
				+only shingles are included in the output.
			
 
				+
			
 
				+[source,console]
			
 
				+----
			
 
				+PUT /my_index
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "en": {
			
 
				+          "tokenizer": "standard",
			
 
				+          "filter": [ "my_shingle_filter" ]
			
 
				+        }
			
 
				+      },
			
 
				+      "filter": {
			
 
				+        "my_shingle_filter": {
			
 
				+          "type": "shingle",
			
 
				+          "min_shingle_size": 2,
			
 
				+          "max_shingle_size": 5,
			
 
				+          "output_unigrams": false
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+----
			
--- a/docs/reference/index-modules.asciidoc
+++ b/docs/reference/index-modules.asciidoc
@@ -166,10 +166,12 @@ specific index module:
 
				     The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter.
			
 
				     Defaults to `1`.
			
 
				 
			
 
				+[[index-max-shingle-diff]]
			
 
				 `index.max_shingle_diff`::
			
 
				 
			
 
				-    The maximum allowed difference between max_shingle_size and min_shingle_size for ShingleTokenFilter.
			
 
				-    Defaults to `3`.
			
 
				+    The maximum allowed difference between max_shingle_size and min_shingle_size
			
 
				+    for the <<analysis-shingle-tokenfilter,`shingle` token filter>>. Defaults to
			
 
				+    `3`.
			
 
				 
			
 
				 `index.blocks.read_only`::