
[DOCS] Reformat n-gram token filter docs (#49438)

Reformats the edge n-gram and n-gram token filter docs. Changes include:

* Adds title abbreviations
* Updates the descriptions and adds Lucene links
* Reformats parameter definitions
* Adds analyze and custom analyzer snippets
* Adds notes explaining differences between the edge n-gram and n-gram
  filters

Additional changes:
* Switches titles to use "n-gram" throughout
* Fixes a typo in the edge n-gram tokenizer docs
* Adds an explicit anchor for the `index.max_ngram_diff` setting
James Rodewig · commit ddf5c0a76a · 5 years ago

+ 239 - 11
docs/reference/analysis/tokenfilters/edgengram-tokenfilter.asciidoc

@@ -1,16 +1,244 @@
 [[analysis-edgengram-tokenfilter]]
-=== Edge NGram Token Filter
+=== Edge n-gram token filter
+++++
+<titleabbrev>Edge n-gram</titleabbrev>
+++++
 
-A token filter of type `edge_ngram`.
+Forms an https://en.wikipedia.org/wiki/N-gram[n-gram] of a specified length from
+the beginning of a token.
 
-The following are settings that can be set for a `edge_ngram` token
-filter type:
+For example, you can use the `edge_ngram` token filter to change `quick` to
+`qu`.
 
-[cols="<,<",options="header",]
-|======================================================
-|Setting |Description
-|`min_gram` |Defaults to `1`.
-|`max_gram` |Defaults to `2`.
-|`side` |deprecated. Either `front` or `back`. Defaults to `front`.
-|======================================================
+By default, the filter creates 1-character edge n-grams.
 
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.html[EdgeNGramTokenFilter].
+
+[NOTE]
+====
+The `edge_ngram` filter is similar to the <<analysis-ngram-tokenfilter,`ngram`
+token filter>>. However, the `edge_ngram` filter only outputs n-grams that
+start at the beginning of a token. These edge n-grams are useful for
+<<search-as-you-type,search-as-you-type>> queries.
+====
+
+[[analysis-edgengram-tokenfilter-analyze-ex]]
+==== Example
+
+The following <<indices-analyze,analyze API>> request uses the `edge_ngram`
+filter to convert `the quick brown fox jumps` to 1-character and 2-character
+edge n-grams:
+
+[source,console]
+--------------------------------------------------
+GET _analyze
+{
+  "tokenizer": "standard",
+  "filter": [
+    { "type": "edge_ngram",
+      "min_gram": 1,
+      "max_gram": 2
+    }
+  ],
+  "text": "the quick brown fox jumps"
+}
+--------------------------------------------------
+
+The filter produces the following tokens:
+
+[source,text]
+--------------------------------------------------
+[ t, th, q, qu, b, br, f, fo, j, ju ]
+--------------------------------------------------
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "t",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "th",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "q",
+      "start_offset" : 4,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "qu",
+      "start_offset" : 4,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "b",
+      "start_offset" : 10,
+      "end_offset" : 15,
+      "type" : "<ALPHANUM>",
+      "position" : 2
+    },
+    {
+      "token" : "br",
+      "start_offset" : 10,
+      "end_offset" : 15,
+      "type" : "<ALPHANUM>",
+      "position" : 2
+    },
+    {
+      "token" : "f",
+      "start_offset" : 16,
+      "end_offset" : 19,
+      "type" : "<ALPHANUM>",
+      "position" : 3
+    },
+    {
+      "token" : "fo",
+      "start_offset" : 16,
+      "end_offset" : 19,
+      "type" : "<ALPHANUM>",
+      "position" : 3
+    },
+    {
+      "token" : "j",
+      "start_offset" : 20,
+      "end_offset" : 25,
+      "type" : "<ALPHANUM>",
+      "position" : 4
+    },
+    {
+      "token" : "ju",
+      "start_offset" : 20,
+      "end_offset" : 25,
+      "type" : "<ALPHANUM>",
+      "position" : 4
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-edgengram-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+`edge_ngram` filter to configure a new 
+<<analysis-custom-analyzer,custom analyzer>>.
+
+[source,console]
+--------------------------------------------------
+PUT edge_ngram_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "standard_edge_ngram": {
+          "tokenizer": "standard",
+          "filter": [ "edge_ngram" ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
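+
+As a quick check (assuming the index above exists), you can run an
+<<indices-analyze,analyze API>> request against the new analyzer. With the
+built-in filter's default `max_gram` of `1`, the text `quick` should return
+the single token `q`:
+
+[source,console]
+--------------------------------------------------
+GET edge_ngram_example/_analyze
+{
+  "analyzer": "standard_edge_ngram",
+  "text": "quick"
+}
+--------------------------------------------------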
+
+[[analysis-edgengram-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+`max_gram`::
++
+--
+(Optional, integer)
+Maximum character length of a gram. For custom token filters, defaults to `2`.
+For the built-in `edge_ngram` filter, defaults to `1`.
+
+See <<analysis-edgengram-tokenfilter-max-gram-limits>>.
+--
+
+`min_gram`::
+(Optional, integer)
+Minimum character length of a gram. Defaults to `1`.
+
+`side`::
++
+--
+(Optional, string)
+Deprecated. Indicates whether to truncate tokens from the `front` or `back`.
+Defaults to `front`.
+
+Instead of using the `back` value, you can use the
+<<analysis-reverse-tokenfilter,`reverse`>> token filter before and after the
+`edge_ngram` filter to achieve the same results.
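+
+A minimal sketch of this `reverse` approach, using the analyze API (the
+gram lengths here are illustrative):
+
+[source,console]
+--------------------------------------------------
+GET _analyze
+{
+  "tokenizer": "keyword",
+  "filter": [
+    "reverse",
+    { "type": "edge_ngram", "min_gram": 1, "max_gram": 2 },
+    "reverse"
+  ],
+  "text": "quick"
+}
+--------------------------------------------------
+
+Reversing the token before and after the `edge_ngram` filter produces the
+back edge n-grams `[ k, ck ]`.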
+--
+
+[[analysis-edgengram-tokenfilter-customize]]
+==== Customize
+
+To customize the `edge_ngram` filter, duplicate it to create the basis
+for a new custom token filter. You can modify the filter using its configurable
+parameters.
+
+For example, the following request creates a custom `edge_ngram`
+filter that forms edge n-grams between 3 and 5 characters in length.
+
+[source,console]
+--------------------------------------------------
+PUT edge_ngram_custom_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "default": {
+          "tokenizer": "whitespace",
+          "filter": [ "3_5_edgegrams" ]
+        }
+      },
+      "filter": {
+        "3_5_edgegrams": {
+          "type": "edge_ngram",
+          "min_gram": 3,
+          "max_gram": 5
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
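+
+As a quick check (assuming the index above exists), the following analyze
+request should return the tokens `[ ela, elas, elast ]` for the text
+`elasticsearch`:
+
+[source,console]
+--------------------------------------------------
+GET edge_ngram_custom_example/_analyze
+{
+  "analyzer": "default",
+  "text": "elasticsearch"
+}
+--------------------------------------------------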
+
+[[analysis-edgengram-tokenfilter-max-gram-limits]]
+==== Limitations of the `max_gram` parameter
+
+The `edge_ngram` filter's `max_gram` value limits the character length of
+tokens. When the `edge_ngram` filter is used with an index analyzer, this
+means search terms longer than the `max_gram` length may not match any indexed
+terms.
+
+For example, if the `max_gram` is `3`, searches for `apple` won't match the
+indexed term `app`.
+
+To account for this, you can use the
+<<analysis-truncate-tokenfilter,`truncate`>> filter with a search analyzer
+to shorten search terms to the `max_gram` character length. However, this could
+return irrelevant results.
+
+For example, if the `max_gram` is `3` and search terms are truncated to three
+characters, the search term `apple` is shortened to `app`. This means searches
+for `apple` return any indexed terms matching `app`, such as `apply`, `snapped`,
+and `apple`.
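+
+The following sketch shows one way to wire this up; the index, analyzer, and
+filter names are illustrative. The index analyzer emits edge n-grams up to
+the `max_gram` length, while the search analyzer truncates search terms to
+the same length:
+
+[source,console]
+--------------------------------------------------
+PUT edge_ngram_truncate_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "3_char_edge_ngram": {
+          "type": "edge_ngram",
+          "min_gram": 1,
+          "max_gram": 3
+        },
+        "3_char_truncate": {
+          "type": "truncate",
+          "length": 3
+        }
+      },
+      "analyzer": {
+        "index_edge_ngram": {
+          "tokenizer": "standard",
+          "filter": [ "3_char_edge_ngram" ]
+        },
+        "search_truncate": {
+          "tokenizer": "standard",
+          "filter": [ "3_char_truncate" ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "title": {
+        "type": "text",
+        "analyzer": "index_edge_ngram",
+        "search_analyzer": "search_truncate"
+      }
+    }
+  }
+}
+--------------------------------------------------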
+
+We recommend testing both approaches to see which best fits your
+use case and desired search experience.

+ 222 - 12
docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc

@@ -1,18 +1,228 @@
 [[analysis-ngram-tokenfilter]]
-=== NGram Token Filter
+=== N-gram token filter
+++++
+<titleabbrev>N-gram</titleabbrev>
+++++
 
-A token filter of type `ngram`.
+Forms https://en.wikipedia.org/wiki/N-gram[n-grams] of specified lengths from
+a token.
 
-The following are settings that can be set for a `ngram` token filter
-type:
+For example, you can use the `ngram` token filter to change `fox` to
+`[ f, fo, o, ox, x ]`.
 
-[cols="<,<",options="header",]
-|============================
-|Setting |Description
-|`min_gram` |Defaults to `1`.
-|`max_gram` |Defaults to `2`.
-|============================
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/ngram/NGramTokenFilter.html[NGramTokenFilter].
 
-The index level setting `index.max_ngram_diff` controls the maximum allowed
-difference between `max_gram` and `min_gram`.
+[NOTE]
+====
+The `ngram` filter is similar to the
+<<analysis-edgengram-tokenfilter,`edge_ngram` token filter>>. However, the
+`edge_ngram` filter only outputs n-grams that start at the beginning of a
+token.
+====
 
+[[analysis-ngram-tokenfilter-analyze-ex]]
+==== Example
+
+The following <<indices-analyze,analyze API>> request uses the `ngram`
+filter to convert `Quick fox` to 1-character and 2-character n-grams:
+
+[source,console]
+--------------------------------------------------
+GET _analyze
+{
+  "tokenizer": "standard",
+  "filter": [ "ngram" ],
+  "text": "Quick fox"
+}
+--------------------------------------------------
+
+The filter produces the following tokens:
+
+[source,text]
+--------------------------------------------------
+[ Q, Qu, u, ui, i, ic, c, ck, k, f, fo, o, ox, x ]
+--------------------------------------------------
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens" : [
+    {
+      "token" : "Q",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "Qu",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "u",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "ui",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "i",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "ic",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "c",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "ck",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "k",
+      "start_offset" : 0,
+      "end_offset" : 5,
+      "type" : "<ALPHANUM>",
+      "position" : 0
+    },
+    {
+      "token" : "f",
+      "start_offset" : 6,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "fo",
+      "start_offset" : 6,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "o",
+      "start_offset" : 6,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "ox",
+      "start_offset" : 6,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    },
+    {
+      "token" : "x",
+      "start_offset" : 6,
+      "end_offset" : 9,
+      "type" : "<ALPHANUM>",
+      "position" : 1
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-ngram-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the `ngram`
+filter to configure a new <<analysis-custom-analyzer,custom analyzer>>.
+
+[source,console]
+--------------------------------------------------
+PUT ngram_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "standard_ngram": {
+          "tokenizer": "standard",
+          "filter": [ "ngram" ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
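+
+As a quick check (assuming the index above exists), you can run an
+<<indices-analyze,analyze API>> request against the new analyzer. With the
+filter's default `min_gram` of `1` and `max_gram` of `2`, the text `fox`
+should return the tokens `[ f, fo, o, ox, x ]`:
+
+[source,console]
+--------------------------------------------------
+GET ngram_example/_analyze
+{
+  "analyzer": "standard_ngram",
+  "text": "fox"
+}
+--------------------------------------------------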
+
+[[analysis-ngram-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+`max_gram`::
+(Optional, integer)
+Maximum character length of a gram. Defaults to `2`.
+
+`min_gram`::
+(Optional, integer)
+Minimum character length of a gram. Defaults to `1`.
+
+You can use the <<index-max-ngram-diff,`index.max_ngram_diff`>> index-level
+setting to control the maximum allowed difference between the `max_gram` and
+`min_gram` values.
+
+[[analysis-ngram-tokenfilter-customize]]
+==== Customize
+
+To customize the `ngram` filter, duplicate it to create the basis for a new
+custom token filter. You can modify the filter using its configurable
+parameters.
+
+For example, the following request creates a custom `ngram` filter that forms
+n-grams between 3 and 5 characters in length. The request also increases the
+`index.max_ngram_diff` setting to `2`.
+
+[source,console]
+--------------------------------------------------
+PUT ngram_custom_example
+{
+  "settings": {
+    "index": {
+      "max_ngram_diff": 2
+    },
+    "analysis": {
+      "analyzer": {
+        "default": {
+          "tokenizer": "whitespace",
+          "filter": [ "3_5_grams" ]
+        }
+      },
+      "filter": {
+        "3_5_grams": {
+          "type": "ngram",
+          "min_gram": 3,
+          "max_gram": 5
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
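+
+As a quick check (assuming the index above exists), the following analyze
+request should return the tokens `[ fox, jum, jump, jumpe, ump, umpe, umped,
+mpe, mped, ped ]` for the text `fox jumped`:
+
+[source,console]
+--------------------------------------------------
+GET ngram_custom_example/_analyze
+{
+  "analyzer": "default",
+  "text": "fox jumped"
+}
+--------------------------------------------------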

+ 5 - 4
docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc

@@ -1,5 +1,5 @@
 [[analysis-edgengram-tokenizer]]
-=== Edge NGram Tokenizer
+=== Edge n-gram tokenizer
 
 The `edge_ngram` tokenizer first breaks text down into words whenever it
 encounters one of a list of specified characters, then it emits
@@ -116,9 +116,10 @@ terms.
 For example, if the `max_gram` is `3`, searches for `apple` won't match the
 indexed term `app`.
 
-To account for this, you can use the <<analysis-truncate-tokenfilter,`truncate`
-token filter>> token filter with a search analyzer to shorten search terms to
-the `max_gram` character length. However, this could return irrelevant results.
+To account for this, you can use the
+<<analysis-truncate-tokenfilter,`truncate`>> token filter with a search analyzer
+to shorten search terms to the `max_gram` character length. However, this could
+return irrelevant results.
 
 For example, if the `max_gram` is `3` and search terms are truncated to three
 characters, the search term `apple` is shortened to `app`. This means searches

+ 1 - 1
docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc

@@ -1,5 +1,5 @@
 [[analysis-ngram-tokenizer]]
-=== NGram Tokenizer
+=== N-gram tokenizer
 
 The `ngram` tokenizer first breaks text down into words whenever it encounters
 one of a list of specified characters, then it emits

+ 1 - 0
docs/reference/index-modules.asciidoc

@@ -152,6 +152,7 @@ specific index module:
     The maximum number of `script_fields` that are allowed in a query.
     Defaults to `32`.
 
+[[index-max-ngram-diff]]
 `index.max_ngram_diff`::
 
     The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter.