
Docs: Improved tokenizer docs (#18356)

* Docs: Improved tokenizer docs

Added descriptions and runnable examples

* Addressed Nik's comments

* Added TESTRESPONSEs for all tokenizer examples

* Added TESTRESPONSEs for all analyzer examples too

* Added docs, examples, and TESTRESPONSES for character filters

* Skipping two tests:

One interprets "$1" as a stashed variable - the same problem exists with the REST tests

The other because the "took" value is always different

* Fixed tests with "took"

* Fixed failing tests and removed preserve_original from fingerprint analyzer
Clinton Gormley 9 years ago
Parent
Commit
5da9e5dcbc
26 changed files with 3847 additions and 314 deletions
  1. docs/reference/analysis/analyzers/configuring.asciidoc (+35 -0)
  2. docs/reference/analysis/analyzers/custom-analyzer.asciidoc (+85 -3)
  3. docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc (+47 -12)
  4. docs/reference/analysis/analyzers/keyword-analyzer.asciidoc (+21 -0)
  5. docs/reference/analysis/analyzers/pattern-analyzer.asciidoc (+203 -0)
  6. docs/reference/analysis/analyzers/simple-analyzer.asciidoc (+91 -0)
  7. docs/reference/analysis/analyzers/standard-analyzer.asciidoc (+174 -0)
  8. docs/reference/analysis/analyzers/stop-analyzer.asciidoc (+147 -0)
  9. docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc (+84 -0)
  10. docs/reference/analysis/charfilters.asciidoc (+27 -7)
  11. docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc (+132 -2)
  12. docs/reference/analysis/charfilters/mapping-charfilter.asciidoc (+185 -25)
  13. docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc (+241 -29)
  14. docs/reference/analysis/tokenizers.asciidoc (+116 -14)
  15. docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc (+265 -15)
  16. docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc (+298 -55)
  17. docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc (+54 -9)
  18. docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc (+120 -4)
  19. docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc (+125 -12)
  20. docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc (+290 -41)
  21. docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc (+163 -20)
  22. docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc (+265 -35)
  23. docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc (+270 -14)
  24. docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc (+103 -4)
  25. docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc (+194 -11)
  26. docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc (+112 -2)

+ 35 - 0
docs/reference/analysis/analyzers/configuring.asciidoc

@@ -64,3 +64,38 @@ POST my_index/_analyze
     English stop words will be removed.  The resulting terms are:
     `[ old, brown, cow ]`
 
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "old",
+      "start_offset": 4,
+      "end_offset": 7,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 8,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "cow",
+      "start_offset": 14,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+

+ 85 - 3
docs/reference/analysis/analyzers/custom-analyzer.asciidoc

@@ -84,6 +84,48 @@ POST my_index/_analyze
 --------------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "is",
+      "start_offset": 0,
+      "end_offset": 2,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "this",
+      "start_offset": 3,
+      "end_offset": 7,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "deja",
+      "start_offset": 11,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "vu",
+      "start_offset": 16,
+      "end_offset": 22,
+      "type": "<ALPHANUM>",
+      "position": 3
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above example produces the following terms:
 
 [source,text]
@@ -119,13 +161,10 @@ PUT my_index
       "analyzer": {
         "my_custom_analyzer": {
           "type": "custom",
-
           "char_filter": [
             "emoticons" <1>
           ],
-
           "tokenizer": "punctuation", <1>
-
           "filter": [
             "lowercase",
             "english_stop" <1>
@@ -165,11 +204,54 @@ POST my_index/_analyze
   "text":     "I'm a :) person, and you?"
 }
 --------------------------------------------------
+// CONSOLE
 
 <1> The `emoticon` character filter, `punctuation` tokenizer and
     `english_stop` token filter are custom implementations which are defined
     in the same index settings.
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "i'm",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "_happy_",
+      "start_offset": 6,
+      "end_offset": 8,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "person",
+      "start_offset": 9,
+      "end_offset": 15,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "you",
+      "start_offset": 21,
+      "end_offset": 24,
+      "type": "word",
+      "position": 5
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above example produces the following terms:
 
 [source,text]

+ 47 - 12
docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc

@@ -36,6 +36,27 @@ POST _analyze
 ---------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "and consistent godel is said sentence this yes",
+      "start_offset": 0,
+      "end_offset": 52,
+      "type": "fingerprint",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above sentence would produce the following single term:
 
 [source,text]
@@ -58,16 +79,11 @@ The `fingerprint` analyzer accepts the following parameters:
     The maximum token size to emit.  Defaults to `255`. Tokens larger than
     this size will be discarded.
 
-`preserve_original`::
-
-    If `true`, emits two tokens: one with ASCII-folding of terms that contain
-    extended characters (if any) and one with the original characters.
-    Defaults to `false`.
-
 `stopwords`::
 
     A pre-defined stop words list like `_english_` or an array  containing a
     list of stop words.  Defaults to `_none_`.
+
 `stopwords_path`::
 
     The path to a file containing stop words.
@@ -80,8 +96,7 @@ about stop word configuration.
 === Example configuration
 
 In this example, we configure the `fingerprint` analyzer to use the
-pre-defined list of English stop words, and to emit a second token in
-the presence of non-ASCII characters:
+pre-defined list of English stop words:
 
 [source,js]
 ----------------------------
@@ -92,8 +107,7 @@ PUT my_index
       "analyzer": {
         "my_fingerprint_analyzer": {
           "type": "fingerprint",
-          "stopwords": "_english_",
-          "preserve_original": true
+          "stopwords": "_english_"
         }
       }
     }
@@ -110,9 +124,30 @@ POST my_index/_analyze
 ----------------------------
 // CONSOLE
 
-The above example produces the following two terms:
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "consistent godel said sentence yes",
+      "start_offset": 0,
+      "end_offset": 52,
+      "type": "fingerprint",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following term:
 
 [source,text]
 ---------------------------
-[ consistent godel said sentence yes, consistent gödel said sentence yes ]
+[ consistent godel said sentence yes ]
 ---------------------------

+ 21 - 0
docs/reference/analysis/analyzers/keyword-analyzer.asciidoc

@@ -25,6 +25,27 @@ POST _analyze
 ---------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
+      "start_offset": 0,
+      "end_offset": 56,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above sentence would produce the following single term:
 
 [source,text]

+ 203 - 0
docs/reference/analysis/analyzers/pattern-analyzer.asciidoc

@@ -30,6 +30,104 @@ POST _analyze
 ---------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "the",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "quick",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "dog",
+      "start_offset": 45,
+      "end_offset": 48,
+      "type": "word",
+      "position": 9
+    },
+    {
+      "token": "s",
+      "start_offset": 49,
+      "end_offset": 50,
+      "type": "word",
+      "position": 10
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "word",
+      "position": 11
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above sentence would produce the following terms:
 
 [source,text]
@@ -110,6 +208,55 @@ POST my_index/_analyze
 <1> The backslashes in the pattern need to be escaped when specifying the
     pattern as a JSON string.
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "john",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "smith",
+      "start_offset": 5,
+      "end_offset": 10,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "foo",
+      "start_offset": 11,
+      "end_offset": 14,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "bar",
+      "start_offset": 15,
+      "end_offset": 18,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "com",
+      "start_offset": 19,
+      "end_offset": 22,
+      "type": "word",
+      "position": 4
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above example produces the following terms:
 
 [source,text]
@@ -148,6 +295,62 @@ GET my_index/_analyze
 --------------------------------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "moose",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "x",
+      "start_offset": 5,
+      "end_offset": 6,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "ftp",
+      "start_offset": 8,
+      "end_offset": 11,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "class",
+      "start_offset": 11,
+      "end_offset": 16,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "2",
+      "start_offset": 16,
+      "end_offset": 17,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "beta",
+      "start_offset": 18,
+      "end_offset": 22,
+      "type": "word",
+      "position": 5
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above example produces the following terms:
 
 [source,text]

+ 91 - 0
docs/reference/analysis/analyzers/simple-analyzer.asciidoc

@@ -25,6 +25,97 @@ POST _analyze
 ---------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "the",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "quick",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog",
+      "start_offset": 45,
+      "end_offset": 48,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "s",
+      "start_offset": 49,
+      "end_offset": 50,
+      "type": "word",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "word",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above sentence would produce the following terms:
 
 [source,text]

+ 174 - 0
docs/reference/analysis/analyzers/standard-analyzer.asciidoc

@@ -33,6 +33,97 @@ POST _analyze
 ---------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "the",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "<NUM>",
+      "position": 1
+    },
+    {
+      "token": "quick",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "<ALPHANUM>",
+      "position": 7
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "<ALPHANUM>",
+      "position": 8
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "<ALPHANUM>",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "<ALPHANUM>",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above sentence would produce the following terms:
 
 [source,text]
@@ -98,6 +189,89 @@ POST my_index/_analyze
 ----------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "<NUM>",
+      "position": 1
+    },
+    {
+      "token": "quick",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "jumpe",
+      "start_offset": 24,
+      "end_offset": 29,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "d",
+      "start_offset": 29,
+      "end_offset": 30,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "<ALPHANUM>",
+      "position": 7
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "<ALPHANUM>",
+      "position": 9
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "<ALPHANUM>",
+      "position": 10
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "<ALPHANUM>",
+      "position": 11
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
 The above example produces the following terms:
 
 [source,text]

+ 147 - 0
docs/reference/analysis/analyzers/stop-analyzer.asciidoc

@@ -29,6 +29,83 @@ POST _analyze
 ---------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "quick",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog",
+      "start_offset": 45,
+      "end_offset": 48,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "s",
+      "start_offset": 49,
+      "end_offset": 50,
+      "type": "word",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "word",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above sentence would produce the following terms:
 
 [source,text]
@@ -87,6 +164,76 @@ POST my_index/_analyze
 ----------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "quick",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog",
+      "start_offset": 45,
+      "end_offset": 48,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "s",
+      "start_offset": 49,
+      "end_offset": 50,
+      "type": "word",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "word",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above example produces the following terms:
 
 [source,text]

+ 84 - 0
docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc

@@ -25,6 +25,90 @@ POST _analyze
 ---------------------------
 // CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "Brown-Foxes",
+      "start_offset": 12,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "bone.",
+      "start_offset": 51,
+      "end_offset": 56,
+      "type": "word",
+      "position": 9
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
 The above sentence would produce the following terms:
 
 [source,text]

+ 27 - 7
docs/reference/analysis/charfilters.asciidoc

@@ -1,16 +1,36 @@
 [[analysis-charfilters]]
 == Character Filters
 
-Character filters are used to preprocess the string of 
-characters before it is passed to the <<analysis-tokenizers,tokenizer>>.
-A character filter may be used to strip out HTML markup, or to convert
-`"&"` characters to the word `"and"`.    
+_Character filters_ are used to preprocess the stream of characters before it
+is passed to the <<analysis-tokenizers,tokenizer>>.
 
-Elasticsearch has built in characters filters which can be
-used to build <<analysis-custom-analyzer,custom analyzers>>.
+A character filter receives the original text as a stream of characters and
+can transform the stream by adding, removing, or changing characters.  For
+instance, a character filter could be used to convert Arabic numerals
+(٠‎١٢٣٤٥٦٧٨‎٩‎) into their Latin equivalents (0123456789), or to strip HTML
+elements like `<b>` from the stream.
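+
+For example, the built-in `html_strip` character filter can be tried out
+directly with the `_analyze` API.  As a minimal sketch (the sample text here
+is purely illustrative):
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer":   "standard",
+  "char_filter": [ "html_strip" ],
+  "text":        "<p>I&apos;m so <b>happy</b>!</p>"
+}
+---------------------------
+// CONSOLE
+
+This should produce the terms `[ I'm, so, happy ]`: the HTML tags are stripped
+and the `&apos;` entity is decoded before the text reaches the tokenizer.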
 
-include::charfilters/mapping-charfilter.asciidoc[]
+
+Elasticsearch has a number of built in character filters which can be used to build
+<<analysis-custom-analyzer,custom analyzers>>.
+
+<<analysis-htmlstrip-charfilter,HTML Strip Character Filter>>::
+
+The `html_strip` character filter strips out HTML elements like `<b>` and
+decodes HTML entities like `&amp;`.
+
+<<analysis-mapping-charfilter,Mapping Character Filter>>::
+
+The `mapping` character filter replaces any occurrences of the specified
+strings with the specified replacements.
+
+<<analysis-pattern-replace-charfilter,Pattern Replace Character Filter>>::
+
+The `pattern_replace` character filter replaces any characters matching a
+regular expression with the specified replacement.
 
 include::charfilters/htmlstrip-charfilter.asciidoc[]
 
+include::charfilters/mapping-charfilter.asciidoc[]
+
 include::charfilters/pattern-replace-charfilter.asciidoc[]

+ 132 - 2
docs/reference/analysis/charfilters/htmlstrip-charfilter.asciidoc

@@ -1,5 +1,135 @@
 [[analysis-htmlstrip-charfilter]]
 === HTML Strip Char Filter
 
-A char filter of type `html_strip` stripping out HTML elements from an
-analyzed text.
+The `html_strip` character filter strips HTML elements from the text and
+replaces HTML entities with their decoded value (e.g. replacing `&amp;` with
+`&`).
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer":      "keyword", <1>
+  "char_filter":  [ "html_strip" ],
+  "text": "<p>I&apos;m so <b>happy</b>!</p>"
+}
+---------------------------
+// CONSOLE
+<1> The <<analysis-keyword-tokenizer,`keyword` tokenizer>> returns a single term.
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "\nI'm so happy!\n",
+      "start_offset": 0,
+      "end_offset": 32,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example returns the term:
+
+[source,text]
+---------------------------
+[ \nI'm so happy!\n ]
+---------------------------
+
+The same example with the `standard` tokenizer would return the following terms:
+
+[source,text]
+---------------------------
+[ I'm, so, happy ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `html_strip` character filter accepts the following parameter:
+
+[horizontal]
+`escaped_tags`::
+
+    An array of HTML tags which should not be stripped from the original text.
+
+[float]
+=== Example configuration
+
+In this example, we configure the `html_strip` character filter to leave `<b>`
+tags in place:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "keyword",
+          "char_filter": ["my_char_filter"]
+        }
+      },
+      "char_filter": {
+        "my_char_filter": {
+          "type": "html_strip",
+          "escaped_tags": ["b"]
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "<p>I&apos;m so <b>happy</b>!</p>"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "\nI'm so <b>happy</b>!\n",
+      "start_offset": 0,
+      "end_offset": 32,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following term:
+
+[source,text]
+---------------------------
+[ \nI'm so <b>happy</b>!\n ]
+---------------------------
+
+
+

+ 185 - 25
docs/reference/analysis/charfilters/mapping-charfilter.asciidoc

@@ -1,42 +1,202 @@
 [[analysis-mapping-charfilter]]
 === Mapping Char Filter
 
-A char filter of type `mapping` replacing characters of an analyzed text
-with given mapping.
+The `mapping` character filter accepts a map of keys and values.  Whenever it
+encounters a string of characters that is the same as a key, it replaces them
+with the value associated with that key.
+
+Matching is greedy; the longest pattern matching at a given point wins.
+Replacements are allowed to be the empty string.
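+
+For instance, with the mappings `[ "a => 1", "aa => 2" ]` the input `aaa`
+becomes `21`: the longer key `aa` wins at the start of the input, and the
+remaining `a` is replaced on its own.  A minimal sketch of this behaviour (the
+index, analyzer and character filter names below are purely illustrative):
+
+[source,js]
+----------------------------
+PUT my_greedy_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "greedy_analyzer": {
+          "tokenizer": "keyword",
+          "char_filter": [ "greedy_char_filter" ]
+        }
+      },
+      "char_filter": {
+        "greedy_char_filter": {
+          "type": "mapping",
+          "mappings": [
+            "a => 1",
+            "aa => 2"
+          ]
+        }
+      }
+    }
+  }
+}
+
+POST my_greedy_index/_analyze
+{
+  "analyzer": "greedy_analyzer",
+  "text": "aaa"
+}
+----------------------------
+// CONSOLE
+
+The above sketch should return the single term `21`.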
+
+[float]
+=== Configuration
+
+The `mapping` character filter accepts the following parameters:
 
 [horizontal]
 `mappings`::
 
-    A list of mappings to use.
+    An array of mappings, with each element having the form `key => value`.
 
 `mappings_path`::
 
-    A path, relative to the `config` directory, to a mappings file
-    configuration.
+    A path, either absolute or relative to the `config` directory, to a UTF-8
+    encoded text mappings file containing a `key => value` mapping per line.
 
-Here is a sample configuration:
+Either the `mappings` or `mappings_path` parameter must be provided.
+
+[float]
+=== Example configuration
+
+In this example, we configure the `mapping` character filter to replace Arabic
+numerals with their Latin equivalents:
 
 [source,js]
---------------------------------------------------
+----------------------------
+PUT my_index
 {
-    "index" : {
-        "analysis" : {
-            "char_filter" : {
-                "my_mapping" : {
-                    "type" : "mapping",
-                    "mappings" : [
-                      "ph => f",
-                      "qu => k"
-                    ]
-                }
-            },
-            "analyzer" : {
-                "custom_with_char_filter" : {
-                    "tokenizer" : "standard",
-                    "char_filter" : ["my_mapping"]
-                }
-            }
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "keyword",
+          "char_filter": [
+            "my_char_filter"
+          ]
+        }
+      },
+      "char_filter": {
+        "my_char_filter": {
+          "type": "mapping",
+          "mappings": [
+            "٠ => 0",
+            "١ => 1",
+            "٢ => 2",
+            "٣ => 3",
+            "٤ => 4",
+            "٥ => 5",
+            "٦ => 6",
+            "٧ => 7",
+            "٨ => 8",
+            "٩ => 9"
+          ]
         }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "My license plate is ٢٥٠١٥"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "My license plate is 25015",
+      "start_offset": 0,
+      "end_offset": 25,
+      "type": "word",
+      "position": 0
     }
+  ]
 }
---------------------------------------------------
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following term:
+
+[source,text]
+---------------------------
+[ My license plate is 25015 ]
+---------------------------
+
+Keys and values can be strings with multiple characters.  The following
+example replaces the `:)` and `:(` emoticons with a text equivalent:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "standard",
+          "char_filter": [
+            "my_char_filter"
+          ]
+        }
+      },
+      "char_filter": {
+        "my_char_filter": {
+          "type": "mapping",
+          "mappings": [
+            ":) => _happy_",
+            ":( => _sad_"
+          ]
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "I'm delighted about it :("
+}
+----------------------------
+// CONSOLE
+
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "I'm",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "delighted",
+      "start_offset": 4,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "about",
+      "start_offset": 14,
+      "end_offset": 19,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "it",
+      "start_offset": 20,
+      "end_offset": 22,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "_sad_",
+      "start_offset": 23,
+      "end_offset": 25,
+      "type": "<ALPHANUM>",
+      "position": 4
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ I'm, delighted, about, it, _sad_ ]
+---------------------------

+ 241 - 29
docs/reference/analysis/charfilters/pattern-replace-charfilter.asciidoc

@@ -1,37 +1,249 @@
 [[analysis-pattern-replace-charfilter]]
 === Pattern Replace Char Filter
 
-The `pattern_replace` char filter allows the use of a regex to
-manipulate the characters in a string before analysis. The regular
-expression is defined using the `pattern` parameter, and the replacement
-string can be provided using the `replacement` parameter (supporting
-referencing the original text, as explained
-http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]).
-For more information check the
-http://lucene.apache.org/core/4_3_1/analyzers-common/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.html[lucene
-documentation]
-
-Here is a sample configuration:
+The `pattern_replace` character filter uses a regular expression to match
+characters which should be replaced with the specified replacement string.
+The replacement string can refer to capture groups in the regular expression.
+
+[float]
+=== Configuration
+
+The `pattern_replace` character filter accepts the following parameters:
+
+[horizontal]
+`pattern`::
+
+    A http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java regular expression]. Required.
+
+`replacement`::
+
+    The replacement string, which can reference capture groups using the
+    `$1`..`$9` syntax, as explained
+    http://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#appendReplacement-java.lang.StringBuffer-java.lang.String-[here].
+
+[float]
+=== Example configuration
+
+In this example, we configure the `pattern_replace` character filter to
+replace any embedded dashes in numbers with underscores, i.e. `123-456-789` ->
+`123_456_789`:
 
 [source,js]
---------------------------------------------------
-{
-    "index" : {
-        "analysis" : {
-            "char_filter" : {
-                "my_pattern":{
-                    "type":"pattern_replace",
-                    "pattern":"sample(.*)",
-                    "replacement":"replacedSample $1"
-                }
-            },
-            "analyzer" : {
-                "custom_with_char_filter" : {
-                    "tokenizer" : "standard",
-                    "char_filter" : ["my_pattern"]
-                }
-            }
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "standard",
+          "char_filter": [
+            "my_char_filter"
+          ]
+        }
+      },
+      "char_filter": {
+        "my_char_filter": {
+          "type": "pattern_replace",
+          "pattern": "(\\d+)-(?=\\d)",
+          "replacement": "$1_"
         }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "My credit card is 123-456-789"
+}
+----------------------------
+// CONSOLE
+// TEST[skip:Test interprets $1 as a stashed variable]
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ My, credit, card, is, 123_456_789 ]
+---------------------------
+
+
+WARNING: Using a replacement string that changes the length of the original
+text will work for search purposes, but will result in incorrect highlighting,
+as can be seen in the following example.
+
+This example inserts a space whenever it encounters a lower-case letter
+followed by an upper-case letter (i.e. `fooBarBaz` -> `foo Bar Baz`), allowing
+camelCase words to be queried individually:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "standard",
+          "char_filter": [
+            "my_char_filter"
+          ],
+          "filter": [
+            "lowercase"
+          ]
+        }
+      },
+      "char_filter": {
+        "my_char_filter": {
+          "type": "pattern_replace",
+          "pattern": "(?<=\\p{Lower})(?=\\p{Upper})",
+          "replacement": " "
+        }
+      }
+    }
+  },
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "text": {
+          "type": "text",
+          "analyzer": "my_analyzer"
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "The fooBarBaz method"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "the",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "foo",
+      "start_offset": 4,
+      "end_offset": 6,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "bar",
+      "start_offset": 7,
+      "end_offset": 9,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "baz",
+      "start_offset": 10,
+      "end_offset": 13,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "method",
+      "start_offset": 14,
+      "end_offset": 20,
+      "type": "<ALPHANUM>",
+      "position": 4
     }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+The above returns the following terms:
+
+[source,text]
+----------------------------
+[ the, foo, bar, baz, method ]
+----------------------------
+
+Querying for `bar` will find the document correctly, but highlighting on the
+result will produce incorrect highlights, because our character filter changed
+the length of the original text:
+
+[source,js]
+----------------------------
+PUT my_index/my_doc/1?refresh
+{
+  "text": "The fooBarBaz method"
+}
+
+GET my_index/_search
+{
+  "query": {
+    "match": {
+      "text": "bar"
+    }
+  },
+  "highlight": {
+    "fields": {
+      "text": {}
+    }
+  }
+}
+----------------------------
+// CONSOLE
+// TEST[continued]
+
+The output from the above is:
+
+[source,js]
+----------------------------
+{
+  "timed_out": false,
+  "took": $body.took,
+  "_shards": {
+    "total": 5,
+    "successful": 5,
+    "failed": 0
+  },
+  "hits": {
+    "total": 1,
+    "max_score": 0.4375,
+    "hits": [
+      {
+        "_index": "my_index",
+        "_type": "my_doc",
+        "_id": "1",
+        "_score": 0.4375,
+        "_source": {
+          "text": "The fooBarBaz method"
+        },
+        "highlight": {
+          "text": [
+            "The foo<em>Ba</em>rBaz method" <1>
+          ]
+        }
+      }
+    ]
+  }
 }
---------------------------------------------------
+----------------------------
+// TESTRESPONSE[s/"took".*/"took": "$body.took",/]
+<1> Note the incorrect highlight.

+ 116 - 14
docs/reference/analysis/tokenizers.asciidoc

@@ -1,34 +1,136 @@
 [[analysis-tokenizers]]
 == Tokenizers
 
-Tokenizers are used to break a string down into a stream of terms
-or tokens. A simple tokenizer might split the string up into terms 
-wherever it encounters whitespace or punctuation.
+A _tokenizer_  receives a stream of characters, breaks it up into individual
+_tokens_ (usually individual words), and outputs a stream of _tokens_. For
+instance, a <<analysis-whitespace-tokenizer,`whitespace`>> tokenizer breaks
+text into tokens whenever it sees any whitespace.  It would convert the text
+`"Quick brown fox!"` into the terms `[Quick, brown, fox!]`.
 
-Elasticsearch has a number of built in tokenizers which can be
-used to build <<analysis-custom-analyzer,custom analyzers>>.
+The tokenizer is also responsible for recording the order or _position_ of
+each term (used for phrase and word proximity queries) and the start and end
+_character offsets_ of the original word which the term represents (used for
+highlighting search snippets).
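+
+Positions and character offsets can be inspected directly with the `_analyze`
+API.  As a minimal sketch (the sample text is purely illustrative):
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "whitespace",
+  "text": "Quick brown fox!"
+}
+---------------------------
+// CONSOLE
+
+This should return the terms `[ Quick, brown, fox! ]`, each with its
+`position` (`0`, `1` and `2`) and its `start_offset` and `end_offset` into the
+original text.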
 
-include::tokenizers/standard-tokenizer.asciidoc[]
+Elasticsearch has a number of built in tokenizers which can be used to build
+<<analysis-custom-analyzer,custom analyzers>>.
 
-include::tokenizers/edgengram-tokenizer.asciidoc[]
+[float]
+=== Word Oriented Tokenizers
 
-include::tokenizers/keyword-tokenizer.asciidoc[]
+The following tokenizers are usually used for tokenizing full text into
+individual words:
+
+<<analysis-standard-tokenizer,Standard Tokenizer>>::
+
+The `standard` tokenizer divides text into terms on word boundaries, as
+defined by the Unicode Text Segmentation algorithm. It removes most
+punctuation symbols. It is the best choice for most languages.
+
+<<analysis-letter-tokenizer,Letter Tokenizer>>::
+
+The `letter` tokenizer divides text into terms whenever it encounters a
+character which is not a letter.
+
+<<analysis-lowercase-tokenizer,Lowercase Tokenizer>>::
+
+The `lowercase` tokenizer, like the `letter` tokenizer,  divides text into
+terms whenever it encounters a character which is not a letter, but it also
+lowercases all terms.
+
+<<analysis-whitespace-tokenizer,Whitespace Tokenizer>>::
+
+The `whitespace` tokenizer divides text into terms whenever it encounters any
+whitespace character.
+
+<<analysis-uaxurlemail-tokenizer,UAX URL Email Tokenizer>>::
+
+The `uax_url_email` tokenizer is like the `standard` tokenizer except that it
+recognises URLs and email addresses as single tokens.
+
+<<analysis-classic-tokenizer,Classic Tokenizer>>::
+
+The `classic` tokenizer is a grammar based tokenizer for the English Language.
+
+<<analysis-thai-tokenizer,Thai Tokenizer>>::
+
+The `thai` tokenizer segments Thai text into words.
+
+[float]
+=== Partial Word Tokenizers
+
+These tokenizers break up text or words into small fragments, for partial word
+matching:
+
+<<analysis-ngram-tokenizer,N-Gram Tokenizer>>::
+
+The `ngram` tokenizer can break up text into words when it encounters any of
+a list of specified characters (e.g. whitespace or punctuation), then it returns
+n-grams of each word: a sliding window of continuous letters, e.g. `quick` ->
+`[qu, ui, ic, ck]`.
+
+<<analysis-edgengram-tokenizer,Edge N-Gram Tokenizer>>::
+
+The `edge_ngram` tokenizer can break up text into words when it encounters any of
+a list of specified characters (e.g. whitespace or punctuation), then it returns
+n-grams of each word which are anchored to the start of the word, e.g. `quick` ->
+`[q, qu, qui, quic, quick]`.
+
+
+[float]
+=== Structured Text Tokenizers
+
+The following tokenizers are usually used with structured text like
+identifiers, email addresses, zip codes, and paths, rather than with full
+text:
+
+<<analysis-keyword-tokenizer,Keyword Tokenizer>>::
+
+The `keyword` tokenizer is a ``noop'' tokenizer that accepts whatever text it
+is given and outputs the exact same text as a single term.  It can be combined
+with token filters like <<analysis-lowercase-tokenfilter,`lowercase`>> to
+normalise the analysed terms.
+
+<<analysis-pattern-tokenizer,Pattern Tokenizer>>::
+
+The `pattern` tokenizer uses a regular expression to either split text into
+terms whenever it matches a word separator, or to capture matching text as
+terms.
+
+<<analysis-pathhierarchy-tokenizer,Path Tokenizer>>::
+
+The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
+path, splits on the path separator, and emits a term for each component in the
+tree, e.g. `/foo/bar/baz` -> `[/foo, /foo/bar, /foo/bar/baz ]`.
+
+
+
+
+
+include::tokenizers/standard-tokenizer.asciidoc[]
 
 include::tokenizers/letter-tokenizer.asciidoc[]
 
 include::tokenizers/lowercase-tokenizer.asciidoc[]
 
-include::tokenizers/ngram-tokenizer.asciidoc[]
-
 include::tokenizers/whitespace-tokenizer.asciidoc[]
 
-include::tokenizers/pattern-tokenizer.asciidoc[]
-
 include::tokenizers/uaxurlemail-tokenizer.asciidoc[]
 
-include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
-
 include::tokenizers/classic-tokenizer.asciidoc[]
 
 include::tokenizers/thai-tokenizer.asciidoc[]
 
+
+include::tokenizers/ngram-tokenizer.asciidoc[]
+
+include::tokenizers/edgengram-tokenizer.asciidoc[]
+
+
+include::tokenizers/keyword-tokenizer.asciidoc[]
+
+include::tokenizers/pattern-tokenizer.asciidoc[]
+
+include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
+
+

+ 265 - 15
docs/reference/analysis/tokenizers/classic-tokenizer.asciidoc

@@ -1,19 +1,269 @@
 [[analysis-classic-tokenizer]]
 === Classic Tokenizer
 
-A tokenizer of type `classic` providing grammar based tokenizer that is
-a good tokenizer for English language documents. This tokenizer has 
-heuristics for special treatment of acronyms, company names, email addresses,
-and internet host names. However, these rules don't always work, and 
-the tokenizer doesn't work well for most languages other than English.
-
-The following are settings that can be set for a `classic` tokenizer
-type:
-
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`max_token_length` |The maximum token length. If a token is seen that
-exceeds this length then it is discarded. Defaults to `255`.
-|=======================================================================
+The `classic` tokenizer is a grammar based tokenizer that is good for English
+language documents. This tokenizer has heuristics for special treatment of
+acronyms, company names, email addresses, and internet host names. However,
+these rules don't always work, and the tokenizer doesn't work well for most
+languages other than English:
+
+* It splits words at most punctuation characters, removing punctuation. However, a
+  dot that's not followed by whitespace is considered part of a token.
+
+* It splits words at hyphens, unless there's a number in the token, in which case
+  the whole token is interpreted as a product number and is not split.
+
+* It recognizes email addresses and internet hostnames as one token.
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "classic",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "Brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "Foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "<ALPHANUM>",
+      "position": 7
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "<ALPHANUM>",
+      "position": 8
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "<APOSTROPHE>",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "<ALPHANUM>",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
+---------------------------
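+
+Two of the rules listed above, product numbers and email addresses, are not
+visible in this sentence.  As a minimal sketch (the sample text is purely
+illustrative):
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "classic",
+  "text": "Order XL-4000 from sales@example.com"
+}
+---------------------------
+// CONSOLE
+
+Here `XL-4000` should be kept as a single token (it contains a number, so the
+whole token is treated as a product number), and `sales@example.com` should be
+emitted as a single token of type `<EMAIL>`.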
+
+[float]
+=== Configuration
+
+The `classic` tokenizer accepts the following parameters:
+
+[horizontal]
+`max_token_length`::
+
+    The maximum token length. If a token is seen that exceeds this length then
+    it is discarded. Defaults to `255`.
+
+[float]
+=== Example configuration
+
+In this example, we configure the `classic` tokenizer to have a
+`max_token_length` of 5 (for demonstration purposes):
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "classic",
+          "max_token_length": 5
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "Brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "Foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "<ALPHANUM>",
+      "position": 7
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "<ALPHANUM>",
+      "position": 8
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "<APOSTROPHE>",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "<ALPHANUM>",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms (`jumped` is discarded because
+it is longer than five characters):
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown, Foxes, over, the, lazy, dog's, bone ]
+---------------------------
+
+
 

+ 298 - 55
docs/reference/analysis/tokenizers/edgengram-tokenizer.asciidoc

@@ -1,80 +1,323 @@
 [[analysis-edgengram-tokenizer]]
 === Edge NGram Tokenizer
 
-A tokenizer of type `edgeNGram`.
+The `edge_ngram` tokenizer first breaks text down into words whenever it
+encounters one of a list of specified characters, then it emits
+https://en.wikipedia.org/wiki/N-gram[N-grams] of each word where the start of
+the N-gram is anchored to the beginning of the word.
 
-This tokenizer is very similar to `nGram` but only keeps n-grams which
-start at the beginning of a token.
+Edge N-Grams are useful for _search-as-you-type_ queries.
 
-The following are settings that can be set for a `edgeNGram` tokenizer
-type:
+TIP: When you need _search-as-you-type_ for text which has a widely known
+order, such as movie or song titles, the
+<<search-suggesters-completion,completion suggester>> is a much more efficient
+choice than edge N-grams.  Edge N-grams have the advantage when trying to
+autocomplete words that can appear in any order.
+
+[float]
+=== Example output
+
+With the default settings, the `edge_ngram` tokenizer treats the initial text as a
+single token and produces N-grams with minimum length `1` and maximum length
+`2`:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "edge_ngram",
+  "text": "Quick Fox"
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "Q",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "Qu",
+      "start_offset": 0,
+      "end_offset": 2,
+      "type": "word",
+      "position": 1
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
 
-[cols="<,<,<",options="header",]
-|=======================================================================
-|Setting |Description |Default value
-|`min_gram` |Minimum size in codepoints of a single n-gram |`1`.
 
-|`max_gram` |Maximum size in codepoints of a single n-gram |`2`.
+The above sentence would produce the following terms:
 
-|`token_chars` | Characters classes to keep in the
-tokens, Elasticsearch will split on characters that don't belong to any
-of these classes. |`[]` (Keep all characters)
-|=======================================================================
+[source,text]
+---------------------------
+[ Q, Qu ]
+---------------------------
 
+NOTE: These default gram lengths are almost entirely useless.  You need to
+configure the `edge_ngram` tokenizer before using it.
 
-`token_chars` accepts the following character classes:
+[float]
+=== Configuration
+
+The `edge_ngram` tokenizer accepts the following parameters:
 
 [horizontal]
-`letter`::      for example `a`, `b`, `ï` or `京`
-`digit`::       for example `3` or `7`
-`whitespace`::  for example `" "` or `"\n"`
-`punctuation`:: for example `!` or `"`
-`symbol`::      for example `$` or `√`
+`min_gram`::
+    Minimum length of characters in a gram.  Defaults to `1`.
+
+`max_gram`::
+    Maximum length of characters in a gram.  Defaults to `2`.
+
+`token_chars`::
+
+    Character classes that should be included in a token.  Elasticsearch
+    will split on characters that don't belong to the classes specified.
+    Defaults to `[]` (keep all characters).
++
+Character classes may be any of the following:
++
+* `letter` --      for example `a`, `b`, `ï` or `京`
+* `digit` --       for example `3` or `7`
+* `whitespace` --  for example `" "` or `"\n"`
+* `punctuation` -- for example `!` or `"`
+* `symbol` --      for example `$` or `√`
 
 [float]
-==== Example
+=== Example configuration
+
+In this example, we configure the `edge_ngram` tokenizer to treat letters and
+digits as tokens, and to produce grams with minimum length `2` and maximum
+length `10`:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "edge_ngram",
+          "min_gram": 2,
+          "max_gram": 10,
+          "token_chars": [
+            "letter",
+            "digit"
+          ]
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "2 Quick Foxes."
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
 
 [source,js]
---------------------------------------------------
-    curl -XPUT 'localhost:9200/test' -d '
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "Qu",
+      "start_offset": 2,
+      "end_offset": 4,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "Qui",
+      "start_offset": 2,
+      "end_offset": 5,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "Quic",
+      "start_offset": 2,
+      "end_offset": 6,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "Quick",
+      "start_offset": 2,
+      "end_offset": 7,
+      "type": "word",
+      "position": 3
+    },
     {
-        "settings" : {
-            "analysis" : {
-                "analyzer" : {
-                    "my_edge_ngram_analyzer" : {
-                        "tokenizer" : "my_edge_ngram_tokenizer"
-                    }
-                },
-                "tokenizer" : {
-                    "my_edge_ngram_tokenizer" : {
-                        "type" : "edgeNGram",
-                        "min_gram" : "2",
-                        "max_gram" : "5",
-                        "token_chars": [ "letter", "digit" ]
-                    }
-                }
-            }
+      "token": "Fo",
+      "start_offset": 8,
+      "end_offset": 10,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "Fox",
+      "start_offset": 8,
+      "end_offset": 11,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "Foxe",
+      "start_offset": 8,
+      "end_offset": 12,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "Foxes",
+      "start_offset": 8,
+      "end_offset": 13,
+      "type": "word",
+      "position": 7
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ Qu, Qui, Quic, Quick, Fo, Fox, Foxe, Foxes ]
+---------------------------
+
+Usually we recommend using the same `analyzer` at index time and at search
+time. In the case of the `edge_ngram` tokenizer, the advice is different.  It
+only makes sense to use the `edge_ngram` tokenizer at index time, to ensure
+that partial words are available for matching in the index.  At search time,
+just search for the terms the user has typed in, for instance: `Quick Fo`.
+
+Below is an example of how to set up a field for _search-as-you-type_:
+
+[source,js]
+-----------------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "autocomplete": {
+          "tokenizer": "autocomplete",
+          "filter": [
+            "lowercase"
+          ]
+        },
+        "autocomplete_search": {
+          "tokenizer": "lowercase"
+        }
+      },
+      "tokenizer": {
+        "autocomplete": {
+          "type": "edge_ngram",
+          "min_gram": 2,
+          "max_gram": 10,
+          "token_chars": [
+            "letter"
+          ]
         }
-    }'
+      }
+    }
+  },
+  "mappings": {
+    "doc": {
+      "properties": {
+        "title": {
+          "type": "text",
+          "analyzer": "autocomplete",
+          "search_analyzer": "autocomplete_search"
+        }
+      }
+    }
+  }
+}
 
-    curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_edge_ngram_analyzer' -d 'FC Schalke 04'
-    # FC, Sc, Sch, Scha, Schal, 04
---------------------------------------------------
+PUT my_index/doc/1
+{
+  "title": "Quick Foxes" <1>
+}
 
-[float]
-==== `side` deprecated
+POST my_index/_refresh
+
+GET my_index/_search
+{
+  "query": {
+    "match": {
+      "title": {
+        "query": "Quick Fo", <2>
+        "operator": "and"
+      }
+    }
+  }
+}
+-----------------------------------
+// CONSOLE
 
-There used to be a `side` parameter up to `0.90.1` but it is now deprecated. In
-order to emulate the behavior of `"side" : "BACK"` a
-<<analysis-reverse-tokenfilter,`reverse` token filter>>  should be used together
-with the <<analysis-edgengram-tokenfilter,`edgeNGram` token filter>>. The
-`edgeNGram` filter must be enclosed in `reverse` filters like this:
+<1> The `autocomplete` analyzer indexes the terms `[qu, qui, quic, quick, fo, fox, foxe, foxes]`.
+<2> The `autocomplete_search` analyzer searches for the terms `[quick, fo]`, both of which appear in the index.
+
+/////////////////////
 
 [source,js]
---------------------------------------------------
-    "filter" : ["reverse", "edgeNGram", "reverse"]
---------------------------------------------------
+----------------------------
+{
+  "took": $body.took,
+  "timed_out": false,
+  "_shards": {
+    "total": 5,
+    "successful": 5,
+    "failed": 0
+  },
+  "hits": {
+    "total": 1,
+    "max_score": 0.44194174,
+    "hits": [
+      {
+        "_index": "my_index",
+        "_type": "doc",
+        "_id": "1",
+        "_score": 0.44194174,
+        "_source": {
+          "title": "Quick Foxes"
+        }
+      }
+    ]
+  }
+}
+----------------------------
+// TESTRESPONSE[s/"took".*/"took": "$body.took",/]
+/////////////////////
 
-which essentially reverses the token, builds front `EdgeNGrams` and reverses
-the ngram again. This has the same effect as the previous `"side" : "BACK"` setting.
 

+ 54 - 9
docs/reference/analysis/tokenizers/keyword-tokenizer.asciidoc

@@ -1,15 +1,60 @@
 [[analysis-keyword-tokenizer]]
 === Keyword Tokenizer
 
-A tokenizer of type `keyword` that emits the entire input as a single
-output.
+The `keyword` tokenizer is a ``noop'' tokenizer that accepts whatever text it
+is given and outputs the exact same text as a single term.  It can be combined
+with token filters to normalise output, e.g. lower-casing email addresses.
 
-The following are settings that can be set for a `keyword` tokenizer
-type:
+[float]
+=== Example output
 
-[cols="<,<",options="header",]
-|=======================================================
-|Setting |Description
-|`buffer_size` |The term buffer size. Defaults to `256`.
-|=======================================================
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "keyword",
+  "text": "New York"
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "New York",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following term:
+
+[source,text]
+---------------------------
+[ New York ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `keyword` tokenizer accepts the following parameters:
+
+[horizontal]
+`buffer_size`::
+
+    The number of characters read into the term buffer in a single pass.
+    Defaults to `256`.  The term buffer will grow by this size until all the
+    text has been consumed.  It is advisable not to change this setting.
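+
+As mentioned above, the `keyword` tokenizer can be combined with token filters
+to normalise its output. A minimal sketch (assuming the `_analyze` API's
+`filter` parameter; the email address below is purely illustrative) pairs it
+with the <<analysis-lowercase-tokenfilter,`lowercase` token filter>> to
+lower-case an email address:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "keyword",
+  "filter": [ "lowercase" ],
+  "text": "John.SMITH@example.COM"
+}
+---------------------------
+// CONSOLE
+
+This would emit the single term `john.smith@example.com`.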
 

+ 120 - 4
docs/reference/analysis/tokenizers/letter-tokenizer.asciidoc

@@ -1,7 +1,123 @@
 [[analysis-letter-tokenizer]]
 === Letter Tokenizer
 
-A tokenizer of type `letter` that divides text at non-letters. That's to
-say, it defines tokens as maximal strings of adjacent letters. Note,
-this does a decent job for most European languages, but does a terrible
-job for some Asian languages, where words are not separated by spaces.
+The `letter` tokenizer breaks text into terms whenever it encounters a
+character which is not a letter. It does a reasonable job for most European
+languages, but does a terrible job for some Asian languages, where words are
+not separated by spaces.
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "letter",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "Brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "Foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog",
+      "start_offset": 45,
+      "end_offset": 48,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "s",
+      "start_offset": 49,
+      "end_offset": 50,
+      "type": "word",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "word",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, QUICK, Brown, Foxes, jumped, over, the, lazy, dog, s, bone ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `letter` tokenizer is not configurable.

+ 125 - 12
docs/reference/analysis/tokenizers/lowercase-tokenizer.asciidoc

@@ -1,15 +1,128 @@
 [[analysis-lowercase-tokenizer]]
 === Lowercase Tokenizer
 
-A tokenizer of type `lowercase` that performs the function of
-<<analysis-letter-tokenizer,Letter
-Tokenizer>> and
-<<analysis-lowercase-tokenfilter,Lower
-Case Token Filter>> together. It divides text at non-letters and converts
-them to lower case. While it is functionally equivalent to the
-combination of
-<<analysis-letter-tokenizer,Letter
-Tokenizer>> and
-<<analysis-lowercase-tokenfilter,Lower
-Case Token Filter>>, there is a performance advantage to doing the two
-tasks at once, hence this (redundant) implementation.
+
+The `lowercase` tokenizer, like the
+<<analysis-letter-tokenizer, `letter` tokenizer>>, breaks text into terms
+whenever it encounters a character which is not a letter, but it also
+lowercases all terms.  It is functionally equivalent to the
+<<analysis-letter-tokenizer, `letter` tokenizer>> combined with the
+<<analysis-lowercase-tokenfilter, `lowercase` token filter>>, but is more
+efficient as it performs both steps in a single pass.
+
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "lowercase",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "the",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "quick",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog",
+      "start_offset": 45,
+      "end_offset": 48,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "s",
+      "start_offset": 49,
+      "end_offset": 50,
+      "type": "word",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "word",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `lowercase` tokenizer is not configurable.
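+
+For comparison, here is a quick sketch of the functionally equivalent
+combination of the <<analysis-letter-tokenizer,`letter` tokenizer>> and the
+<<analysis-lowercase-tokenfilter,`lowercase` token filter>>, which would
+produce the same terms as above, just with an extra filter pass:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "letter",
+  "filter": [ "lowercase" ],
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE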

+ 290 - 41
docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc

@@ -1,57 +1,306 @@
 [[analysis-ngram-tokenizer]]
 === NGram Tokenizer
 
-A tokenizer of type `nGram`.
+The `ngram` tokenizer first breaks text down into words whenever it encounters
+one of a list of specified characters, then it emits
+https://en.wikipedia.org/wiki/N-gram[N-grams] of each word of the specified
+length.
 
-The following are settings that can be set for a `nGram` tokenizer type:
+N-grams are like a sliding window that moves across the word - a continuous
+sequence of characters of the specified length. They are useful for querying
+languages that don't use spaces or that have long compound words, like German.
 
-[cols="<,<,<",options="header",]
-|=======================================================================
-|Setting |Description |Default value
-|`min_gram` |Minimum size in codepoints of a single n-gram |`1`.
+[float]
+=== Example output
+
+With the default settings, the `ngram` tokenizer treats the initial text as a
+single token and produces N-grams with minimum length `1` and maximum length
+`2`:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "ngram",
+  "text": "Quick Fox"
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "Q",
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "Qu",
+      "start_offset": 0,
+      "end_offset": 2,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "u",
+      "start_offset": 1,
+      "end_offset": 2,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "ui",
+      "start_offset": 1,
+      "end_offset": 3,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "i",
+      "start_offset": 2,
+      "end_offset": 3,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "ic",
+      "start_offset": 2,
+      "end_offset": 4,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "c",
+      "start_offset": 3,
+      "end_offset": 4,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "ck",
+      "start_offset": 3,
+      "end_offset": 5,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "k",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "k ",
+      "start_offset": 4,
+      "end_offset": 6,
+      "type": "word",
+      "position": 9
+    },
+    {
+      "token": " ",
+      "start_offset": 5,
+      "end_offset": 6,
+      "type": "word",
+      "position": 10
+    },
+    {
+      "token": " F",
+      "start_offset": 5,
+      "end_offset": 7,
+      "type": "word",
+      "position": 11
+    },
+    {
+      "token": "F",
+      "start_offset": 6,
+      "end_offset": 7,
+      "type": "word",
+      "position": 12
+    },
+    {
+      "token": "Fo",
+      "start_offset": 6,
+      "end_offset": 8,
+      "type": "word",
+      "position": 13
+    },
+    {
+      "token": "o",
+      "start_offset": 7,
+      "end_offset": 8,
+      "type": "word",
+      "position": 14
+    },
+    {
+      "token": "ox",
+      "start_offset": 7,
+      "end_offset": 9,
+      "type": "word",
+      "position": 15
+    },
+    {
+      "token": "x",
+      "start_offset": 8,
+      "end_offset": 9,
+      "type": "word",
+      "position": 16
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
 
-|`max_gram` |Maximum size in codepoints of a single n-gram |`2`.
 
-|`token_chars` |Characters classes to keep in the
-tokens, Elasticsearch will split on characters that don't belong to any
-of these classes. |`[]` (Keep all characters)
-|=======================================================================
+The above sentence would produce the following terms:
 
-`token_chars` accepts the following character classes:
+[source,text]
+---------------------------
+[ Q, Qu, u, ui, i, ic, c, ck, k, "k ", " ", " F", F, Fo, o, ox, x ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `ngram` tokenizer accepts the following parameters:
 
 [horizontal]
-`letter`::      for example `a`, `b`, `ï` or `京`
-`digit`::       for example `3` or `7`
-`whitespace`::  for example `" "` or `"\n"`
-`punctuation`:: for example `!` or `"`
-`symbol`::      for example `$` or `√`
+`min_gram`::
+    Minimum length of characters in a gram.  Defaults to `1`.
+
+`max_gram`::
+    Maximum length of characters in a gram.  Defaults to `2`.
+
+`token_chars`::
+
+    Character classes that should be included in a token.  Elasticsearch
+    will split on characters that don't belong to the classes specified.
+    Defaults to `[]` (keep all characters).
++
+Character classes may be any of the following:
++
+* `letter` --      for example `a`, `b`, `ï` or `京`
+* `digit` --       for example `3` or `7`
+* `whitespace` --  for example `" "` or `"\n"`
+* `punctuation` -- for example `!` or `"`
+* `symbol` --      for example `$` or `√`
+
+TIP:  It usually makes sense to set `min_gram` and `max_gram` to the same
+value.  The smaller the length, the more documents will match but the lower
+the quality of the matches.  The longer the length, the more specific the
+matches.  A tri-gram (length `3`) is a good place to start.
 
 [float]
-==== Example
+=== Example configuration
+
+In this example, we configure the `ngram` tokenizer to treat letters and
+digits as tokens, and to produce tri-grams (grams of length `3`):
 
 [source,js]
---------------------------------------------------
-    curl -XPUT 'localhost:9200/test' -d '
-    {
-        "settings" : {
-            "analysis" : {
-                "analyzer" : {
-                    "my_ngram_analyzer" : {
-                        "tokenizer" : "my_ngram_tokenizer"
-                    }
-                },
-                "tokenizer" : {
-                    "my_ngram_tokenizer" : {
-                        "type" : "nGram",
-                        "min_gram" : "2",
-                        "max_gram" : "3",
-                        "token_chars": [ "letter", "digit" ]
-                    }
-                }
-            }
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "ngram",
+          "min_gram": 3,
+          "max_gram": 3,
+          "token_chars": [
+            "letter",
+            "digit"
+          ]
         }
-    }'
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "2 Quick Foxes."
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "Qui",
+      "start_offset": 2,
+      "end_offset": 5,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "uic",
+      "start_offset": 3,
+      "end_offset": 6,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "ick",
+      "start_offset": 4,
+      "end_offset": 7,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "Fox",
+      "start_offset": 8,
+      "end_offset": 11,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "oxe",
+      "start_offset": 9,
+      "end_offset": 12,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "xes",
+      "start_offset": 10,
+      "end_offset": 13,
+      "type": "word",
+      "position": 5
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ Qui, uic, ick, Fox, oxe, xes ]
+---------------------------
+
 
-    curl 'localhost:9200/test/_analyze?pretty=1&analyzer=my_ngram_analyzer' -d 'FC Schalke 04'
-    # FC, Sc, Sch, ch, cha, ha, hal, al, alk, lk, lke, ke, 04
---------------------------------------------------

+ 163 - 20
docs/reference/analysis/tokenizers/pathhierarchy-tokenizer.asciidoc

@@ -1,32 +1,175 @@
 [[analysis-pathhierarchy-tokenizer]]
 === Path Hierarchy Tokenizer
 
-The `path_hierarchy` tokenizer takes something like this:
+The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
+path, splits on the path separator, and emits a term for each component in the
+tree.
 
--------------------------
-/something/something/else
--------------------------
+[float]
+=== Example output
 
-And produces tokens:
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "path_hierarchy",
+  "text": "/one/two/three"
+}
+---------------------------
+// CONSOLE
 
--------------------------
-/something
-/something/something
-/something/something/else
--------------------------
+/////////////////////
 
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`delimiter` |The character delimiter to use, defaults to `/`.
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "/one",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "/one/two",
+      "start_offset": 0,
+      "end_offset": 8,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "/one/two/three",
+      "start_offset": 0,
+      "end_offset": 14,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
 
-|`replacement` |An optional replacement character to use. Defaults to
-the `delimiter`.
+/////////////////////
 
-|`buffer_size` |The buffer size to use, defaults to `1024`.
 
-|`reverse` |Generates tokens in reverse order, defaults to `false`.
 
-|`skip` |Controls initial tokens to skip, defaults to `0`.
-|=======================================================================
+The above text would produce the following terms:
+
+[source,text]
+---------------------------
+[ /one, /one/two, /one/two/three ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `path_hierarchy` tokenizer accepts the following parameters:
+
+[horizontal]
+`delimiter`::
+    The character to use as the path separator.  Defaults to `/`.
+
+`replacement`::
+    An optional replacement character to use for the delimiter.
+    Defaults to the `delimiter`.
+
+`buffer_size`::
+    The number of characters read into the term buffer in a single pass.
+    Defaults to `1024`.  The term buffer will grow by this size until all the
+    text has been consumed.  It is advisable not to change this setting.
+
+`reverse`::
+    If set to `true`, emits the tokens in reverse order.  Defaults to `false`.
+
+`skip`::
+    The number of initial tokens to skip.  Defaults to `0`.
+
+[float]
+=== Example configuration
+
+In this example, we configure the `path_hierarchy` tokenizer to split on `-`
+characters, and to replace them with `/`.  The first two tokens are skipped:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "path_hierarchy",
+          "delimiter": "-",
+          "replacement": "/",
+          "skip": 2
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "one-two-three-four-five"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "/three",
+      "start_offset": 7,
+      "end_offset": 13,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "/three/four",
+      "start_offset": 7,
+      "end_offset": 18,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "/three/four/five",
+      "start_offset": 7,
+      "end_offset": 23,
+      "type": "word",
+      "position": 0
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ /three, /three/four, /three/four/five ]
+---------------------------
+
+If we were to set `reverse` to `true`, it would produce the following:
+
+[source,text]
+---------------------------
+[ one/two/three/, two/three/, three/ ]
+---------------------------
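+
+As a minimal sketch (the index name `my_reverse_index` is only illustrative),
+the reversed variant of the configuration above could be set up like this:
+
+[source,js]
+----------------------------
+PUT my_reverse_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "path_hierarchy",
+          "delimiter": "-",
+          "replacement": "/",
+          "reverse": true,
+          "skip": 2
+        }
+      }
+    }
+  }
+}
+
+POST my_reverse_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "one-two-three-four-five"
+}
+----------------------------
+// CONSOLE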
 

+ 265 - 35
docs/reference/analysis/tokenizers/pattern-tokenizer.asciidoc

@@ -1,38 +1,268 @@
 [[analysis-pattern-tokenizer]]
 === Pattern Tokenizer
 
-A tokenizer of type `pattern` that can flexibly separate text into terms
-via a regular expression. Accepts the following settings:
-
-[cols="<,<",options="header",]
-|======================================================================
-|Setting |Description
-|`pattern` |The regular expression pattern, defaults to `\W+`.
-|`flags` |The regular expression flags.
-|`group` |Which group to extract into tokens. Defaults to `-1` (split).
-|======================================================================
-
-*IMPORTANT*: The regular expression should match the *token separators*,
-not the tokens themselves.
-
-*********************************************
-Note that you may need to escape `pattern` string literal according to
-your client language rules. For example, in many programming languages
-a string literal for `\W+` pattern is written as `"\\W+"`.
-There is nothing special about `pattern` (you may have to escape other
-string literals as well); escaping `pattern` is common just because it
-often contains characters that should be escaped.
-*********************************************
-
-`group` set to `-1` (the default) is equivalent to "split". Using group
->= 0 selects the matching group as the token. For example, if you have:
-
-------------------------
-pattern = '([^']+)'
-group   = 0
-input   = aaa 'bbb' 'ccc'
-------------------------
-
-the output will be two tokens: `'bbb'` and `'ccc'` (including the `'`
-marks). With the same input but using group=1, the output would be:
-`bbb` and `ccc` (no `'` marks).
+The `pattern` tokenizer uses a regular expression to either split text into
+terms whenever it matches a word separator, or to capture matching text as
+terms.
+
+The default pattern is `\W+`, which splits text whenever it encounters
+non-word characters.
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "pattern",
+  "text": "The foo_bar_size's default is 5."
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "foo_bar_size",
+      "start_offset": 4,
+      "end_offset": 16,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "s",
+      "start_offset": 17,
+      "end_offset": 18,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "default",
+      "start_offset": 19,
+      "end_offset": 26,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "is",
+      "start_offset": 27,
+      "end_offset": 29,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "5",
+      "start_offset": 30,
+      "end_offset": 31,
+      "type": "word",
+      "position": 5
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, foo_bar_size, s, default, is, 5 ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `pattern` tokenizer accepts the following parameters:
+
+[horizontal]
+`pattern`::
+
+    A http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java regular expression], defaults to `\W+`.
+
+`flags`::
+
+    Java regular expression http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#field.summary[flags].
+    Flags should be pipe-separated, eg `"CASE_INSENSITIVE|COMMENTS"`.
+
+`group`::
+
+    Which capture group to extract as tokens.  Defaults to `-1` (split).
+
+[float]
+=== Example configuration
+
+In this example, we configure the `pattern` tokenizer to break text into
+tokens when it encounters commas:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "pattern",
+          "pattern": ","
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "comma,separated,values"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "comma",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "separated",
+      "start_offset": 6,
+      "end_offset": 15,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "values",
+      "start_offset": 16,
+      "end_offset": 22,
+      "type": "word",
+      "position": 2
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ comma, separated, values ]
+---------------------------
+
+In the next example, we configure the `pattern` tokenizer to capture values
+enclosed in double quotes (ignoring embedded escaped quotes `\"`).  The regex
+itself looks like this:
+
+    "((?:\\"|[^"]|\\")*)"
+
+And reads as follows:
+
+* A literal `"`
+* Start capturing:
+** A literal `\"` OR any character except `"`
+** Repeat until no more characters match
+* A literal closing `"`
+
+When the pattern is specified in JSON, the `"` and `\` characters need to be
+escaped, so the pattern ends up looking like:
+
+    \"((?:\\\\\"|[^\"]|\\\\\")+)\"
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "pattern",
+          "pattern": "\"((?:\\\\\"|[^\"]|\\\\\")+)\"",
+          "group": 1
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "\"value\", \"value with embedded \\\" quote\""
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "value",
+      "start_offset": 1,
+      "end_offset": 6,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "value with embedded \\\" quote",
+      "start_offset": 10,
+      "end_offset": 38,
+      "type": "word",
+      "position": 1
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+The above example produces the following two terms:
+
+[source,text]
+---------------------------
+[ value, value with embedded \" quote ]
+---------------------------
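+
+The `flags` parameter is not used in the examples above. As a minimal sketch
+(the index name, pattern, and sample text are purely illustrative), a
+`pattern` tokenizer could be configured to split on the word `and`,
+case-insensitively:
+
+[source,js]
+----------------------------
+PUT my_flags_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "pattern",
+          "pattern": "and",
+          "flags": "CASE_INSENSITIVE"
+        }
+      }
+    }
+  }
+}
+
+POST my_flags_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "appleANDbananaandcherry"
+}
+----------------------------
+// CONSOLE
+
+This would split on both `AND` and `and`, producing the terms
+`[ apple, banana, cherry ]`.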

+ 270 - 14
docs/reference/analysis/tokenizers/standard-tokenizer.asciidoc

@@ -1,18 +1,274 @@
 [[analysis-standard-tokenizer]]
 === Standard Tokenizer
 
-A tokenizer of type `standard` providing grammar based tokenizer that is
-a good tokenizer for most European language documents. The tokenizer
-implements the Unicode Text Segmentation algorithm, as specified in
-http://unicode.org/reports/tr29/[Unicode Standard Annex #29].
-
-The following are settings that can be set for a `standard` tokenizer
-type:
-
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`max_token_length` |The maximum token length. If a token is seen that
-exceeds this length then it is split at `max_token_length` intervals. Defaults to `255`.
-|=======================================================================
+The `standard` tokenizer provides grammar based tokenization (based on the
+Unicode Text Segmentation algorithm, as specified in
+http://unicode.org/reports/tr29/[Unicode Standard Annex #29]) and works well
+for most languages.
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "standard",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "<NUM>",
+      "position": 1
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "Brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "Foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "<ALPHANUM>",
+      "position": 7
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "<ALPHANUM>",
+      "position": 8
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "<ALPHANUM>",
+      "position": 9
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "<ALPHANUM>",
+      "position": 10
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `standard` tokenizer accepts the following parameters:
+
+[horizontal]
+`max_token_length`::
+
+    The maximum token length. If a token is seen that exceeds this length then
+    it is split at `max_token_length` intervals. Defaults to `255`.
+
+[float]
+=== Example configuration
+
+In this example, we configure the `standard` tokenizer to have a
+`max_token_length` of 5 (for demonstration purposes):
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "standard",
+          "max_token_length": 5
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "<NUM>",
+      "position": 1
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "Brown",
+      "start_offset": 12,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "Foxes",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "jumpe",
+      "start_offset": 24,
+      "end_offset": 29,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "d",
+      "start_offset": 29,
+      "end_offset": 30,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "<ALPHANUM>",
+      "position": 7
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "<ALPHANUM>",
+      "position": 8
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "<ALPHANUM>",
+      "position": 9
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "<ALPHANUM>",
+      "position": 10
+    },
+    {
+      "token": "bone",
+      "start_offset": 51,
+      "end_offset": 55,
+      "type": "<ALPHANUM>",
+      "position": 11
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ]
+---------------------------
+
+
 

+ 103 - 4
docs/reference/analysis/tokenizers/thai-tokenizer.asciidoc

@@ -1,7 +1,106 @@
 [[analysis-thai-tokenizer]]
 === Thai Tokenizer
 
-A tokenizer of type `thai` that segments Thai text into words. This tokenizer
-uses the built-in Thai segmentation algorithm included with Java to divide
-up Thai text. Text in other languages in general will be treated the same
-as `standard`.
+The `thai` tokenizer segments Thai text into words, using the Thai
+segmentation algorithm included with Java. Text in other languages will, in
+general, be treated in the same way as by the
+<<analysis-standard-tokenizer,`standard` tokenizer>>.
+
+WARNING: This tokenizer may not be supported by all JREs. It is known to work
+with Sun/Oracle and OpenJDK. If your application needs to be fully portable,
+consider using the {plugins}/analysis-icu-tokenizer.html[ICU Tokenizer] instead.
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "thai",
+  "text": "การที่ได้ต้องแสดงว่างานดี"
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "การ",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "ที่",
+      "start_offset": 3,
+      "end_offset": 6,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "ได้",
+      "start_offset": 6,
+      "end_offset": 9,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "ต้อง",
+      "start_offset": 9,
+      "end_offset": 13,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "แสดง",
+      "start_offset": 13,
+      "end_offset": 17,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "ว่า",
+      "start_offset": 17,
+      "end_offset": 20,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "งาน",
+      "start_offset": 20,
+      "end_offset": 23,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "ดี",
+      "start_offset": 23,
+      "end_offset": 25,
+      "type": "word",
+      "position": 7
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ การ, ที่, ได้, ต้อง, แสดง, ว่า, งาน, ดี ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `thai` tokenizer is not configurable.

+ 194 - 11
docs/reference/analysis/tokenizers/uaxurlemail-tokenizer.asciidoc

@@ -1,16 +1,199 @@
 [[analysis-uaxurlemail-tokenizer]]
-=== UAX Email URL Tokenizer
+=== UAX URL Email Tokenizer
 
-A tokenizer of type `uax_url_email` which works exactly like the
-`standard` tokenizer, but tokenizes emails and urls as single tokens.
+The `uax_url_email` tokenizer is like the <<analysis-standard-tokenizer,`standard` tokenizer>> except that it
+recognises URLs and email addresses as single tokens.
 
-The following are settings that can be set for a `uax_url_email`
-tokenizer type:
+[float]
+=== Example output
 
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`max_token_length` |The maximum token length. If a token is seen that
-exceeds this length then it is discarded. Defaults to `255`.
-|=======================================================================
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "uax_url_email",
+  "text": "Email me at john.smith@global-international.com"
+}
+---------------------------
+// CONSOLE
 
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "Email",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "me",
+      "start_offset": 6,
+      "end_offset": 8,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "at",
+      "start_offset": 9,
+      "end_offset": 11,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "john.smith@global-international.com",
+      "start_offset": 12,
+      "end_offset": 47,
+      "type": "<EMAIL>",
+      "position": 3
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ Email, me, at, john.smith@global-international.com ]
+---------------------------
+
+while the `standard` tokenizer would produce:
+
+[source,text]
+---------------------------
+[ Email, me, at, john.smith, global, international.com ]
+---------------------------
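+
+You can verify the difference yourself by running the same text through the
+<<analysis-standard-tokenizer,`standard` tokenizer>>:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "standard",
+  "text": "Email me at john.smith@global-international.com"
+}
+---------------------------
+// CONSOLE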
+
+[float]
+=== Configuration
+
+The `uax_url_email` tokenizer accepts the following parameters:
+
+[horizontal]
+`max_token_length`::
+
+    The maximum token length. If a token is seen that exceeds this length then
+    it is split at `max_token_length` intervals. Defaults to `255`.
+
+[float]
+=== Example configuration
+
+In this example, we configure the `uax_url_email` tokenizer to have a
+`max_token_length` of 5 (for demonstration purposes):
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "my_tokenizer"
+        }
+      },
+      "tokenizer": {
+        "my_tokenizer": {
+          "type": "uax_url_email",
+          "max_token_length": 5
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "john.smith@global-international.com"
+}
+----------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "john",
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "smith",
+      "start_offset": 5,
+      "end_offset": 10,
+      "type": "<ALPHANUM>",
+      "position": 1
+    },
+    {
+      "token": "globa",
+      "start_offset": 11,
+      "end_offset": 16,
+      "type": "<ALPHANUM>",
+      "position": 2
+    },
+    {
+      "token": "l",
+      "start_offset": 16,
+      "end_offset": 17,
+      "type": "<ALPHANUM>",
+      "position": 3
+    },
+    {
+      "token": "inter",
+      "start_offset": 18,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "natio",
+      "start_offset": 23,
+      "end_offset": 28,
+      "type": "<ALPHANUM>",
+      "position": 5
+    },
+    {
+      "token": "nal.c",
+      "start_offset": 28,
+      "end_offset": 33,
+      "type": "<ALPHANUM>",
+      "position": 6
+    },
+    {
+      "token": "om",
+      "start_offset": 33,
+      "end_offset": 35,
+      "type": "<ALPHANUM>",
+      "position": 7
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ john, smith, globa, l, inter, natio, nal.c, om ]
+---------------------------

+ 112 - 2
docs/reference/analysis/tokenizers/whitespace-tokenizer.asciidoc

@@ -1,4 +1,114 @@
 [[analysis-whitespace-tokenizer]]
-=== Whitespace Tokenizer
+=== Whitespace Tokenizer
 
-A tokenizer of type `whitespace` that divides text at whitespace.
+The `whitespace` tokenizer breaks text into terms whenever it encounters a
+whitespace character.
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "whitespace",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+/////////////////////
+
+[source,js]
+----------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 4,
+      "end_offset": 5,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 6,
+      "end_offset": 11,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "Brown-Foxes",
+      "start_offset": 12,
+      "end_offset": 23,
+      "type": "word",
+      "position": 3
+    },
+    {
+      "token": "jumped",
+      "start_offset": 24,
+      "end_offset": 30,
+      "type": "word",
+      "position": 4
+    },
+    {
+      "token": "over",
+      "start_offset": 31,
+      "end_offset": 35,
+      "type": "word",
+      "position": 5
+    },
+    {
+      "token": "the",
+      "start_offset": 36,
+      "end_offset": 39,
+      "type": "word",
+      "position": 6
+    },
+    {
+      "token": "lazy",
+      "start_offset": 40,
+      "end_offset": 44,
+      "type": "word",
+      "position": 7
+    },
+    {
+      "token": "dog's",
+      "start_offset": 45,
+      "end_offset": 50,
+      "type": "word",
+      "position": 8
+    },
+    {
+      "token": "bone.",
+      "start_offset": 51,
+      "end_offset": 56,
+      "type": "word",
+      "position": 9
+    }
+  ]
+}
+----------------------------
+// TESTRESPONSE
+
+/////////////////////
+
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `whitespace` tokenizer is not configurable.