
First pass at improving analyzer docs (#18269)

* Docs: First pass at improving analyzer docs

I've rewritten the intro to analyzers plus the docs
for all analyzers to provide working examples.

I've also removed:

* analyzer aliases (see #18244)
* analyzer versions (see #18267)
* snowball analyzer (see #8690)

Next steps will be tokenizers, token filters, and char filters.

* Fixed two typos
Clinton Gormley, 9 years ago
commit 97a41ee973

docs/reference/analysis.asciidoc (+96 -51)

@@ -3,68 +3,113 @@
 
 [partintro]
 --
-The index analysis module acts as a configurable registry of Analyzers
-that can be used in order to both break indexed (analyzed) fields when a
-document is indexed and process query strings. It maps to the Lucene
-`Analyzer`.
 
+_Analysis_ is the process of converting text, like the body of any email, into
+_tokens_ or _terms_ which are added to the inverted index for searching.
+Analysis is performed by an <<analysis-analyzers,_analyzer_>> which can be
+either a built-in analyzer or a <<analysis-custom-analyzer,`custom`>> analyzer
+defined per index.
 
-Analyzers are composed of a single <<analysis-tokenizers,Tokenizer>> 
-and zero or more <<analysis-tokenfilters,TokenFilters>>. The tokenizer may 
-be preceded by one or more <<analysis-charfilters,CharFilters>>. The
-analysis module allows one to register `TokenFilters`, `Tokenizers` and
-`Analyzers` under logical names that can then be referenced either in
-mapping definitions or in certain APIs. The Analysis module
-automatically registers (*if not explicitly defined*) built in
-analyzers, token filters, and tokenizers.
+[float]
+== Index time analysis
+
+For instance, at index time the built-in <<english-analyzer,`english`>> _analyzer_ would
+convert this sentence:
+
+[source,text]
+------
+"The QUICK brown foxes jumped over the lazy dog!"
+------
 
-Here is a sample configuration:
+into these terms, which would be added to the inverted index:
+
+[source,text]
+------
+[ quick, brown, fox, jump, over, lazi, dog ]
+------
+
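+For example, the same terms can be reproduced with the
+<<indices-analyze,`analyze` API>>, which is covered in more detail later.
+A minimal sketch of such a request:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "analyzer": "english",
+  "text": "The QUICK brown foxes jumped over the lazy dog!"
+}
+---------------------------
+// CONSOLE
+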
+[float]
+=== Specifying an index time analyzer
+
+Each <<text,`text`>> field in a mapping can specify its own
+<<analyzer,`analyzer`>>:
 
 [source,js]
---------------------------------------------------
-index :
-    analysis :
-        analyzer : 
-            standard : 
-                type : standard
-                stopwords : [stop1, stop2]
-            myAnalyzer1 :
-                type : standard
-                stopwords : [stop1, stop2, stop3]
-                max_token_length : 500
-            # configure a custom analyzer which is 
-            # exactly like the default standard analyzer
-            myAnalyzer2 :
-                tokenizer : standard
-                filter : [standard, lowercase, stop]
-        tokenizer :
-            myTokenizer1 :
-                type : standard
-                max_token_length : 900
-            myTokenizer2 :
-                type : keyword
-                buffer_size : 512
-        filter :
-            myTokenFilter1 :
-                type : stop
-                stopwords : [stop1, stop2, stop3, stop4]
-            myTokenFilter2 :
-                type : length
-                min : 0
-                max : 2000
---------------------------------------------------
+-------------------------
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "title": {
+          "type":     "text",
+          "analyzer": "standard"
+        }
+      }
+    }
+  }
+}
+-------------------------
+// CONSOLE
+
+At index time, if no `analyzer` has been specified, Elasticsearch looks for an
+analyzer in the index settings called `default`. Failing that, it defaults to
+using the <<analysis-standard-analyzer,`standard` analyzer>>.
+
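+For example, a `default` analyzer could be registered in the index settings.
+This is only a minimal sketch, which makes a configured `standard` analyzer
+(with English stop words) the index-wide default:
+
+[source,js]
+-------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "default": {
+          "type":      "standard",
+          "stopwords": "_english_"
+        }
+      }
+    }
+  }
+}
+-------------------------
+// CONSOLE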
 
 [float]
-[[backwards-compatibility]]
-=== Backwards compatibility
+== Search time analysis
+
+This same analysis process is applied to the query string at search time in
+<<full-text-queries,full text queries>> like the
+<<query-dsl-match-query,`match` query>>
+to convert the text in the query string into terms of the same form as those
+that are stored in the inverted index.
+
+For instance, a user might search for:
 
-All analyzers, tokenizers, and token filters can be configured with a
-`version` parameter to control which Lucene version behavior they should
-use. Possible values are: `3.0` - `3.6`, `4.0` - `4.3` (the highest
-version number is the default option).
+[source,text]
+------
+"a quick fox"
+------
+
+which would be analyzed by the same `english` analyzer into the following terms:
+
+[source,text]
+------
+[ quick, fox ]
+------
+
+The exact words used in the query string don't appear in the original text
+(`quick` vs `QUICK`, `fox` vs `foxes`). However, because the same analyzer has
+been applied to both the text and the query string, the terms from the query
+string exactly match the terms from the text in the inverted index, which
+means that this query would match our example document.
+
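+A minimal sketch of such a query, reusing the `my_index` mapping defined
+above (where the `title` field uses the `standard` analyzer), might look
+like this:
+
+[source,js]
+-------------------------
+GET my_index/_search
+{
+  "query": {
+    "match": {
+      "title": "a quick fox"
+    }
+  }
+}
+-------------------------
+// CONSOLE
+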
+[float]
+=== Specifying a search time analyzer
+
+Usually the same analyzer should be used both at
+index time and at search time, and <<full-text-queries,full text queries>>
+like the  <<query-dsl-match-query,`match` query>> will use the mapping to look
+up the analyzer to use for each field.
+
+The analyzer used to search a particular field is determined by looking, in
+order, for:
+
+* An `analyzer` specified in the query itself.
+* The <<search-analyzer,`search_analyzer`>> mapping parameter.
+* The <<analyzer,`analyzer`>> mapping parameter.
+* An analyzer in the index settings called `default_search`.
+* An analyzer in the index settings called `default`.
+* The `standard` analyzer.
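+
+For example, the `search_analyzer` listed above could be specified in the
+field mapping. This is only a sketch; the `simple` analyzer is chosen here
+purely for illustration:
+
+[source,js]
+-------------------------
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "title": {
+          "type":            "text",
+          "analyzer":        "standard",
+          "search_analyzer": "simple"
+        }
+      }
+    }
+  }
+}
+-------------------------
+// CONSOLE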
 
 --
 
+include::analysis/anatomy.asciidoc[]
+
+include::analysis/testing.asciidoc[]
+
 include::analysis/analyzers.asciidoc[]
 
 include::analysis/tokenizers.asciidoc[]

docs/reference/analysis/analyzers.asciidoc (+48 -57)

@@ -1,67 +1,60 @@
 [[analysis-analyzers]]
 == Analyzers
 
-Analyzers are composed of a single <<analysis-tokenizers,Tokenizer>> 
-and zero or more <<analysis-tokenfilters,TokenFilters>>. The tokenizer may 
-be preceded by one or more <<analysis-charfilters,CharFilters>>.
-The analysis module allows you to register `Analyzers` under logical
-names which can then be referenced either in mapping definitions or in
-certain APIs. 
-
-Elasticsearch comes with a number of prebuilt analyzers which are
-ready to use.  Alternatively, you can combine the built in
-character filters, tokenizers and token filters to create 
-<<analysis-custom-analyzer,custom analyzers>>.
+Elasticsearch ships with a wide range of built-in analyzers, which can be used
+in any index without further configuration:
 
-[float]
-[[default-analyzers]]
-=== Default Analyzers
-
-An analyzer is registered under a logical name. It can then be
-referenced from mapping definitions or certain APIs. When none are
-defined, defaults are used. There is an option to define which analyzers
-will be used by default when none can be derived.
-
-The `default` logical name allows one to configure an analyzer that will
-be used both for indexing and for searching APIs. The `default_search`
-logical name can be used to configure a default analyzer that will be
-used just when searching (the `default` analyzer would still be used for
-indexing).
-
-For instance, the following settings could be used to perform exact matching
-only by default:
-
-[source,js]
---------------------------------------------------
-index :
-  analysis :
-    analyzer :
-      default :
-        tokenizer : keyword
---------------------------------------------------
+<<analysis-standard-analyzer,Standard Analyzer>>::
 
-[float]
-[[aliasing-analyzers]]
-=== Aliasing Analyzers
+The `standard` analyzer divides text into terms on word boundaries, as defined
+by the Unicode Text Segmentation algorithm. It removes most punctuation,
+lowercases terms, and supports removing stop words.
+
+<<analysis-simple-analyzer,Simple Analyzer>>::
+
+The `simple` analyzer divides text into terms whenever it encounters a
+character which is not a letter.  It lowercases all terms.
+
+<<analysis-whitespace-analyzer,Whitespace Analyzer>>::
+
+The `whitespace` analyzer divides text into terms whenever it encounters any
+whitespace character.  It does not lowercase terms.
+
+<<analysis-stop-analyzer,Stop Analyzer>>::
+
+The `stop` analyzer is like the `simple` analyzer, but also supports removal
+of stop words.
+
+<<analysis-keyword-analyzer,Keyword Analyzer>>::
 
-Analyzers can be aliased to have several registered lookup names
-associated with them. For example, the following will allow 
-the `standard` analyzer to also be referenced with `alias1`
-and `alias2` values.
+The `keyword` analyzer is a ``noop'' analyzer that accepts whatever text it is
+given and outputs the exact same text as a single term.
 
+<<analysis-pattern-analyzer,Pattern Analyzer>>::
 
-[source,js]
---------------------------------------------------
-index :
-  analysis :
-    analyzer :
-      standard :
-        alias: [alias1, alias2]
-        type : standard
-        stopwords : [test1, test2, test3]
---------------------------------------------------
+The `pattern` analyzer uses a regular expression to split the text into terms.
+It supports lower-casing and stop words.
 
-Below is a list of the built in analyzers.
+<<analysis-lang-analyzer,Language Analyzers>>::
+
+Elasticsearch provides many language-specific analyzers like `english` or
+`french`.
+
+<<analysis-fingerprint-analyzer,Fingerprint Analyzer>>::
+
+The `fingerprint` analyzer is a specialist analyzer which creates a
+fingerprint which can be used for duplicate detection.
+
+[float]
+=== Custom analyzers
+
+If you do not find an analyzer suitable for your needs, you can create a
+<<analysis-custom-analyzer,`custom`>> analyzer which combines the appropriate
+<<analysis-charfilters, character filters>>,
+<<analysis-tokenizers,tokenizer>>, and <<analysis-tokenfilters,token filters>>.
+
+
+include::analyzers/configuring.asciidoc[]
 
 include::analyzers/standard-analyzer.asciidoc[]
 
@@ -77,8 +70,6 @@ include::analyzers/pattern-analyzer.asciidoc[]
 
 include::analyzers/lang-analyzer.asciidoc[]
 
-include::analyzers/snowball-analyzer.asciidoc[]
-
 include::analyzers/fingerprint-analyzer.asciidoc[]
 
 include::analyzers/custom-analyzer.asciidoc[]

docs/reference/analysis/analyzers/configuring.asciidoc (+66 -0)

@@ -0,0 +1,66 @@
+[[configuring-analyzers]]
+=== Configuring built-in analyzers
+
+The built-in analyzers can be used directly without any configuration.  Some
+of them, however, support configuration options to alter their behaviour.  For
+instance, the <<analysis-standard-analyzer,`standard` analyzer>> can be configured
+to support a list of stop words:
+
+[source,js]
+--------------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "std_english": { <1>
+          "type":      "standard",
+          "stopwords": "_english_"
+        }
+      }
+    }
+  },
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "my_text": {
+          "type":     "text",
+          "analyzer": "standard", <2>
+          "fields": {
+            "english": {
+              "type":     "text",
+              "analyzer": "std_english" <3>
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "field": "my_text", <2>
+  "text": "The old brown cow"
+}
+
+POST my_index/_analyze
+{
+  "field": "my_text.english", <3>
+  "text": "The old brown cow"
+}
+
+--------------------------------
+// CONSOLE
+
+<1> We define the `std_english` analyzer to be based on the `standard`
+    analyzer, but configured to remove the pre-defined list of English stopwords.
+<2> The `my_text` field uses the `standard` analyzer directly, without
+    any configuration.  No stop words will be removed from this field.
+    The resulting terms are: `[ the, old, brown, cow ]`
+<3> The `my_text.english` field uses the `std_english` analyzer, so
+    English stop words will be removed.  The resulting terms are:
+    `[ old, brown, cow ]`
+

docs/reference/analysis/analyzers/custom-analyzer.asciidoc (+165 -45)

@@ -1,59 +1,179 @@
 [[analysis-custom-analyzer]]
 === Custom Analyzer
 
-An analyzer of type `custom` that allows to combine a `Tokenizer` with
-zero or more `Token Filters`, and zero or more `Char Filters`. The
-custom analyzer accepts a logical/registered name of the tokenizer to
-use, and a list of logical/registered names of token filters.
-The name of the custom analyzer must not start with "_".
+When the built-in analyzers do not fulfill your needs, you can create a
+`custom` analyzer which uses the appropriate combination of:
 
-The following are settings that can be set for a `custom` analyzer type:
+* zero or more <<analysis-charfilters, character filters>>
+* a <<analysis-tokenizers,tokenizer>>
+* zero or more <<analysis-tokenfilters,token filters>>.
 
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`tokenizer` |The logical / registered name of the tokenizer to use.
+[float]
+=== Configuration
 
-|`filter` |An optional list of logical / registered name of token
-filters.
+The `custom` analyzer accepts the following parameters:
 
-|`char_filter` |An optional list of logical / registered name of char
-filters.
+[horizontal]
+`tokenizer`::
+
+    A built-in or customised <<analysis-tokenizers,tokenizer>>.
+    (Required)
+
+`char_filter`::
+
+    An optional array of built-in or customised
+    <<analysis-charfilters, character filters>>.
+
+`filter`::
+
+    An optional array of built-in or customised
+    <<analysis-tokenfilters, token filters>>.
+
+`position_increment_gap`::
+
+    When indexing an array of text values, Elasticsearch inserts a fake "gap"
+    between the last term of one value and the first term of the next value to
+    ensure that a phrase query doesn't match two terms from different array
+    elements.  Defaults to `100`. See <<position-increment-gap>> for more.
+
+[float]
+=== Example configuration
+
+Here is an example that combines the following:
+
+Character Filter::
+* <<analysis-htmlstrip-charfilter,HTML Strip Character Filter>>
+
+Tokenizer::
+* <<analysis-standard-tokenizer,Standard Tokenizer>>
+
+Token Filters::
+* <<analysis-lowercase-tokenfilter,Lowercase Token Filter>>
+* <<analysis-asciifolding-tokenfilter,ASCII-Folding Token Filter>>
+
+[source,js]
+--------------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_custom_analyzer": {
+          "type":      "custom",
+          "tokenizer": "standard",
+          "char_filter": [
+            "html_strip"
+          ],
+          "filter": [
+            "lowercase",
+            "asciifolding"
+          ]
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_custom_analyzer",
+  "text": "Is this <b>déjà vu</b>?"
+}
+--------------------------------
+// CONSOLE
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ is, this, deja, vu ]
+---------------------------
+
+The previous example used a tokenizer, token filters, and character filters with
+their default configurations, but it is possible to create configured versions
+of each and to use them in a custom analyzer.
+
+Here is a more complicated example that combines the following:
+
+Character Filter::
+* <<analysis-mapping-charfilter,Mapping Character Filter>>, configured to replace `:)` with `_happy_` and `:(` with `_sad_`
+
+Tokenizer::
+*  <<analysis-pattern-tokenizer,Pattern Tokenizer>>, configured to split on punctuation characters
+
+Token Filters::
+* <<analysis-lowercase-tokenfilter,Lowercase Token Filter>>
+* <<analysis-stop-tokenfilter,Stop Token Filter>>, configured to use the pre-defined list of English stop words
 
-|`position_increment_gap` |An optional number of positions to increment
-between each field value of a field using this analyzer. Defaults to 100.
-100 was chosen because it prevents phrase queries with reasonably large
-slops (less than 100) from matching terms across field values.
-|=======================================================================
 
 Here is an example:
 
 [source,js]
 --------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : myTokenizer1
-                filter : [myTokenFilter1, myTokenFilter2]
-                char_filter : [my_html]
-                position_increment_gap: 256
-        tokenizer :
-            myTokenizer1 :
-                type : standard
-                max_token_length : 900
-        filter :
-            myTokenFilter1 :
-                type : stop
-                stopwords : [stop1, stop2, stop3, stop4]
-            myTokenFilter2 :
-                type : length
-                min : 0
-                max : 2000
-        char_filter :
-              my_html :
-                type : html_strip
-                escaped_tags : [xxx, yyy]
-                read_ahead : 1024
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_custom_analyzer": {
+          "type": "custom",
+
+          "char_filter": [
+            "emoticons" <1>
+          ],
+
+          "tokenizer": "punctuation", <1>
+
+          "filter": [
+            "lowercase",
+            "english_stop" <1>
+          ]
+        }
+      },
+      "tokenizer": {
+        "punctuation": { <1>
+          "type": "pattern",
+          "pattern": "[ .,!?]"
+        }
+      },
+      "char_filter": {
+        "emoticons": { <1>
+          "type": "mapping",
+          "mappings": [
+            ":) => _happy_",
+            ":( => _sad_"
+          ]
+        }
+      },
+      "filter": {
+        "english_stop": { <1>
+          "type": "stop",
+          "stopwords": "_english_"
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_custom_analyzer",
+  "text":     "I'm a :) person, and you?"
+}
 --------------------------------------------------
+
+<1> The `emoticons` character filter, `punctuation` tokenizer, and
+    `english_stop` token filter are custom implementations which are defined
+    in the same index settings.
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ i'm, _happy_, person, you ]
+---------------------------
+

docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc (+104 -26)

@@ -5,37 +5,115 @@ The `fingerprint` analyzer implements a
 https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth#fingerprint[fingerprinting algorithm]
 which is used by the OpenRefine project to assist in clustering.
 
-The `fingerprint` analyzer is composed of a <<analysis-standard-tokenizer>>, and four
-token filters (in this order): <<analysis-lowercase-tokenfilter>>, <<analysis-stop-tokenfilter>>,
-<<analysis-fingerprint-tokenfilter>> and <<analysis-asciifolding-tokenfilter>>.
+Input text is lowercased, normalized to remove extended characters, sorted,
+deduplicated and concatenated into a single token.  If a stopword list is
+configured, stop words will also be removed.
 
-Input text is lowercased, normalized to remove extended characters, sorted, deduplicated and
-concatenated into a single token.  If a stopword list is configured, stop words will
-also be removed. For example, the sentence:
+[float]
+=== Definition
 
-____
-"Yes yes, Gödel said this sentence is consistent and."
-____
+It consists of:
 
-will be transformed into the token: `"and consistent godel is said sentence this yes"`
+Tokenizer::
+* <<analysis-standard-tokenizer,Standard Tokenizer>>
 
+Token Filters::
+* <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
+* <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
+* <<analysis-fingerprint-tokenfilter,Fingerprint Token Filter>>
+* <<analysis-asciifolding-tokenfilter,ASCII Folding Token Filter>>
 
-Notice how the words are all lowercased, the umlaut in "gödel" has been normalized to "godel",
-punctuation has been removed, and "yes" has been de-duplicated.
+[float]
+=== Example output
 
-The `fingerprint` analyzer has these configurable settings
+[source,js]
+---------------------------
+POST _analyze
+{
+  "analyzer": "fingerprint",
+  "text": "Yes yes, Gödel said this sentence is consistent and."
+}
+---------------------------
+// CONSOLE
 
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`separator` | The character that separates the tokens after concatenation.
-Defaults to a space.
-|`max_output_size` | The maximum token size to emit. Defaults to `255`. See <<analysis-fingerprint-tokenfilter-max-size>>
-|`preserve_original`| If true, emits both the original and folded version of
- tokens that contain extended characters.  Defaults to `false`
-|`stopwords` | A list of stop words to use. Defaults to an empty list (`_none_`).
-|`stopwords_path` | A path (either relative to `config` location, or absolute) to a stopwords
-                        file configuration. Each stop word should be in its own "line" (separated
-                        by a line break). The file must be UTF-8 encoded.
-|=======================================================================
+The above sentence would produce the following single term:
 
+[source,text]
+---------------------------
+[ and consistent godel is said sentence this yes ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `fingerprint` analyzer accepts the following parameters:
+
+[horizontal]
+`separator`::
+
+    The character to use to concatenate the terms. Defaults to a space.
+
+`max_output_size`::
+
+    The maximum token size to emit.  Defaults to `255`. Tokens larger than
+    this size will be discarded.
+
+`preserve_original`::
+
+    If `true`, emits two tokens: one with ASCII-folding of terms that contain
+    extended characters (if any) and one with the original characters.
+    Defaults to `false`.
+
+`stopwords`::
+
+    A pre-defined stop words list like `_english_` or an array  containing a
+    list of stop words.  Defaults to `_none_`.
+
+`stopwords_path`::
+
+    The path to a file containing stop words.
+
+See the <<analysis-stop-tokenfilter,Stop Token Filter>> for more information
+about stop word configuration.
+
+
+[float]
+=== Example configuration
+
+In this example, we configure the `fingerprint` analyzer to use the
+pre-defined list of English stop words, and to emit a second token in
+the presence of non-ASCII characters:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_fingerprint_analyzer": {
+          "type": "fingerprint",
+          "stopwords": "_english_",
+          "preserve_original": true
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_fingerprint_analyzer",
+  "text": "Yes yes, Gödel said this sentence is consistent and."
+}
+----------------------------
+// CONSOLE
+
+The above example produces the following two terms:
+
+[source,text]
+---------------------------
+[ consistent godel said sentence yes, consistent gödel said sentence yes ]
+---------------------------

docs/reference/analysis/analyzers/keyword-analyzer.asciidoc (+35 -4)

@@ -1,7 +1,38 @@
 [[analysis-keyword-analyzer]]
 === Keyword Analyzer
 
-An analyzer of type `keyword` that "tokenizes" an entire stream as a
-single token. This is useful for data like zip codes, ids and so on.
-Note, when using mapping definitions, it might make more sense to simply
-map the field as a <<keyword,`keyword`>>.
+The `keyword` analyzer is a ``noop'' analyzer which returns the entire input
+string as a single token.
+
+[float]
+=== Definition
+
+It consists of:
+
+Tokenizer::
+* <<analysis-keyword-tokenizer,Keyword Tokenizer>>
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "analyzer": "keyword",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+The above sentence would produce the following single term:
+
+[source,text]
+---------------------------
+[ The 2 QUICK Brown-Foxes jumped over the lazy dog's bone. ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `keyword` analyzer is not configurable.

docs/reference/analysis/analyzers/lang-analyzer.asciidoc (+8 -8)

@@ -303,8 +303,8 @@ The `catalan` analyzer could be reimplemented as a `custom` analyzer as follows:
     "analysis": {
       "filter": {
         "catalan_elision": {
-        "type":         "elision",
-            "articles": [ "d", "l", "m", "n", "s", "t"]
+          "type":       "elision",
+          "articles":   [ "d", "l", "m", "n", "s", "t"]
         },
         "catalan_stop": {
           "type":       "stop",
@@ -623,10 +623,10 @@ The `french` analyzer could be reimplemented as a `custom` analyzer as follows:
         "french_elision": {
           "type":         "elision",
           "articles_case": true,
-            "articles": [ 
+          "articles": [
               "l", "m", "t", "qu", "n", "s",
-              "j", "d", "c", "jusqu", "quoiqu", 
-              "lorsqu", "puisqu" 
+              "j", "d", "c", "jusqu", "quoiqu",
+              "lorsqu", "puisqu"
             ]
         },
         "french_stop": {
@@ -1000,13 +1000,13 @@ The `italian` analyzer could be reimplemented as a `custom` analyzer as follows:
     "analysis": {
       "filter": {
         "italian_elision": {
-        "type":         "elision",
-            "articles": [
+          "type": "elision",
+          "articles": [
                 "c", "l", "all", "dall", "dell",
                 "nell", "sull", "coll", "pell",
                 "gl", "agl", "dagl", "degl", "negl",
                 "sugl", "un", "m", "t", "s", "v", "d"
-            ]
+          ]
         },
         "italian_stop": {
           "type":       "stop",

docs/reference/analysis/analyzers/pattern-analyzer.asciidoc (+101 -56)

@@ -1,72 +1,96 @@
 [[analysis-pattern-analyzer]]
 === Pattern Analyzer
 
-An analyzer of type `pattern` that can flexibly separate text into terms
-via a regular expression. Accepts the following settings:
+The `pattern` analyzer uses a regular expression to split the text into terms.
+The regular expression should match the *token separators*, not the tokens
+themselves. The regular expression defaults to `\W+` (or all non-word characters).
 
-The following are settings that can be set for a `pattern` analyzer
-type:
+[float]
+=== Definition
 
-[horizontal]
-`lowercase`::   Should terms be lowercased or not. Defaults to `true`.
-`pattern`::     The regular expression pattern, defaults to `\W+`.
-`flags`::       The regular expression flags.
-`stopwords`::   A list of stopwords to initialize the stop filter with.
-                Defaults to an 'empty' stopword list Check
-                <<analysis-stop-analyzer,Stop Analyzer>> for more details.
+It consists of:
 
-*IMPORTANT*: The regular expression should match the *token separators*,
-not the tokens themselves.
+Tokenizer::
+* <<analysis-pattern-tokenizer,Pattern Tokenizer>>
 
-Flags should be pipe-separated, eg `"CASE_INSENSITIVE|COMMENTS"`. Check
-http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html#field_summary[Java
-Pattern API] for more details about `flags` options.
+Token Filters::
+*  <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
+*  <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
 
 [float]
-==== Pattern Analyzer Examples
-
-In order to try out these examples, you should delete the `test` index
-before running each example.
-
-[float]
-===== Whitespace tokenizer
+=== Example output
 
 [source,js]
---------------------------------------------------
-PUT test
+---------------------------
+POST _analyze
 {
-  "settings": {
-    "analysis": {
-      "analyzer": {
-        "whitespace": {
-          "type": "pattern",
-          "pattern": "\\s+"
-        }
-      }
-    }
-  }
+  "analyzer": "pattern",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
 }
+---------------------------
+// CONSOLE
 
-GET _cluster/health?wait_for_status=yellow
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ the, 2, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `pattern` analyzer accepts the following parameters:
+
+[horizontal]
+`pattern`::
+
+    A http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java regular expression], defaults to `\W+`.
+
+`flags`::
+
+    Java regular expression http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html#field.summary[flags].
+    Flags should be pipe-separated, e.g. `"CASE_INSENSITIVE|COMMENTS"`.
+
+`lowercase`::
+
+    Should terms be lowercased or not. Defaults to `true`.
+
+`max_token_length`::
+
+    The maximum token length. If a token is seen that exceeds this length then
+    it is split at `max_token_length` intervals. Defaults to `255`.
+
+`stopwords`::
+
+    A pre-defined stop words list like `_english_` or an array  containing a
+    list of stop words.  Defaults to `_none_`.
+
+`stopwords_path`::
+
+    The path to a file containing stop words.
+
+See the <<analysis-stop-tokenfilter,Stop Token Filter>> for more information
+about stop word configuration.
 
-GET test/_analyze?analyzer=whitespace&text=foo,bar baz
-# "foo,bar", "baz"
---------------------------------------------------
-// CONSOLE
 
 [float]
-===== Non-word character tokenizer
+=== Example configuration
+
+In this example, we configure the `pattern` analyzer to split email addresses
+on non-word characters or on underscores (`\W|_`), and to lower-case the result:
 
 [source,js]
---------------------------------------------------
-PUT test
+----------------------------
+PUT my_index
 {
   "settings": {
     "analysis": {
       "analyzer": {
-        "nonword": {
-          "type": "pattern",
-          "pattern": "[^\\w]+" <1>
+        "my_email_analyzer": {
+          "type":      "pattern",
+          "pattern":   "\\W|_", <1>
+          "lowercase": true
         }
       }
     }
@@ -75,21 +99,32 @@ PUT test
 
 GET _cluster/health?wait_for_status=yellow
 
-GET test/_analyze?analyzer=nonword&text=foo,bar baz
-# "foo,bar baz" becomes "foo", "bar", "baz"
-
-GET test/_analyze?analyzer=nonword&text=type_1-type_4
-# "type_1","type_4"
---------------------------------------------------
+POST my_index/_analyze
+{
+  "analyzer": "my_email_analyzer",
+  "text": "John_Smith@foo-bar.com"
+}
+----------------------------
 // CONSOLE
 
+<1> The backslashes in the pattern need to be escaped when specifying the
+    pattern as a JSON string.
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ john, smith, foo, bar, com ]
+---------------------------
 
 [float]
-===== CamelCase tokenizer
+==== CamelCase tokenizer
+
+The following more complicated example splits CamelCase text into tokens:
 
 [source,js]
 --------------------------------------------------
-PUT test?pretty=1
+PUT my_index
 {
   "settings": {
     "analysis": {
@@ -105,11 +140,21 @@ PUT test?pretty=1
 
 GET _cluster/health?wait_for_status=yellow
 
-GET test/_analyze?analyzer=camel&text=MooseX::FTPClass2_beta
-# "moose","x","ftp","class","2","beta"
+GET my_index/_analyze
+{
+  "analyzer": "camel",
+  "text": "MooseX::FTPClass2_beta"
+}
 --------------------------------------------------
 // CONSOLE
 
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ moose, x, ftp, class, 2, beta ]
+---------------------------
+
 The regex above is easier to understand as:
 
 [source,js]

docs/reference/analysis/analyzers/simple-analyzer.asciidoc (+35 -3)

@@ -1,6 +1,38 @@
 [[analysis-simple-analyzer]]
 === Simple Analyzer
 
-An analyzer of type `simple` that is built using a
-<<analysis-lowercase-tokenizer,Lower
-Case Tokenizer>>.
+The `simple` analyzer breaks text into terms whenever it encounters a
+character which is not a letter. All terms are lower cased.
+
+[float]
+=== Definition
+
+It consists of:
+
+Tokenizer::
+* <<analysis-lowercase-tokenizer,Lower Case Tokenizer>>
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "analyzer": "simple",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ the, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `simple` analyzer is not configurable.

docs/reference/analysis/analyzers/snowball-analyzer.asciidoc (+0 -64)

@@ -1,64 +0,0 @@
-[[analysis-snowball-analyzer]]
-=== Snowball Analyzer
-
-An analyzer of type `snowball` that uses the
-<<analysis-standard-tokenizer,standard
-tokenizer>>, with
-<<analysis-standard-tokenfilter,standard
-filter>>,
-<<analysis-lowercase-tokenfilter,lowercase
-filter>>,
-<<analysis-stop-tokenfilter,stop
-filter>>, and
-<<analysis-snowball-tokenfilter,snowball
-filter>>.
-
-The Snowball Analyzer is a stemming analyzer from Lucene that is
-originally based on the snowball project from
-http://snowballstem.org[snowballstem.org].
-
-Sample usage:
-
-[source,js]
---------------------------------------------------
-{
-    "index" : {
-        "analysis" : {
-            "analyzer" : {
-                "my_analyzer" : {
-                    "type" : "snowball",
-                    "language" : "English"
-                }
-            }
-        }
-    }
-}
---------------------------------------------------
-
-The `language` parameter can have the same values as the
-<<analysis-snowball-tokenfilter,snowball
-filter>> and defaults to `English`. Note that not all the language
-analyzers have a default set of stopwords provided.
-
-The `stopwords` parameter can be used to provide stopwords for the
-languages that have no defaults, or to simply replace the default set
-with your custom list. Check <<analysis-stop-analyzer,Stop Analyzer>>
-for more details. A default set of stopwords for many of these
-languages is available from for instance
-https://github.com/apache/lucene-solr/tree/trunk/lucene/analysis/common/src/resources/org/apache/lucene/analysis/[here]
-and
-https://github.com/apache/lucene-solr/tree/trunk/lucene/analysis/common/src/resources/org/apache/lucene/analysis/snowball[here.]
-
-A sample configuration (in YAML format) specifying Swedish with
-stopwords:
-
-[source,js]
---------------------------------------------------
-index :
-    analysis :
-        analyzer :
-           my_analyzer:
-                type: snowball
-                language: Swedish
-                stopwords: "och,det,att,i,en,jag,hon,som,han,på,den,med,var,sig,för,så,till,är,men,ett,om,hade,de,av,icke,mig,du,henne,då,sin,nu,har,inte,hans,honom,skulle,hennes,där,min,man,ej,vid,kunde,något,från,ut,när,efter,upp,vi,dem,vara,vad,över,än,dig,kan,sina,här,ha,mot,alla,under,någon,allt,mycket,sedan,ju,denna,själv,detta,åt,utan,varit,hur,ingen,mitt,ni,bli,blev,oss,din,dessa,några,deras,blir,mina,samma,vilken,er,sådan,vår,blivit,dess,inom,mellan,sådant,varför,varje,vilka,ditt,vem,vilket,sitta,sådana,vart,dina,vars,vårt,våra,ert,era,vilkas"
---------------------------------------------------

docs/reference/analysis/analyzers/standard-analyzer.asciidoc (+103 -22)

@@ -1,26 +1,107 @@
 [[analysis-standard-analyzer]]
 === Standard Analyzer
 
-An analyzer of type `standard` is built using the
-<<analysis-standard-tokenizer,Standard
-Tokenizer>> with the
-<<analysis-standard-tokenfilter,Standard
-Token Filter>>,
-<<analysis-lowercase-tokenfilter,Lower
-Case Token Filter>>, and
-<<analysis-stop-tokenfilter,Stop
-Token Filter>>.
-
-The following are settings that can be set for a `standard` analyzer
-type:
-
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`stopwords` |A list of stopwords to initialize the stop filter with.
-Defaults to an 'empty' stopword list Check
-<<analysis-stop-analyzer,Stop Analyzer>> for more details.
-|`max_token_length` |The maximum token length. If a token is seen that exceeds
-this length then it is split at `max_token_length` intervals. Defaults to `255`.
-|=======================================================================
+The `standard` analyzer is the default analyzer which is used if none is
+specified. It provides grammar based tokenization (based on the Unicode Text
+Segmentation algorithm, as specified in
+http://unicode.org/reports/tr29/[Unicode Standard Annex #29]) and works well
+for most languages.
+
+[float]
+=== Definition
+
+It consists of:
+
+Tokenizer::
+* <<analysis-standard-tokenizer,Standard Tokenizer>>
+
+Token Filters::
+* <<analysis-standard-tokenfilter,Standard Token Filter>>
+* <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
+* <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "analyzer": "standard",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ the, 2, quick, brown, foxes, jumped, over, the, lazy, dog's, bone ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `standard` analyzer accepts the following parameters:
+
+[horizontal]
+`max_token_length`::
+
+    The maximum token length. If a token is seen that exceeds this length then
+    it is split at `max_token_length` intervals. Defaults to `255`.
+
+`stopwords`::
+
+    A pre-defined stop words list like `_english_` or an array  containing a
+    list of stop words.  Defaults to `_none_`.
+
+`stopwords_path`::
+
+    The path to a file containing stop words.
+
+See the <<analysis-stop-tokenfilter,Stop Token Filter>> for more information
+about stop word configuration.
+
+
+[float]
+=== Example configuration
+
+In this example, we configure the `standard` analyzer to have a
+`max_token_length` of 5 (for demonstration purposes), and to use the
+pre-defined list of English stop words:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_english_analyzer": {
+          "type": "standard",
+          "max_token_length": 5,
+          "stopwords": "_english_"
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_english_analyzer",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+----------------------------
+// CONSOLE
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ 2, quick, brown, foxes, jumpe, d, over, lazy, dog's, bone ]
+---------------------------
 

docs/reference/analysis/analyzers/stop-analyzer.asciidoc (+93 -18)

@@ -1,22 +1,97 @@
 [[analysis-stop-analyzer]]
 === Stop Analyzer
 
-An analyzer of type `stop` that is built using a
-<<analysis-lowercase-tokenizer,Lower
-Case Tokenizer>>, with
-<<analysis-stop-tokenfilter,Stop
-Token Filter>>.
-
-The following are settings that can be set for a `stop` analyzer type:
-
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`stopwords` |A list of stopwords to initialize the stop filter with.
-Defaults to the english stop words.
-|`stopwords_path` |A path (either relative to `config` location, or
-absolute) to a stopwords file configuration.
-|=======================================================================
-
-Use `stopwords: _none_` to explicitly specify an 'empty' stopword list.
+The `stop` analyzer is the same as the <<analysis-simple-analyzer,`simple` analyzer>>
+but adds support for removing stop words.  It defaults to using the
+`_english_` stop words.
+
+[float]
+=== Definition
+
+It consists of:
+
+Tokenizer::
+* <<analysis-lowercase-tokenizer,Lower Case Tokenizer>>
+
+Token filters::
+* <<analysis-stop-tokenfilter,Stop Token Filter>>
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "analyzer": "stop",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ quick, brown, foxes, jumped, over, lazy, dog, s, bone ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `stop` analyzer accepts the following parameters:
+
+[horizontal]
+`stopwords`::
+
+    A pre-defined stop words list like `_english_` or an array  containing a
+    list of stop words.  Defaults to `_english_`.
+
+`stopwords_path`::
+
+    The path to a file containing stop words.
+
+
+See the <<analysis-stop-tokenfilter,Stop Token Filter>> for more information
+about stop word configuration.
+
+[float]
+=== Example configuration
+
+In this example, we configure the `stop` analyzer to use a specified list of
+words as stop words:
+
+[source,js]
+----------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_stop_analyzer": {
+          "type": "stop",
+          "stopwords": ["the", "over"]
+        }
+      }
+    }
+  }
+}
+
+GET _cluster/health?wait_for_status=yellow
+
+POST my_index/_analyze
+{
+  "analyzer": "my_stop_analyzer",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+----------------------------
+// CONSOLE
+
+The above example produces the following terms:
+
+[source,text]
+---------------------------
+[ quick, brown, foxes, jumped, lazy, dog, s, bone ]
+---------------------------
+
 

docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc (+35 -3)

@@ -1,6 +1,38 @@
 [[analysis-whitespace-analyzer]]
 === Whitespace Analyzer
 
-An analyzer of type `whitespace` that is built using a
-<<analysis-whitespace-tokenizer,Whitespace
-Tokenizer>>.
+The `whitespace` analyzer breaks text into terms whenever it encounters a
+whitespace character.
+
+[float]
+=== Definition
+
+It consists of:
+
+Tokenizer::
+* <<analysis-whitespace-tokenizer,Whitespace Tokenizer>>
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "analyzer": "whitespace",
+  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
+}
+---------------------------
+// CONSOLE
+
+The above sentence would produce the following terms:
+
+[source,text]
+---------------------------
+[ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]
+---------------------------
+
+[float]
+=== Configuration
+
+The `whitespace` analyzer is not configurable.

docs/reference/analysis/anatomy.asciidoc (+60 -0)

@@ -0,0 +1,60 @@
+[[analyzer-anatomy]]
+== Anatomy of an analyzer
+
+An _analyzer_  -- whether built-in or custom -- is just a package which
+contains three lower-level building blocks: _character filters_,
+_tokenizers_, and _token filters_.
+
+The built-in <<analysis-analyzers,analyzers>> pre-package these building
+blocks into analyzers suitable for different languages and types of text.
+Elasticsearch also exposes the individual building blocks so that they can be
+combined to define new <<analysis-custom-analyzer,`custom`>> analyzers.
+
+[float]
+=== Character filters
+
+A _character filter_ receives the original text as a stream of characters and
+can transform the stream by adding, removing, or changing characters.  For
+instance, a character filter could be used to convert Arabic numerals
+(٠‎١٢٣٤٥٦٧٨‎٩‎) into their Latin equivalents (0123456789), or to strip HTML
+elements like `<b>` from the stream.
+
+An analyzer may have *zero or more* <<analysis-charfilters,character filters>>,
+which are applied in order.
+
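+The effect of a character filter can be seen in isolation with the
+<<indices-analyze,`analyze` API>>. This is only a sketch, and assumes that
+the API accepts a `char_filter` list in the same way that it accepts a
+`filter` list:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "keyword",
+  "char_filter": [ "html_strip" ],
+  "text": "<b>Quick</b> brown fox!"
+}
+---------------------------
+// CONSOLE
+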
+[float]
+=== Tokenizer
+
+A _tokenizer_  receives a stream of characters, breaks it up into individual
+_tokens_ (usually individual words), and outputs a stream of _tokens_. For
+instance, a <<analysis-whitespace-tokenizer,`whitespace`>> tokenizer breaks
+text into tokens whenever it sees any whitespace.  It would convert the text
+`"Quick brown fox!"` into the terms `[Quick, brown, fox!]`.
+
+The tokenizer is also responsible for recording the order or _position_ of
+each term and the start and end _character offsets_ of the original word which
+the term represents.
+
+An analyzer must have *exactly one* <<analysis-tokenizers,tokenizer>>.
+
+
+[float]
+=== Token filters
+
+A _token filter_ receives the token stream and may add, remove, or change
+tokens.  For example, a <<analysis-lowercase-tokenfilter,`lowercase`>> token
+filter converts all tokens to lowercase, a
+<<analysis-stop-tokenfilter,`stop`>> token filter removes common words
+(_stop words_) like `the` from the token stream, and a
+<<analysis-synonym-tokenfilter,`synonym`>> token filter introduces synonyms
+into the token stream.
+
+Token filters are not allowed to change the position or character offsets of
+each token.
+
+An analyzer may have *zero or more* <<analysis-tokenfilters,token filters>>,
+which are applied in order.
+
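+For example, the combined effect of the `standard` tokenizer with the
+`lowercase` and `stop` token filters can be seen with the
+<<indices-analyze,`analyze` API>>; a minimal sketch:
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": "standard",
+  "filter": [ "lowercase", "stop" ],
+  "text": "The QUICK brown foxes!"
+}
+---------------------------
+// CONSOLE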
+
+
+

docs/reference/analysis/testing.asciidoc (+91 -0)

@@ -0,0 +1,91 @@
+== Testing analyzers
+
+The <<indices-analyze,`analyze` API>> is an invaluable tool for viewing the
+terms produced by an analyzer. A built-in analyzer (or combination of built-in
+tokenizer, token filters, and character filters) can be specified inline in
+the request:
+
+[source,js]
+-------------------------------------
+POST _analyze
+{
+  "analyzer": "whitespace",
+  "text":     "The quick brown fox."
+}
+
+POST _analyze
+{
+  "tokenizer": "standard",
+  "filter":  [ "lowercase", "asciifolding" ],
+  "text":      "Is this déja vu?"
+}
+-------------------------------------
+// CONSOLE
+
+
+
+.Positions and character offsets
+*********************************************************
+
+As can be seen from the output of the `analyze` API, analyzers not only
+convert words into terms, they also record the order or relative _positions_
+of each term (used for phrase queries or word proximity queries), and the
+start and end _character offsets_ of each term in the original text (used for
+highlighting search snippets).
+
+*********************************************************
+
+
+Alternatively, a <<analysis-custom-analyzer,`custom` analyzer>> can be
+referred to when running the `analyze` API on a specific index:
+
+[source,js]
+-------------------------------------
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "std_folded": { <1>
+          "type": "custom",
+          "tokenizer": "standard",
+          "filter": [
+            "lowercase",
+            "asciifolding"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "my_text": {
+          "type": "text",
+          "analyzer": "std_folded" <2>
+        }
+      }
+    }
+  }
+}
+
+GET my_index/_analyze <3>
+{
+  "analyzer": "std_folded", <4>
+  "text":     "Is this déjà vu?"
+}
+
+GET my_index/_analyze <3>
+{
+  "field": "my_text", <5>
+  "text":  "Is this déjà vu?"
+}
+-------------------------------------
+// CONSOLE
+
+<1> Define a `custom` analyzer called `std_folded`.
+<2> The field `my_text` uses the `std_folded` analyzer.
+<3> To refer to this analyzer, the `analyze` API must specify the index name.
+<4> Refer to the analyzer by name.
+<5> Refer to the analyzer used by field `my_text`.
+

docs/reference/mapping/params/position-increment-gap.asciidoc (+2 -2)

@@ -1,10 +1,10 @@
 [[position-increment-gap]]
 === `position_increment_gap`
 
-<<mapping-index,Analyzed>> string fields take term <<index-options,positions>>
+<<mapping-index,Analyzed>> text fields take term <<index-options,positions>>
 into account, in order to be able to support
 <<query-dsl-match-query-phrase,proximity or phrase queries>>.
-When indexing string fields with multiple values a "fake" gap is added between
+When indexing text fields with multiple values a "fake" gap is added between
 the values to prevent most phrase queries from matching across the values. The
 size of this gap is configured using `position_increment_gap` and defaults to
 `100`.