
[DOCS] Reformat keep types and keep words token filter docs (#49604)

* Adds title abbreviations
* Updates the descriptions and adds Lucene links
* Reformats parameter definitions
* Adds analyze and custom analyzer snippets
* Adds explanations of token types to keep types token filter and tokenizer docs
James Rodewig, 5 years ago
commit 6ea54eecf0

docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc (+145 -80)

@@ -1,137 +1,202 @@
 [[analysis-keep-types-tokenfilter]]
-=== Keep Types Token Filter
+=== Keep types token filter
+++++
+<titleabbrev>Keep types</titleabbrev>
+++++
 
-A token filter of type `keep_types` that only keeps tokens with a token type
-contained in a predefined set.
+Keeps or removes tokens of a specific type. For example, you can use this filter
+to change `3 quick foxes` to `quick foxes` by keeping only `<ALPHANUM>`
+(alphanumeric) tokens.
 
+[NOTE]
+.Token types
+====
+Token types are set by the <<analysis-tokenizers,tokenizer>> when converting
+characters to tokens. Token types can vary between tokenizers.
 
-[float]
-=== Options
-[horizontal]
-types:: a list of types to include (default mode) or exclude
-mode:: if set to `include` (default) the specified token types will be kept, 
-if set to `exclude` the specified token types will be removed from the stream
+For example, the <<analysis-standard-tokenizer,`standard`>> tokenizer can
+produce a variety of token types, including `<ALPHANUM>`, `<HANGUL>`, and
+`<NUM>`. Simpler tokenizers, like the
+<<analysis-lowercase-tokenizer,`lowercase`>> tokenizer, only produce the `word`
+token type.
 
-[float]
-=== Settings example
+Certain token filters can also add token types. For example, the 
+<<analysis-synonym-tokenfilter,`synonym`>> filter can add the `<SYNONYM>` token
+type.
+====
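
For reference, the token types a tokenizer assigns can be inspected directly
with the <<indices-analyze,analyze API>>. An illustrative request:

[source,console]
--------------------------------------------------
GET _analyze
{
  "tokenizer": "standard",
  "text": "3 quick foxes"
}
--------------------------------------------------

In the response, `3` has the type `<NUM>`, while `quick` and `foxes` have the
type `<ALPHANUM>`.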
 
-You can set it up like:
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/core/TypeTokenFilter.html[TypeTokenFilter].
+
+[[analysis-keep-types-tokenfilter-analyze-include-ex]]
+==== Include example
+
+The following <<indices-analyze,analyze API>> request uses the `keep_types`
+filter to keep only `<NUM>` (numeric) tokens from `1 quick fox 2 lazy dogs`.
 
 [source,console]
 --------------------------------------------------
-PUT /keep_types_example
+GET _analyze
 {
-    "settings" : {
-        "analysis" : {
-            "analyzer" : {
-                "my_analyzer" : {
-                    "tokenizer" : "standard",
-                    "filter" : ["lowercase", "extract_numbers"]
-                }
-            },
-            "filter" : {
-                "extract_numbers" : {
-                    "type" : "keep_types",
-                    "types" : [ "<NUM>" ]
-                }
-            }
-        }
+  "tokenizer": "standard",
+  "filter": [
+    {
+      "type": "keep_types",
+      "types": [ "<NUM>" ]
     }
+  ],
+  "text": "1 quick fox 2 lazy dogs"
 }
 --------------------------------------------------
 
-And test it like:
+The filter produces the following tokens:
 
-[source,console]
+[source,text]
 --------------------------------------------------
-POST /keep_types_example/_analyze
-{
-  "analyzer" : "my_analyzer",
-  "text" : "this is just 1 a test"
-}
+[ 1, 2 ]
 --------------------------------------------------
-// TEST[continued]
-
-The response will be:
 
+/////////////////////
 [source,console-result]
 --------------------------------------------------
 {
   "tokens": [
     {
       "token": "1",
-      "start_offset": 13,
-      "end_offset": 14,
+      "start_offset": 0,
+      "end_offset": 1,
+      "type": "<NUM>",
+      "position": 0
+    },
+    {
+      "token": "2",
+      "start_offset": 12,
+      "end_offset": 13,
       "type": "<NUM>",
       "position": 3
     }
   ]
 }
 --------------------------------------------------
+/////////////////////
 
-Note how only the `<NUM>` token is in the output.
-
-[discrete]
-=== Exclude mode settings example
+[[analysis-keep-types-tokenfilter-analyze-exclude-ex]]
+==== Exclude example
 
-If the `mode` parameter is set to `exclude` like in the following example:
+The following <<indices-analyze,analyze API>> request uses the `keep_types`
+filter to remove `<NUM>` tokens from `1 quick fox 2 lazy dogs`. Note the `mode`
+parameter is set to `exclude`.
 
 [source,console]
 --------------------------------------------------
-PUT /keep_types_exclude_example
+GET _analyze
 {
-    "settings" : {
-        "analysis" : {
-            "analyzer" : {
-                "my_analyzer" : {
-                    "tokenizer" : "standard",
-                    "filter" : ["lowercase", "remove_numbers"]
-                }
-            },
-            "filter" : {
-                "remove_numbers" : {
-                    "type" : "keep_types",
-                    "mode" : "exclude",
-                    "types" : [ "<NUM>" ]
-                }
-            }
-        }
+  "tokenizer": "standard",
+  "filter": [
+    {
+      "type": "keep_types",
+      "types": [ "<NUM>" ],
+      "mode": "exclude"
     }
+  ],
+  "text": "1 quick fox 2 lazy dogs"
 }
 --------------------------------------------------
 
-And we test it like:
+The filter produces the following tokens:
 
-[source,console]
+[source,text]
 --------------------------------------------------
-POST /keep_types_exclude_example/_analyze
-{
-  "analyzer" : "my_analyzer",
-  "text" : "hello 101 world"
-}
+[ quick, fox, lazy, dogs ]
 --------------------------------------------------
-// TEST[continued]
-
-The response will be:
 
+/////////////////////
 [source,console-result]
 --------------------------------------------------
 {
   "tokens": [
     {
-      "token": "hello",
-      "start_offset": 0,
-      "end_offset": 5,
+      "token": "quick",
+      "start_offset": 2,
+      "end_offset": 7,
       "type": "<ALPHANUM>",
-      "position": 0
-    }, 
+      "position": 1
+    },
     {
-      "token": "world",
-      "start_offset": 10,
-      "end_offset": 15,
+      "token": "fox",
+      "start_offset": 8,
+      "end_offset": 11,
       "type": "<ALPHANUM>",
       "position": 2
+    },
+    {
+      "token": "lazy",
+      "start_offset": 14,
+      "end_offset": 18,
+      "type": "<ALPHANUM>",
+      "position": 4
+    },
+    {
+      "token": "dogs",
+      "start_offset": 19,
+      "end_offset": 23,
+      "type": "<ALPHANUM>",
+      "position": 5
     }
   ]
 }
 --------------------------------------------------
+/////////////////////
+
+[[analysis-keep-types-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+`types`::
+(Required, array of strings)
+List of token types to keep or remove.
+
+`mode`::
+(Optional, string) 
+Indicates whether to keep or remove the specified token types.
+Valid values are:
+
+`include`:::
+(Default) Keep only the specified token types.
+
+`exclude`:::
+Remove the specified token types.
+
+[[analysis-keep-types-tokenfilter-customize]]
+==== Customize and add to an analyzer
+
+To customize the `keep_types` filter, duplicate it to create the basis
+for a new custom token filter. You can modify the filter using its configurable
+parameters.
+
+For example, the following <<indices-create-index,create index API>> request
+uses a custom `keep_types` filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>. The custom `keep_types` filter
+keeps only `<ALPHANUM>` (alphanumeric) tokens.
+
+[source,console]
+--------------------------------------------------
+PUT keep_types_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "standard",
+          "filter": [ "extract_alpha" ]
+        }
+      },
+      "filter": {
+        "extract_alpha": {
+          "type": "keep_types",
+          "types": [ "<ALPHANUM>" ]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
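
To test the analyzer, an illustrative follow-up request (assuming the index
above was created) runs text through the <<indices-analyze,analyze API>>:

[source,console]
--------------------------------------------------
GET keep_types_example/_analyze
{
  "analyzer": "my_analyzer",
  "text": "3 quick foxes"
}
--------------------------------------------------

Only the `<ALPHANUM>` tokens `quick` and `foxes` remain in the output.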

docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc (+131 -35)

@@ -1,50 +1,146 @@
 [[analysis-keep-words-tokenfilter]]
-=== Keep Words Token Filter
+=== Keep words token filter
+++++
+<titleabbrev>Keep words</titleabbrev>
+++++
 
-A token filter of type `keep` that only keeps tokens with text contained in a
-predefined set of words. The set of words can be defined in the settings or
-loaded from a text file containing one word per line.
+Keeps only tokens contained in a specified word list.
 
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.html[KeepWordFilter].
 
-[float]
-=== Options
-[horizontal]
-keep_words:: a list of words to keep
-keep_words_path:: a path to a words file
-keep_words_case:: a boolean indicating whether to lower case the words (defaults to `false`)
+[NOTE]
+====
+To remove a list of words from a token stream, use the
+<<analysis-stop-tokenfilter,`stop`>> filter.
+====
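
For contrast, a minimal sketch of the inverse behavior, using the `stop`
filter with the word list from the example below:

[source,console]
--------------------------------------------------
GET _analyze
{
  "tokenizer": "whitespace",
  "filter": [
    {
      "type": "stop",
      "stopwords": [ "dog", "elephant", "fox" ]
    }
  ],
  "text": "the quick fox jumps over the lazy dog"
}
--------------------------------------------------

This removes `fox` and `dog` rather than keeping them, producing
`[ the, quick, jumps, over, the, lazy ]`.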
 
+[[analysis-keep-words-tokenfilter-analyze-ex]]
+==== Example
 
+The following <<indices-analyze,analyze API>> request uses the `keep` filter to
+keep only the `fox` and `dog` tokens from
+`the quick fox jumps over the lazy dog`.
 
-[float]
-=== Settings example
+[source,console]
+--------------------------------------------------
+GET _analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [
+    {
+      "type": "keep",
+      "keep_words": [ "dog", "elephant", "fox" ]
+    }
+  ],
+  "text": "the quick fox jumps over the lazy dog"
+}
+--------------------------------------------------
+
+The filter produces the following tokens:
+
+[source,text]
+--------------------------------------------------
+[ fox, dog ]
+--------------------------------------------------
+
+/////////////////////
+[source,console-result]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "fox",
+      "start_offset": 10,
+      "end_offset": 13,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "dog",
+      "start_offset": 34,
+      "end_offset": 37,
+      "type": "word",
+      "position": 7
+    }
+  ]
+}
+--------------------------------------------------
+/////////////////////
+
+[[analysis-keep-words-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+`keep_words`::
++
+--
+(Required+++*+++, array of strings)
+List of words to keep. Only tokens that match words in this list are included in
+the output.
+
+Either this parameter or `keep_words_path` must be specified.
+--
+
+`keep_words_path`::
++
+--
+(Required+++*+++, string)
+Path to a file that contains a list of words to keep. Only tokens that match
+words in this list are included in the output.
+
+This path must be absolute or relative to the `config` location, and the file
+must be UTF-8 encoded. Each word in the file must be separated by a line break
+(see the file sketch after this parameter list).
+
+Either this parameter or `keep_words` must be specified.
+--
+
+`keep_words_case`::
+(Optional, boolean)
+If `true`, lowercase all keep words. Defaults to `false`.
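
As a sketch of the file format `keep_words_path` expects, a word list file
might contain (hypothetical contents):

[source,text]
--------------------------------------------------
one
two
three
--------------------------------------------------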
+
+[[analysis-keep-words-tokenfilter-customize]]
+==== Customize and add to an analyzer
+
+To customize the `keep` filter, duplicate it to create the basis for a new
+custom token filter. You can modify the filter using its configurable
+parameters.
+
+For example, the following <<indices-create-index,create index API>> request
+uses custom `keep` filters to configure two new
+<<analysis-custom-analyzer,custom analyzers>>:
+
+* `standard_keep_word_array`, which uses a custom `keep` filter with an inline
+  array of keep words
+* `standard_keep_word_file`, which uses a custom `keep` filter with a keep
+  words file
 
 [source,console]
 --------------------------------------------------
-PUT /keep_words_example
+PUT keep_words_example
 {
-    "settings" : {
-        "analysis" : {
-            "analyzer" : {
-                "example_1" : {
-                    "tokenizer" : "standard",
-                    "filter" : ["lowercase", "words_till_three"]
-                },
-                "example_2" : {
-                    "tokenizer" : "standard",
-                    "filter" : ["lowercase", "words_in_file"]
-                }
-            },
-            "filter" : {
-                "words_till_three" : {
-                    "type" : "keep",
-                    "keep_words" : [ "one", "two", "three"]
-                },
-                "words_in_file" : {
-                    "type" : "keep",
-                    "keep_words_path" : "analysis/example_word_list.txt"
-                }
-            }
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "standard_keep_word_array": {
+          "tokenizer": "standard",
+          "filter": [ "keep_word_array" ]
+        },
+        "standard_keep_word_file": {
+          "tokenizer": "standard",
+          "filter": [ "keep_word_file" ]
+        }
+      },
+      "filter": {
+        "keep_word_array": {
+          "type": "keep",
+          "keep_words": [ "one", "two", "three" ]
+        },
+        "keep_word_file": {
+          "type": "keep",
+          "keep_words_path": "analysis/example_word_list.txt"
         }
+      }
     }
+  }
 }
 --------------------------------------------------
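
Assuming the index above was created, either analyzer can be exercised with
the <<indices-analyze,analyze API>>. An illustrative sketch:

[source,console]
--------------------------------------------------
GET keep_words_example/_analyze
{
  "analyzer": "standard_keep_word_array",
  "text": "one two four"
}
--------------------------------------------------

Only `one` and `two` appear in the output; `four` is not in the keep list.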

docs/reference/analysis/tokenizers.asciidoc (+7 -4)

@@ -7,10 +7,13 @@ instance, a <<analysis-whitespace-tokenizer,`whitespace`>> tokenizer breaks
 text into tokens whenever it sees any whitespace.  It would convert the text
 `"Quick brown fox!"` into the terms `[Quick, brown, fox!]`.
 
-The tokenizer is also responsible for recording the order or _position_ of
-each term (used for phrase and word proximity queries) and the start and end
-_character offsets_ of the original word which the term represents (used for
-highlighting search snippets).
+The tokenizer is also responsible for recording the following:
+
+* Order or _position_ of each term (used for phrase and word proximity
+queries).
+* Start and end _character offsets_ of the original word which the term
+represents (used for highlighting search snippets).
+* _Token type_, a classification of each term produced, such as `<ALPHANUM>`,
+`<HANGUL>`, or `<NUM>`. Simpler tokenizers only produce the `word` token type.
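
All three attributes are visible in <<indices-analyze,analyze API>> output.
An illustrative request:

[source,console]
--------------------------------------------------
GET _analyze
{
  "tokenizer": "standard",
  "text": "Quick brown fox!"
}
--------------------------------------------------

Each token in the response includes its `position`, `start_offset`,
`end_offset`, and `type` (`<ALPHANUM>` for all three terms here).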
 
 Elasticsearch has a number of built in tokenizers which can be used to build
 <<analysis-custom-analyzer,custom analyzers>>.