|
@@ -1,16 +1,170 @@
|
|
|
[[analysis-length-tokenfilter]]
|
|
|
-=== Length Token Filter
|
|
|
+=== Length token filter
|
|
|
+++++
|
|
|
+<titleabbrev>Length</titleabbrev>
|
|
|
+++++
|
|
|
|
|
|
-A token filter of type `length` that removes words that are too long or
|
|
|
-too short for the stream.
|
|
|
+Removes tokens shorter or longer than specified character lengths.
|
|
|
+For example, you can use the `length` filter to exclude tokens shorter than 2
|
|
|
+characters and tokens longer than 5 characters.
|
|
|
|
|
|
-The following are settings that can be set for a `length` token filter
|
|
|
-type:
|
|
|
+This filter uses Lucene's
|
|
|
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html[LengthFilter].
|
|
|
|
|
|
-[cols="<,<",options="header",]
|
|
|
-|===========================================================
|
|
|
-|Setting |Description
|
|
|
-|`min` |The minimum number. Defaults to `0`.
|
|
|
-|`max` |The maximum number. Defaults to `Integer.MAX_VALUE`, which is `2^31-1` or 2147483647.
|
|
|
-|===========================================================
|
|
|
+[TIP]
|
|
|
+====
|
|
|
+The `length` filter removes entire tokens. If you'd prefer to shorten tokens to
|
|
|
+a specific length, use the <<analysis-truncate-tokenfilter,`truncate`>> filter.
|
|
|
+====
|
|
|
|
|
|
+[[analysis-length-tokenfilter-analyze-ex]]
|
|
|
+==== Example
|
|
|
+
|
|
|
+The following <<indices-analyze,analyze API>> request uses the `length`
|
|
|
+filter to remove tokens longer than 4 characters:
|
|
|
+
|
|
|
+[source,console]
|
|
|
+--------------------------------------------------
|
|
|
+GET _analyze
|
|
|
+{
|
|
|
+ "tokenizer": "whitespace",
|
|
|
+ "filter": [
|
|
|
+ {
|
|
|
+ "type": "length",
|
|
|
+ "min": 0,
|
|
|
+ "max": 4
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "text": "the quick brown fox jumps over the lazy dog"
|
|
|
+}
|
|
|
+--------------------------------------------------
|
|
|
+
|
|
|
+The filter produces the following tokens:
|
|
|
+
|
|
|
+[source,text]
|
|
|
+--------------------------------------------------
|
|
|
+[ the, fox, over, the, lazy, dog ]
|
|
|
+--------------------------------------------------
|
|
|
+
|
|
|
+/////////////////////
|
|
|
+[source,console-result]
|
|
|
+--------------------------------------------------
|
|
|
+{
|
|
|
+ "tokens": [
|
|
|
+ {
|
|
|
+ "token": "the",
|
|
|
+ "start_offset": 0,
|
|
|
+ "end_offset": 3,
|
|
|
+ "type": "word",
|
|
|
+ "position": 0
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token": "fox",
|
|
|
+ "start_offset": 16,
|
|
|
+ "end_offset": 19,
|
|
|
+ "type": "word",
|
|
|
+ "position": 3
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token": "over",
|
|
|
+ "start_offset": 26,
|
|
|
+ "end_offset": 30,
|
|
|
+ "type": "word",
|
|
|
+ "position": 5
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token": "the",
|
|
|
+ "start_offset": 31,
|
|
|
+ "end_offset": 34,
|
|
|
+ "type": "word",
|
|
|
+ "position": 6
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token": "lazy",
|
|
|
+ "start_offset": 35,
|
|
|
+ "end_offset": 39,
|
|
|
+ "type": "word",
|
|
|
+ "position": 7
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token": "dog",
|
|
|
+ "start_offset": 40,
|
|
|
+ "end_offset": 43,
|
|
|
+ "type": "word",
|
|
|
+ "position": 8
|
|
|
+ }
|
|
|
+ ]
|
|
|
+}
|
|
|
+--------------------------------------------------
|
|
|
+/////////////////////
|
|
|
+
|
|
|
+[[analysis-length-tokenfilter-analyzer-ex]]
|
|
|
+==== Add to an analyzer
|
|
|
+
|
|
|
+The following <<indices-create-index,create index API>> request uses the
|
|
|
+`length` filter to configure a new
|
|
|
+<<analysis-custom-analyzer,custom analyzer>>.
|
|
|
+
|
|
|
+[source,console]
|
|
|
+--------------------------------------------------
|
|
|
+PUT length_example
|
|
|
+{
|
|
|
+ "settings": {
|
|
|
+ "analysis": {
|
|
|
+ "analyzer": {
|
|
|
+ "standard_length": {
|
|
|
+ "tokenizer": "standard",
|
|
|
+ "filter": [ "length" ]
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+--------------------------------------------------
|
|
|
+
|
|
|
+[[analysis-length-tokenfilter-configure-parms]]
|
|
|
+==== Configurable parameters
|
|
|
+
|
|
|
+`min`::
|
|
|
+(Optional, integer)
|
|
|
+Minimum character length of a token. Shorter tokens are excluded from the
|
|
|
+output. Defaults to `0`.
|
|
|
+
|
|
|
+`max`::
|
|
|
+(Optional, integer)
|
|
|
+Maximum character length of a token. Longer tokens are excluded from the output.
|
|
|
+Defaults to `Integer.MAX_VALUE`, which is `2^31-1` or `2147483647`.
|
|
|
+
|
|
|
+[[analysis-length-tokenfilter-customize]]
|
|
|
+==== Customize
|
|
|
+
|
|
|
+To customize the `length` filter, duplicate it to create the basis
|
|
|
+for a new custom token filter. You can modify the filter using its configurable
|
|
|
+parameters.
|
|
|
+
|
|
|
+For example, the following request creates a custom `length` filter that removes
|
|
|
+tokens shorter than 2 characters and tokens longer than 10 characters:
|
|
|
+
|
|
|
+[source,console]
|
|
|
+--------------------------------------------------
|
|
|
+PUT length_custom_example
|
|
|
+{
|
|
|
+ "settings": {
|
|
|
+ "analysis": {
|
|
|
+ "analyzer": {
|
|
|
+ "whitespace_length_2_to_10_char": {
|
|
|
+ "tokenizer": "whitespace",
|
|
|
+ "filter": [ "length_2_to_10_char" ]
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "filter": {
|
|
|
+ "length_2_to_10_char": {
|
|
|
+ "type": "length",
|
|
|
+ "min": 2,
|
|
|
+ "max": 10
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+--------------------------------------------------
|