|
@@ -4,8 +4,145 @@
|
|
|
<titleabbrev>Truncate</titleabbrev>
|
|
|
++++
|
|
|
|
|
|
-The `truncate` token filter can be used to truncate tokens into a
|
|
|
-specific length.
|
|
|
+Truncates tokens that exceed a specified character limit. This limit defaults to
|
|
|
+`10` but can be customized using the `length` parameter.
|
|
|
|
|
|
-It accepts a `length` parameter which control the number of characters
|
|
|
-to truncate to, defaults to `10`.
|
|
|
+For example, you can use the `truncate` filter to shorten all tokens to
|
|
|
+`3` characters or fewer, changing `jumping fox` to `jum fox`.
|
|
|
+
|
|
|
+This filter uses Lucene's
|
|
|
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.html[TruncateTokenFilter].
|
|
|
+
|
|
|
+[[analysis-truncate-tokenfilter-analyze-ex]]
|
|
|
+==== Example
|
|
|
+
|
|
|
+The following <<indices-analyze,analyze API>> request uses the `truncate` filter
|
|
|
+to shorten tokens that exceed 10 characters in
|
|
|
+`the quinquennial extravaganza carried on`:
|
|
|
+
|
|
|
+[source,console]
|
|
|
+--------------------------------------------------
|
|
|
+GET _analyze
|
|
|
+{
|
|
|
+ "tokenizer" : "whitespace",
|
|
|
+ "filter" : ["truncate"],
|
|
|
+ "text" : "the quinquennial extravaganza carried on"
|
|
|
+}
|
|
|
+--------------------------------------------------
|
|
|
+
|
|
|
+The filter produces the following tokens:
|
|
|
+
|
|
|
+[source,text]
|
|
|
+--------------------------------------------------
|
|
|
+[ the, quinquenni, extravagan, carried, on ]
|
|
|
+--------------------------------------------------
|
|
|
+
|
|
|
+/////////////////////
|
|
|
+[source,console-result]
|
|
|
+--------------------------------------------------
|
|
|
+{
|
|
|
+ "tokens" : [
|
|
|
+ {
|
|
|
+ "token" : "the",
|
|
|
+ "start_offset" : 0,
|
|
|
+ "end_offset" : 3,
|
|
|
+ "type" : "word",
|
|
|
+ "position" : 0
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token" : "quinquenni",
|
|
|
+ "start_offset" : 4,
|
|
|
+ "end_offset" : 16,
|
|
|
+ "type" : "word",
|
|
|
+ "position" : 1
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token" : "extravagan",
|
|
|
+ "start_offset" : 17,
|
|
|
+ "end_offset" : 29,
|
|
|
+ "type" : "word",
|
|
|
+ "position" : 2
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token" : "carried",
|
|
|
+ "start_offset" : 30,
|
|
|
+ "end_offset" : 37,
|
|
|
+ "type" : "word",
|
|
|
+ "position" : 3
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "token" : "on",
|
|
|
+ "start_offset" : 38,
|
|
|
+ "end_offset" : 40,
|
|
|
+ "type" : "word",
|
|
|
+ "position" : 4
|
|
|
+ }
|
|
|
+ ]
|
|
|
+}
|
|
|
+--------------------------------------------------
|
|
|
+/////////////////////
|
|
|
+
|
|
|
+[[analysis-truncate-tokenfilter-analyzer-ex]]
|
|
|
+==== Add to an analyzer
|
|
|
+
|
|
|
+The following <<indices-create-index,create index API>> request uses the
|
|
|
+`truncate` filter to configure a new
|
|
|
+<<analysis-custom-analyzer,custom analyzer>>.
|
|
|
+
|
|
|
+[source,console]
|
|
|
+--------------------------------------------------
|
|
|
+PUT custom_truncate_example
|
|
|
+{
|
|
|
+ "settings" : {
|
|
|
+ "analysis" : {
|
|
|
+ "analyzer" : {
|
|
|
+ "standard_truncate" : {
|
|
|
+ "tokenizer" : "standard",
|
|
|
+ "filter" : ["truncate"]
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+--------------------------------------------------
|
|
|
+
|
|
|
+[[analysis-truncate-tokenfilter-configure-parms]]
|
|
|
+==== Configurable parameters
|
|
|
+
|
|
|
+`length`::
|
|
|
+(Optional, integer)
|
|
|
+Character limit for each token. Tokens exceeding this limit are truncated.
|
|
|
+Defaults to `10`.
|
|
|
+
|
|
|
+[[analysis-truncate-tokenfilter-customize]]
|
|
|
+==== Customize
|
|
|
+
|
|
|
+To customize the `truncate` filter, duplicate it to create the basis
|
|
|
+for a new custom token filter. You can modify the filter using its configurable
|
|
|
+parameters.
|
|
|
+
|
|
|
+For example, the following request creates a custom `truncate` filter,
|
|
|
+`5_char_trunc`, that shortens tokens to a `length` of `5` or fewer characters:
|
|
|
+
|
|
|
+[source,console]
|
|
|
+--------------------------------------------------
|
|
|
+PUT 5_char_words_example
|
|
|
+{
|
|
|
+ "settings": {
|
|
|
+ "analysis": {
|
|
|
+ "analyzer": {
|
|
|
+ "lowercase_5_char": {
|
|
|
+ "tokenizer": "lowercase",
|
|
|
+ "filter": [ "5_char_trunc" ]
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "filter": {
|
|
|
+ "5_char_trunc": {
|
|
|
+ "type": "truncate",
|
|
|
+ "length": 5
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+--------------------------------------------------
|