[[analysis-custom-analyzer]]
=== Custom Analyzer

When the built-in analyzers do not fulfill your needs, you can create a
`custom` analyzer which uses the appropriate combination of:

* zero or more <<analysis-charfilters, character filters>>
* a <<analysis-tokenizers,tokenizer>>
* zero or more <<analysis-tokenfilters,token filters>>.

[float]
=== Configuration

The `custom` analyzer accepts the following parameters:

[horizontal]
`tokenizer`::

    A built-in or customised <<analysis-tokenizers,tokenizer>>.
    (Required)

`char_filter`::

    An optional array of built-in or customised
    <<analysis-charfilters, character filters>>.

`filter`::

    An optional array of built-in or customised
    <<analysis-tokenfilters, token filters>>.

`position_increment_gap`::

    When indexing an array of text values, Elasticsearch inserts a fake "gap"
    between the last term of one value and the first term of the next value to
    ensure that a phrase query doesn't match two terms from different array
    elements. Defaults to `100`. See <<position-increment-gap>> for more, and
    the sketch after the first example below.

[float]
=== Example configuration

Here is an example that combines the following:

Character Filter::
* <<analysis-htmlstrip-charfilter,HTML Strip Character Filter>>

Tokenizer::
* <<analysis-standard-tokenizer,Standard Tokenizer>>

Token Filters::
* <<analysis-lowercase-tokenfilter,Lowercase Token Filter>>
* <<analysis-asciifolding-tokenfilter,ASCII-Folding Token Filter>>

[source,js]
--------------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type":      "custom",
          "tokenizer": "standard",
          "char_filter": [
            "html_strip"
          ],
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "Is this <b>déjà vu</b>?"
}
--------------------------------
// CONSOLE

/////////////////////

[source,js]
----------------------------
{
  "tokens": [
    {
      "token": "is",
      "start_offset": 0,
      "end_offset": 2,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "this",
      "start_offset": 3,
      "end_offset": 7,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "deja",
      "start_offset": 11,
      "end_offset": 15,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "vu",
      "start_offset": 16,
      "end_offset": 22,
      "type": "<ALPHANUM>",
      "position": 3
    }
  ]
}
----------------------------
// TESTRESPONSE

/////////////////////

The above example produces the following terms:

[source,text]
---------------------------
[ is, this, deja, vu ]
---------------------------
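The `position_increment_gap` parameter described under Configuration is set in
the same way as the analyzer's other options. Purely as an illustrative sketch
(the `my_other_index` and `my_gapless_analyzer` names are hypothetical), the
following analyzer removes the gap entirely, so that a phrase query can match
terms drawn from two adjacent values of an array field:

[source,js]
--------------------------------
PUT my_other_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_gapless_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ],
          "position_increment_gap": 0
        }
      }
    }
  }
}
--------------------------------

With a gap of `0`, the last term of one array value and the first term of the
next are assigned adjacent positions, which is exactly the behaviour the
default of `100` is designed to prevent.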
The first example above used a tokenizer, token filters, and character filters
with their default configurations, but it is possible to create configured
versions of each and to use them in a custom analyzer.

Here is a more complicated example that combines the following:

Character Filter::
* <<analysis-mapping-charfilter,Mapping Character Filter>>, configured to
  replace `:)` with `_happy_` and `:(` with `_sad_`

Tokenizer::
* <<analysis-pattern-tokenizer,Pattern Tokenizer>>, configured to split on
  punctuation characters

Token Filters::
* <<analysis-lowercase-tokenfilter,Lowercase Token Filter>>
* <<analysis-stop-tokenfilter,Stop Token Filter>>, configured to use the
  pre-defined list of English stop words

Here is an example:

[source,js]
--------------------------------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": [
            "emoticons" <1>
          ],
          "tokenizer": "punctuation", <1>
          "filter": [
            "lowercase",
            "english_stop" <1>
          ]
        }
      },
      "tokenizer": {
        "punctuation": { <1>
          "type": "pattern",
          "pattern": "[ .,!?]"
        }
      },
      "char_filter": {
        "emoticons": { <1>
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "filter": {
        "english_stop": { <1>
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text":     "I'm a :) person, and you?"
}
--------------------------------------------------
// CONSOLE

<1> The `emoticons` character filter, `punctuation` tokenizer and
    `english_stop` token filter are custom implementations which are defined
    in the same index settings.

/////////////////////

[source,js]
----------------------------
{
  "tokens": [
    {
      "token": "i'm",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "_happy_",
      "start_offset": 6,
      "end_offset": 8,
      "type": "word",
      "position": 2
    },
    {
      "token": "person",
      "start_offset": 9,
      "end_offset": 15,
      "type": "word",
      "position": 3
    },
    {
      "token": "you",
      "start_offset": 21,
      "end_offset": 24,
      "type": "word",
      "position": 5
    }
  ]
}
----------------------------
// TESTRESPONSE

/////////////////////

The above example produces the following terms:

[source,text]
---------------------------
[ i'm, _happy_, person, you ]
---------------------------
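Defining an analyzer only makes it available; to use `my_custom_analyzer` at
index time it must be referenced from a field mapping. A minimal sketch,
assuming a mapping type named `my_type` and a field named `my_field` (both
hypothetical; on versions of Elasticsearch without mapping types, the type
level is omitted):

[source,js]
--------------------------------
PUT my_index/_mapping/my_type
{
  "properties": {
    "my_field": {
      "type": "text",
      "analyzer": "my_custom_analyzer"
    }
  }
}
--------------------------------

Any text indexed into `my_field` is then passed through the `emoticons`
character filter, the `punctuation` tokenizer, and the `lowercase` and
`english_stop` token filters before the resulting terms are added to the
inverted index.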