|
@@ -0,0 +1,42 @@
|
|
|
+[[analysis-cjk-bigram-tokenfilter]]
|
|
|
+=== CJK Bigram Token Filter
|
|
|
+
|
|
|
+The `cjk_bigram` token filter forms bigrams out of the CJK
|
|
|
+terms that are generated by the <<analysis-standard-tokenizer,`standard` tokenizer>>
|
|
|
+or the `icu_tokenizer` (see <<icu-analysis-plugin>>).
|
|
|
+
|
|
|
+By default, when a CJK character has no adjacent characters to form a bigram,
|
|
|
+it is output in unigram form. If you always want to output both unigrams and
|
|
|
+bigrams, set the `output_unigrams` flag to `true`. This can be used for a
|
|
|
+combined unigram+bigram approach.
|
|
|
+
|
|
|
+Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
|
|
|
+`hangul`, but bigrams can be disabled for particular scripts with the
|
|
|
+`ignore_scripts` parameter. All non-CJK input is passed through unmodified.
|
|
|
+
|
|
|
+[source,js]
|
|
|
+--------------------------------------------------
|
|
|
+{
|
|
|
+ "index" : {
|
|
|
+ "analysis" : {
|
|
|
+ "analyzer" : {
|
|
|
+ "han_bigrams" : {
|
|
|
+ "tokenizer" : "standard",
|
|
|
+ "filter" : ["han_bigrams_filter"]
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "filter" : {
|
|
|
+ "han_bigrams_filter" : {
|
|
|
+ "type" : "cjk_bigram",
|
|
|
+ "ignore_scripts": [
|
|
|
+ "hiragana",
|
|
|
+ "katakana",
|
|
|
+ "hangul"
|
|
|
+ ],
|
|
|
+ "output_unigrams" : true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+--------------------------------------------------
|