[[analysis-smartcn]]
=== Smart Chinese Analysis Plugin

The Smart Chinese Analysis plugin integrates Lucene's Smart Chinese analysis
module into Elasticsearch.

It provides an analyzer for Chinese or mixed Chinese-English text. This
analyzer uses probabilistic knowledge to find the optimal word segmentation
for Simplified Chinese text. The text is first broken into sentences, then
each sentence is segmented into words.

:plugin_name: analysis-smartcn
include::install_remove.asciidoc[]

[[analysis-smartcn-tokenizer]]
[float]
==== `smartcn` tokenizer and token filter

The plugin provides the `smartcn` analyzer, the `smartcn_tokenizer` tokenizer,
and the `smartcn_stop` token filter. None of them are configurable.

NOTE: The `smartcn_word` token filter and the `smartcn_sentence` tokenizer have
been deprecated.
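
To see how the analyzer segments mixed Chinese-English text, you can pass a
sample string to the analyze API. The request below is only a minimal sketch:
the sample text is arbitrary, and the response is omitted here because the
exact tokens depend on the segmentation model shipped with your version of the
plugin.

[source,console]
--------------------------------------------------
GET _analyze
{
  "analyzer": "smartcn",
  "text": "我们是 Elastic Stack 的开发公司"
}
--------------------------------------------------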

==== Reimplementing and extending the analyzers

The `smartcn` analyzer could be reimplemented as a `custom` analyzer that can
then be extended and configured as follows:

[source,console]
----------------------------------------------------
PUT smartcn_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_smartcn": {
          "tokenizer": "smartcn_tokenizer",
          "filter": [
            "porter_stem",
            "smartcn_stop"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: smartcn_example, first: smartcn, second: rebuilt_smartcn}\nendyaml\n/]
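
Once the index exists, you can exercise the rebuilt analyzer directly. The
request below is only a usage sketch (the sample text is arbitrary and the
response is omitted):

[source,console]
----------------------------------------------------
GET smartcn_example/_analyze
{
  "analyzer": "rebuilt_smartcn",
  "text": "我们是 Elastic Stack 的开发公司"
}
----------------------------------------------------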

[[analysis-smartcn_stop]]
==== `smartcn_stop` token filter

The `smartcn_stop` token filter filters out the stopwords defined by the
`smartcn` analyzer (`_smartcn_`), plus any custom stopwords specified by the
user. This filter only supports the predefined `_smartcn_` stopwords list.
If you want to use a different predefined list, use the
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.

[source,console]
--------------------------------------------------
PUT smartcn_example
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "smartcn_with_stop": {
            "tokenizer": "smartcn_tokenizer",
            "filter": [
              "porter_stem",
              "my_smartcn_stop"
            ]
          }
        },
        "filter": {
          "my_smartcn_stop": {
            "type": "smartcn_stop",
            "stopwords": [
              "_smartcn_",
              "stack",
              "的"
            ]
          }
        }
      }
    }
  }
}

GET smartcn_example/_analyze
{
  "analyzer": "smartcn_with_stop",
  "text": "哈喽,我们是 Elastic 我们是 Elastic Stack(Elasticsearch、Kibana、Beats 和 Logstash)的开发公司。从股票行情到 Twitter 消息流,从 Apache 日志到 WordPress 博文,我们可以帮助人们体验搜索的强大力量,帮助他们以截然不同的方式探索和分析数据"
}
--------------------------------------------------

The above request returns:

[source,console-result]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "哈",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "喽",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "我们",
      "start_offset": 3,
      "end_offset": 5,
      "type": "word",
      "position": 3
    },
    {
      "token": "是",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 4
    },
    {
      "token": "elast",
      "start_offset": 7,
      "end_offset": 14,
      "type": "word",
      "position": 5
    },
    {
      "token": "我们",
      "start_offset": 17,
      "end_offset": 19,
      "type": "word",
      "position": 6
    },
    {
      "token": "是",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 7
    },
    {
      "token": "elast",
      "start_offset": 21,
      "end_offset": 28,
      "type": "word",
      "position": 8
    },
    {
      "token": "elasticsearch",
      "start_offset": 35,
      "end_offset": 48,
      "type": "word",
      "position": 11
    },
    {
      "token": "kibana",
      "start_offset": 49,
      "end_offset": 55,
      "type": "word",
      "position": 13
    },
    {
      "token": "beat",
      "start_offset": 56,
      "end_offset": 61,
      "type": "word",
      "position": 15
    },
    {
      "token": "和",
      "start_offset": 62,
      "end_offset": 63,
      "type": "word",
      "position": 16
    },
    {
      "token": "logstash",
      "start_offset": 64,
      "end_offset": 72,
      "type": "word",
      "position": 17
    },
    {
      "token": "开发",
      "start_offset": 74,
      "end_offset": 76,
      "type": "word",
      "position": 20
    },
    {
      "token": "公司",
      "start_offset": 76,
      "end_offset": 78,
      "type": "word",
      "position": 21
    },
    {
      "token": "从",
      "start_offset": 79,
      "end_offset": 80,
      "type": "word",
      "position": 23
    },
    {
      "token": "股票",
      "start_offset": 80,
      "end_offset": 82,
      "type": "word",
      "position": 24
    },
    {
      "token": "行情",
      "start_offset": 82,
      "end_offset": 84,
      "type": "word",
      "position": 25
    },
    {
      "token": "到",
      "start_offset": 84,
      "end_offset": 85,
      "type": "word",
      "position": 26
    },
    {
      "token": "twitter",
      "start_offset": 86,
      "end_offset": 93,
      "type": "word",
      "position": 27
    },
    {
      "token": "消息",
      "start_offset": 94,
      "end_offset": 96,
      "type": "word",
      "position": 28
    },
    {
      "token": "流",
      "start_offset": 96,
      "end_offset": 97,
      "type": "word",
      "position": 29
    },
    {
      "token": "从",
      "start_offset": 98,
      "end_offset": 99,
      "type": "word",
      "position": 31
    },
    {
      "token": "apach",
      "start_offset": 100,
      "end_offset": 106,
      "type": "word",
      "position": 32
    },
    {
      "token": "日志",
      "start_offset": 107,
      "end_offset": 109,
      "type": "word",
      "position": 33
    },
    {
      "token": "到",
      "start_offset": 109,
      "end_offset": 110,
      "type": "word",
      "position": 34
    },
    {
      "token": "wordpress",
      "start_offset": 111,
      "end_offset": 120,
      "type": "word",
      "position": 35
    },
    {
      "token": "博",
      "start_offset": 121,
      "end_offset": 122,
      "type": "word",
      "position": 36
    },
    {
      "token": "文",
      "start_offset": 122,
      "end_offset": 123,
      "type": "word",
      "position": 37
    },
    {
      "token": "我们",
      "start_offset": 124,
      "end_offset": 126,
      "type": "word",
      "position": 39
    },
    {
      "token": "可以",
      "start_offset": 126,
      "end_offset": 128,
      "type": "word",
      "position": 40
    },
    {
      "token": "帮助",
      "start_offset": 128,
      "end_offset": 130,
      "type": "word",
      "position": 41
    },
    {
      "token": "人们",
      "start_offset": 130,
      "end_offset": 132,
      "type": "word",
      "position": 42
    },
    {
      "token": "体验",
      "start_offset": 132,
      "end_offset": 134,
      "type": "word",
      "position": 43
    },
    {
      "token": "搜索",
      "start_offset": 134,
      "end_offset": 136,
      "type": "word",
      "position": 44
    },
    {
      "token": "强大",
      "start_offset": 137,
      "end_offset": 139,
      "type": "word",
      "position": 46
    },
    {
      "token": "力量",
      "start_offset": 139,
      "end_offset": 141,
      "type": "word",
      "position": 47
    },
    {
      "token": "帮助",
      "start_offset": 142,
      "end_offset": 144,
      "type": "word",
      "position": 49
    },
    {
      "token": "他们",
      "start_offset": 144,
      "end_offset": 146,
      "type": "word",
      "position": 50
    },
    {
      "token": "以",
      "start_offset": 146,
      "end_offset": 147,
      "type": "word",
      "position": 51
    },
    {
      "token": "截然不同",
      "start_offset": 147,
      "end_offset": 151,
      "type": "word",
      "position": 52
    },
    {
      "token": "方式",
      "start_offset": 152,
      "end_offset": 154,
      "type": "word",
      "position": 54
    },
    {
      "token": "探索",
      "start_offset": 154,
      "end_offset": 156,
      "type": "word",
      "position": 55
    },
    {
      "token": "和",
      "start_offset": 156,
      "end_offset": 157,
      "type": "word",
      "position": 56
    },
    {
      "token": "分析",
      "start_offset": 157,
      "end_offset": 159,
      "type": "word",
      "position": 57
    },
    {
      "token": "数据",
      "start_offset": 159,
      "end_offset": 161,
      "type": "word",
      "position": 58
    }
  ]
}
--------------------------------------------------