[[analysis-smartcn]]
=== Smart Chinese Analysis Plugin

The Smart Chinese Analysis plugin integrates Lucene's Smart Chinese analysis
module into elasticsearch.

It provides an analyzer for Chinese or mixed Chinese-English text. This
analyzer uses probabilistic knowledge to find the optimal word segmentation
for Simplified Chinese text. The text is first broken into sentences, then
each sentence is segmented into words.

:plugin_name: analysis-smartcn
include::install_remove.asciidoc[]

[[analysis-smartcn-tokenizer]]
[float]
==== `smartcn` tokenizer and token filter

The plugin provides the `smartcn` analyzer, the `smartcn_tokenizer` tokenizer,
and the `smartcn_stop` token filter, none of which are configurable.

NOTE: The `smartcn_word` token filter and `smartcn_sentence` tokenizer have
been deprecated.
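Once the plugin is installed, you can try the analyzer directly with the
analyze API. This is just a quick sketch with sample text of our own, which
should segment the Chinese portion into words such as `我们` rather than into
single characters:

[source,console]
--------------------------------------------------
GET _analyze
{
  "analyzer": "smartcn",
  "text": "我们是 Elastic 的开发公司"
}
--------------------------------------------------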
==== Reimplementing and extending the analyzers

The `smartcn` analyzer could be reimplemented as a `custom` analyzer that can
then be extended and configured as follows:

[source,console]
----------------------------------------------------
PUT smartcn_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_smartcn": {
          "tokenizer": "smartcn_tokenizer",
          "filter": [
            "porter_stem",
            "smartcn_stop"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: smartcn_example, first: smartcn, second: rebuilt_smartcn}\nendyaml\n/]
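To verify the configuration, the rebuilt analyzer can be exercised through the
analyze API against the new index. This is only a sketch with sample text of
our own; the tokens returned should match those of the built-in `smartcn`
analyzer:

[source,console]
--------------------------------------------------
GET smartcn_example/_analyze
{
  "analyzer": "rebuilt_smartcn",
  "text": "我们是 Elastic 的开发公司"
}
--------------------------------------------------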
[[analysis-smartcn_stop]]
==== `smartcn_stop` token filter

The `smartcn_stop` token filter filters out the stopwords defined by the
`smartcn` analyzer (`_smartcn_`), plus any custom stopwords specified by the
user. This filter only supports the predefined `_smartcn_` stopwords list.
If you want to use a different predefined list, use the
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.

[source,console]
--------------------------------------------------
PUT smartcn_example
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "smartcn_with_stop": {
            "tokenizer": "smartcn_tokenizer",
            "filter": [
              "porter_stem",
              "my_smartcn_stop"
            ]
          }
        },
        "filter": {
          "my_smartcn_stop": {
            "type": "smartcn_stop",
            "stopwords": [
              "_smartcn_",
              "stack",
              "的"
            ]
          }
        }
      }
    }
  }
}

GET smartcn_example/_analyze
{
  "analyzer": "smartcn_with_stop",
  "text": "哈喽,我们是 Elastic 我们是 Elastic Stack(Elasticsearch、Kibana、Beats 和 Logstash)的开发公司。从股票行情到 Twitter 消息流,从 Apache 日志到 WordPress 博文,我们可以帮助人们体验搜索的强大力量,帮助他们以截然不同的方式探索和分析数据"
}
--------------------------------------------------
The above request returns:

[source,console-result]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "哈",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "喽",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "我们",
      "start_offset": 3,
      "end_offset": 5,
      "type": "word",
      "position": 3
    },
    {
      "token": "是",
      "start_offset": 5,
      "end_offset": 6,
      "type": "word",
      "position": 4
    },
    {
      "token": "elast",
      "start_offset": 7,
      "end_offset": 14,
      "type": "word",
      "position": 5
    },
    {
      "token": "我们",
      "start_offset": 17,
      "end_offset": 19,
      "type": "word",
      "position": 6
    },
    {
      "token": "是",
      "start_offset": 19,
      "end_offset": 20,
      "type": "word",
      "position": 7
    },
    {
      "token": "elast",
      "start_offset": 21,
      "end_offset": 28,
      "type": "word",
      "position": 8
    },
    {
      "token": "elasticsearch",
      "start_offset": 35,
      "end_offset": 48,
      "type": "word",
      "position": 11
    },
    {
      "token": "kibana",
      "start_offset": 49,
      "end_offset": 55,
      "type": "word",
      "position": 13
    },
    {
      "token": "beat",
      "start_offset": 56,
      "end_offset": 61,
      "type": "word",
      "position": 15
    },
    {
      "token": "和",
      "start_offset": 62,
      "end_offset": 63,
      "type": "word",
      "position": 16
    },
    {
      "token": "logstash",
      "start_offset": 64,
      "end_offset": 72,
      "type": "word",
      "position": 17
    },
    {
      "token": "开发",
      "start_offset": 74,
      "end_offset": 76,
      "type": "word",
      "position": 20
    },
    {
      "token": "公司",
      "start_offset": 76,
      "end_offset": 78,
      "type": "word",
      "position": 21
    },
    {
      "token": "从",
      "start_offset": 79,
      "end_offset": 80,
      "type": "word",
      "position": 23
    },
    {
      "token": "股票",
      "start_offset": 80,
      "end_offset": 82,
      "type": "word",
      "position": 24
    },
    {
      "token": "行情",
      "start_offset": 82,
      "end_offset": 84,
      "type": "word",
      "position": 25
    },
    {
      "token": "到",
      "start_offset": 84,
      "end_offset": 85,
      "type": "word",
      "position": 26
    },
    {
      "token": "twitter",
      "start_offset": 86,
      "end_offset": 93,
      "type": "word",
      "position": 27
    },
    {
      "token": "消息",
      "start_offset": 94,
      "end_offset": 96,
      "type": "word",
      "position": 28
    },
    {
      "token": "流",
      "start_offset": 96,
      "end_offset": 97,
      "type": "word",
      "position": 29
    },
    {
      "token": "从",
      "start_offset": 98,
      "end_offset": 99,
      "type": "word",
      "position": 31
    },
    {
      "token": "apach",
      "start_offset": 100,
      "end_offset": 106,
      "type": "word",
      "position": 32
    },
    {
      "token": "日志",
      "start_offset": 107,
      "end_offset": 109,
      "type": "word",
      "position": 33
    },
    {
      "token": "到",
      "start_offset": 109,
      "end_offset": 110,
      "type": "word",
      "position": 34
    },
    {
      "token": "wordpress",
      "start_offset": 111,
      "end_offset": 120,
      "type": "word",
      "position": 35
    },
    {
      "token": "博",
      "start_offset": 121,
      "end_offset": 122,
      "type": "word",
      "position": 36
    },
    {
      "token": "文",
      "start_offset": 122,
      "end_offset": 123,
      "type": "word",
      "position": 37
    },
    {
      "token": "我们",
      "start_offset": 124,
      "end_offset": 126,
      "type": "word",
      "position": 39
    },
    {
      "token": "可以",
      "start_offset": 126,
      "end_offset": 128,
      "type": "word",
      "position": 40
    },
    {
      "token": "帮助",
      "start_offset": 128,
      "end_offset": 130,
      "type": "word",
      "position": 41
    },
    {
      "token": "人们",
      "start_offset": 130,
      "end_offset": 132,
      "type": "word",
      "position": 42
    },
    {
      "token": "体验",
      "start_offset": 132,
      "end_offset": 134,
      "type": "word",
      "position": 43
    },
    {
      "token": "搜索",
      "start_offset": 134,
      "end_offset": 136,
      "type": "word",
      "position": 44
    },
    {
      "token": "强大",
      "start_offset": 137,
      "end_offset": 139,
      "type": "word",
      "position": 46
    },
    {
      "token": "力量",
      "start_offset": 139,
      "end_offset": 141,
      "type": "word",
      "position": 47
    },
    {
      "token": "帮助",
      "start_offset": 142,
      "end_offset": 144,
      "type": "word",
      "position": 49
    },
    {
      "token": "他们",
      "start_offset": 144,
      "end_offset": 146,
      "type": "word",
      "position": 50
    },
    {
      "token": "以",
      "start_offset": 146,
      "end_offset": 147,
      "type": "word",
      "position": 51
    },
    {
      "token": "截然不同",
      "start_offset": 147,
      "end_offset": 151,
      "type": "word",
      "position": 52
    },
    {
      "token": "方式",
      "start_offset": 152,
      "end_offset": 154,
      "type": "word",
      "position": 54
    },
    {
      "token": "探索",
      "start_offset": 154,
      "end_offset": 156,
      "type": "word",
      "position": 55
    },
    {
      "token": "和",
      "start_offset": 156,
      "end_offset": 157,
      "type": "word",
      "position": 56
    },
    {
      "token": "分析",
      "start_offset": 157,
      "end_offset": 159,
      "type": "word",
      "position": 57
    },
    {
      "token": "数据",
      "start_offset": 159,
      "end_offset": 161,
      "type": "word",
      "position": 58
    }
  ]
}
--------------------------------------------------
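Note how the custom stopwords `stack` and `的` are absent from the token
stream (visible as gaps in the `position` values), while tokens like `elast`
and `beat` show the `porter_stem` filter at work on the English terms.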