// stop-analyzer.asciidoc
[[analysis-stop-analyzer]]
=== Stop Analyzer

The `stop` analyzer is the same as the <<analysis-simple-analyzer,`simple` analyzer>>
but adds support for removing stop words. It defaults to using the
`_english_` stop words.
[float]
=== Example output

[source,console]
---------------------------
POST _analyze
{
  "analyzer": "stop",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
---------------------------
/////////////////////

[source,console-result]
----------------------------
{
  "tokens": [
    {
      "token": "quick",
      "start_offset": 6,
      "end_offset": 11,
      "type": "word",
      "position": 1
    },
    {
      "token": "brown",
      "start_offset": 12,
      "end_offset": 17,
      "type": "word",
      "position": 2
    },
    {
      "token": "foxes",
      "start_offset": 18,
      "end_offset": 23,
      "type": "word",
      "position": 3
    },
    {
      "token": "jumped",
      "start_offset": 24,
      "end_offset": 30,
      "type": "word",
      "position": 4
    },
    {
      "token": "over",
      "start_offset": 31,
      "end_offset": 35,
      "type": "word",
      "position": 5
    },
    {
      "token": "lazy",
      "start_offset": 40,
      "end_offset": 44,
      "type": "word",
      "position": 7
    },
    {
      "token": "dog",
      "start_offset": 45,
      "end_offset": 48,
      "type": "word",
      "position": 8
    },
    {
      "token": "s",
      "start_offset": 49,
      "end_offset": 50,
      "type": "word",
      "position": 9
    },
    {
      "token": "bone",
      "start_offset": 51,
      "end_offset": 55,
      "type": "word",
      "position": 10
    }
  ]
}
----------------------------
/////////////////////
The above sentence would produce the following terms:

[source,text]
---------------------------
[ quick, brown, foxes, jumped, over, lazy, dog, s, bone ]
---------------------------
[float]
=== Configuration

The `stop` analyzer accepts the following parameters:

[horizontal]
`stopwords`::

    A pre-defined stop words list like `_english_` or an array containing a
    list of stop words. Defaults to `_english_`.

`stopwords_path`::

    The path to a file containing stop words. This path is relative to the
    Elasticsearch `config` directory.

See the <<analysis-stop-tokenfilter,Stop Token Filter>> for more information
about stop word configuration.
[float]
=== Example configuration

In this example, we configure the `stop` analyzer to use a specified list of
words as stop words:

[source,console]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_stop_analyzer": {
          "type": "stop",
          "stopwords": ["the", "over"]
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_stop_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
----------------------------
/////////////////////

[source,console-result]
----------------------------
{
  "tokens": [
    {
      "token": "quick",
      "start_offset": 6,
      "end_offset": 11,
      "type": "word",
      "position": 1
    },
    {
      "token": "brown",
      "start_offset": 12,
      "end_offset": 17,
      "type": "word",
      "position": 2
    },
    {
      "token": "foxes",
      "start_offset": 18,
      "end_offset": 23,
      "type": "word",
      "position": 3
    },
    {
      "token": "jumped",
      "start_offset": 24,
      "end_offset": 30,
      "type": "word",
      "position": 4
    },
    {
      "token": "lazy",
      "start_offset": 40,
      "end_offset": 44,
      "type": "word",
      "position": 7
    },
    {
      "token": "dog",
      "start_offset": 45,
      "end_offset": 48,
      "type": "word",
      "position": 8
    },
    {
      "token": "s",
      "start_offset": 49,
      "end_offset": 50,
      "type": "word",
      "position": 9
    },
    {
      "token": "bone",
      "start_offset": 51,
      "end_offset": 55,
      "type": "word",
      "position": 10
    }
  ]
}
----------------------------
/////////////////////
The above example produces the following terms:

[source,text]
---------------------------
[ quick, brown, foxes, jumped, lazy, dog, s, bone ]
---------------------------
[float]
=== Definition

It consists of:

Tokenizer::
* <<analysis-lowercase-tokenizer,Lower Case Tokenizer>>

Token filters::
* <<analysis-stop-tokenfilter,Stop Token Filter>>

If you need to customize the `stop` analyzer beyond the configuration
parameters then you need to recreate it as a `custom` analyzer and modify
it, usually by adding token filters. This would recreate the built-in
`stop` analyzer and you can use it as a starting point for further
customization:

[source,console]
----------------------------------------------------
PUT /stop_example
{
  "settings": {
    "analysis": {
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_" <1>
        }
      },
      "analyzer": {
        "rebuilt_stop": {
          "tokenizer": "lowercase",
          "filter": [
            "english_stop" <2>
          ]
        }
      }
    }
  }
}
----------------------------------------------------
// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: stop_example, first: stop, second: rebuilt_stop}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> You'd add any token filters after `english_stop`.