// stop-analyzer.asciidoc
  1. [[analysis-stop-analyzer]]
  2. === Stop Analyzer
  3. The `stop` analyzer is the same as the <<analysis-simple-analyzer,`simple` analyzer>>
  4. but adds support for removing stop words. It defaults to using the
  5. `_english_` stop words.
  6. [float]
  7. === Example output
  8. [source,js]
  9. ---------------------------
  10. POST _analyze
  11. {
  12. "analyzer": "stop",
  13. "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
  14. }
  15. ---------------------------
  16. // CONSOLE
  17. /////////////////////
  18. [source,console-result]
  19. ----------------------------
  20. {
  21. "tokens": [
  22. {
  23. "token": "quick",
  24. "start_offset": 6,
  25. "end_offset": 11,
  26. "type": "word",
  27. "position": 1
  28. },
  29. {
  30. "token": "brown",
  31. "start_offset": 12,
  32. "end_offset": 17,
  33. "type": "word",
  34. "position": 2
  35. },
  36. {
  37. "token": "foxes",
  38. "start_offset": 18,
  39. "end_offset": 23,
  40. "type": "word",
  41. "position": 3
  42. },
  43. {
  44. "token": "jumped",
  45. "start_offset": 24,
  46. "end_offset": 30,
  47. "type": "word",
  48. "position": 4
  49. },
  50. {
  51. "token": "over",
  52. "start_offset": 31,
  53. "end_offset": 35,
  54. "type": "word",
  55. "position": 5
  56. },
  57. {
  58. "token": "lazy",
  59. "start_offset": 40,
  60. "end_offset": 44,
  61. "type": "word",
  62. "position": 7
  63. },
  64. {
  65. "token": "dog",
  66. "start_offset": 45,
  67. "end_offset": 48,
  68. "type": "word",
  69. "position": 8
  70. },
  71. {
  72. "token": "s",
  73. "start_offset": 49,
  74. "end_offset": 50,
  75. "type": "word",
  76. "position": 9
  77. },
  78. {
  79. "token": "bone",
  80. "start_offset": 51,
  81. "end_offset": 55,
  82. "type": "word",
  83. "position": 10
  84. }
  85. ]
  86. }
  87. ----------------------------
  88. /////////////////////
  89. The above sentence would produce the following terms:
  90. [source,text]
  91. ---------------------------
  92. [ quick, brown, foxes, jumped, over, lazy, dog, s, bone ]
  93. ---------------------------
  94. [float]
  95. === Configuration
  96. The `stop` analyzer accepts the following parameters:
  97. [horizontal]
  98. `stopwords`::
  99. A pre-defined stop words list like `_english_` or an array containing a
  100. list of stop words. Defaults to `_english_`.
  101. `stopwords_path`::
  102. The path to a file containing stop words. This path is relative to the
  103. Elasticsearch `config` directory.
  104. See the <<analysis-stop-tokenfilter,Stop Token Filter>> for more information
  105. about stop word configuration.
  106. [float]
  107. === Example configuration
  108. In this example, we configure the `stop` analyzer to use a specified list of
  109. words as stop words:
  110. [source,js]
  111. ----------------------------
  112. PUT my_index
  113. {
  114. "settings": {
  115. "analysis": {
  116. "analyzer": {
  117. "my_stop_analyzer": {
  118. "type": "stop",
  119. "stopwords": ["the", "over"]
  120. }
  121. }
  122. }
  123. }
  124. }
  125. POST my_index/_analyze
  126. {
  127. "analyzer": "my_stop_analyzer",
  128. "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
  129. }
  130. ----------------------------
  131. // CONSOLE
  132. /////////////////////
  133. [source,console-result]
  134. ----------------------------
  135. {
  136. "tokens": [
  137. {
  138. "token": "quick",
  139. "start_offset": 6,
  140. "end_offset": 11,
  141. "type": "word",
  142. "position": 1
  143. },
  144. {
  145. "token": "brown",
  146. "start_offset": 12,
  147. "end_offset": 17,
  148. "type": "word",
  149. "position": 2
  150. },
  151. {
  152. "token": "foxes",
  153. "start_offset": 18,
  154. "end_offset": 23,
  155. "type": "word",
  156. "position": 3
  157. },
  158. {
  159. "token": "jumped",
  160. "start_offset": 24,
  161. "end_offset": 30,
  162. "type": "word",
  163. "position": 4
  164. },
  165. {
  166. "token": "lazy",
  167. "start_offset": 40,
  168. "end_offset": 44,
  169. "type": "word",
  170. "position": 7
  171. },
  172. {
  173. "token": "dog",
  174. "start_offset": 45,
  175. "end_offset": 48,
  176. "type": "word",
  177. "position": 8
  178. },
  179. {
  180. "token": "s",
  181. "start_offset": 49,
  182. "end_offset": 50,
  183. "type": "word",
  184. "position": 9
  185. },
  186. {
  187. "token": "bone",
  188. "start_offset": 51,
  189. "end_offset": 55,
  190. "type": "word",
  191. "position": 10
  192. }
  193. ]
  194. }
  195. ----------------------------
  196. /////////////////////
  197. The above example produces the following terms:
  198. [source,text]
  199. ---------------------------
  200. [ quick, brown, foxes, jumped, lazy, dog, s, bone ]
  201. ---------------------------
  202. [float]
  203. === Definition
  204. The `stop` analyzer consists of:
  205. Tokenizer::
  206. * <<analysis-lowercase-tokenizer,Lower Case Tokenizer>>
  207. Token filters::
  208. * <<analysis-stop-tokenfilter,Stop Token Filter>>
  209. If you need to customize the `stop` analyzer beyond the configuration
  210. parameters then you need to recreate it as a `custom` analyzer and modify
  211. it, usually by adding token filters. This would recreate the built-in
  212. `stop` analyzer and you can use it as a starting point for further
  213. customization:
  214. [source,js]
  215. ----------------------------------------------------
  216. PUT /stop_example
  217. {
  218. "settings": {
  219. "analysis": {
  220. "filter": {
  221. "english_stop": {
  222. "type": "stop",
  223. "stopwords": "_english_" <1>
  224. }
  225. },
  226. "analyzer": {
  227. "rebuilt_stop": {
  228. "tokenizer": "lowercase",
  229. "filter": [
  230. "english_stop" <2>
  231. ]
  232. }
  233. }
  234. }
  235. }
  236. }
  237. ----------------------------------------------------
  238. // CONSOLE
  239. // TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: stop_example, first: stop, second: rebuilt_stop}\nendyaml\n/]
  240. <1> The default stopwords can be overridden with the `stopwords`
  241. or `stopwords_path` parameters.
  242. <2> You'd add any token filters after `english_stop`.