[[analysis-standard-tokenizer]]
=== Standard Tokenizer

The `standard` tokenizer provides grammar based tokenization (based on the
Unicode Text Segmentation algorithm, as specified in
http://unicode.org/reports/tr29/[Unicode Standard Annex #29]) and works well
for most languages.

[float]
=== Example output

[source,console]
---------------------------
POST _analyze
{
  "tokenizer": "standard",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
---------------------------
/////////////////////

[source,console-result]
----------------------------
{
  "tokens": [
    {
      "token": "The",
      "start_offset": 0,
      "end_offset": 3,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "2",
      "start_offset": 4,
      "end_offset": 5,
      "type": "<NUM>",
      "position": 1
    },
    {
      "token": "QUICK",
      "start_offset": 6,
      "end_offset": 11,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "Brown",
      "start_offset": 12,
      "end_offset": 17,
      "type": "<ALPHANUM>",
      "position": 3
    },
    {
      "token": "Foxes",
      "start_offset": 18,
      "end_offset": 23,
      "type": "<ALPHANUM>",
      "position": 4
    },
    {
      "token": "jumped",
      "start_offset": 24,
      "end_offset": 30,
      "type": "<ALPHANUM>",
      "position": 5
    },
    {
      "token": "over",
      "start_offset": 31,
      "end_offset": 35,
      "type": "<ALPHANUM>",
      "position": 6
    },
    {
      "token": "the",
      "start_offset": 36,
      "end_offset": 39,
      "type": "<ALPHANUM>",
      "position": 7
    },
    {
      "token": "lazy",
      "start_offset": 40,
      "end_offset": 44,
      "type": "<ALPHANUM>",
      "position": 8
    },
    {
      "token": "dog's",
      "start_offset": 45,
      "end_offset": 50,
      "type": "<ALPHANUM>",
      "position": 9
    },
    {
      "token": "bone",
      "start_offset": 51,
      "end_offset": 55,
      "type": "<ALPHANUM>",
      "position": 10
    }
  ]
}
----------------------------

/////////////////////
The above sentence would produce the following terms:

[source,text]
---------------------------
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ]
---------------------------

[float]
=== Configuration

The `standard` tokenizer accepts the following parameters:

[horizontal]
`max_token_length`::

    The maximum token length. If a token is seen that exceeds this length then
    it is split at `max_token_length` intervals. Defaults to `255`.

[float]
=== Example configuration

In this example, we configure the `standard` tokenizer to have a
`max_token_length` of 5 (for demonstration purposes):
[source,console]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "standard",
          "max_token_length": 5
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
----------------------------
/////////////////////

[source,console-result]
----------------------------
{
  "tokens": [
    {
      "token": "The",
      "start_offset": 0,
      "end_offset": 3,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "2",
      "start_offset": 4,
      "end_offset": 5,
      "type": "<NUM>",
      "position": 1
    },
    {
      "token": "QUICK",
      "start_offset": 6,
      "end_offset": 11,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "Brown",
      "start_offset": 12,
      "end_offset": 17,
      "type": "<ALPHANUM>",
      "position": 3
    },
    {
      "token": "Foxes",
      "start_offset": 18,
      "end_offset": 23,
      "type": "<ALPHANUM>",
      "position": 4
    },
    {
      "token": "jumpe",
      "start_offset": 24,
      "end_offset": 29,
      "type": "<ALPHANUM>",
      "position": 5
    },
    {
      "token": "d",
      "start_offset": 29,
      "end_offset": 30,
      "type": "<ALPHANUM>",
      "position": 6
    },
    {
      "token": "over",
      "start_offset": 31,
      "end_offset": 35,
      "type": "<ALPHANUM>",
      "position": 7
    },
    {
      "token": "the",
      "start_offset": 36,
      "end_offset": 39,
      "type": "<ALPHANUM>",
      "position": 8
    },
    {
      "token": "lazy",
      "start_offset": 40,
      "end_offset": 44,
      "type": "<ALPHANUM>",
      "position": 9
    },
    {
      "token": "dog's",
      "start_offset": 45,
      "end_offset": 50,
      "type": "<ALPHANUM>",
      "position": 10
    },
    {
      "token": "bone",
      "start_offset": 51,
      "end_offset": 55,
      "type": "<ALPHANUM>",
      "position": 11
    }
  ]
}
----------------------------

/////////////////////
The above example produces the following terms:

[source,text]
---------------------------
[ The, 2, QUICK, Brown, Foxes, jumpe, d, over, the, lazy, dog's, bone ]
---------------------------
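The tokens `jumpe` and `d` above come from the single word `jumped`, which is
longer than the configured `max_token_length` of 5 and is therefore split at
5-character intervals. To make that interval splitting easier to see, here is a
minimal sketch that passes an inline `standard` tokenizer definition to the
`_analyze` API; the input word is just an illustration and is not part of the
example above:

[source,console]
---------------------------
POST _analyze
{
  "tokenizer": {
    "type": "standard",
    "max_token_length": 5
  },
  "text": "internationalization"
}
---------------------------

Assuming the same splitting behaviour as in the `jumped` example, the
20-character word should come back as four 5-character tokens:
`[ inter, natio, naliz, ation ]`.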