[[analysis-uaxurlemail-tokenizer]]
=== UAX URL Email Tokenizer

The `uax_url_email` tokenizer is like the <<analysis-standard-tokenizer,`standard` tokenizer>> except that it
recognises URLs and email addresses as single tokens.
[float]
=== Example output

[source,js]
---------------------------
POST _analyze
{
  "tokenizer": "uax_url_email",
  "text": "Email me at john.smith@global-international.com"
}
---------------------------
// CONSOLE
/////////////////////

[source,js]
----------------------------
{
  "tokens": [
    {
      "token": "Email",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "me",
      "start_offset": 6,
      "end_offset": 8,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "at",
      "start_offset": 9,
      "end_offset": 11,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "john.smith@global-international.com",
      "start_offset": 12,
      "end_offset": 47,
      "type": "<EMAIL>",
      "position": 3
    }
  ]
}
----------------------------
// TESTRESPONSE

/////////////////////
The above sentence would produce the following terms:

[source,text]
---------------------------
[ Email, me, at, john.smith@global-international.com ]
---------------------------

while the `standard` tokenizer would produce:

[source,text]
---------------------------
[ Email, me, at, john.smith, global, international.com ]
---------------------------
[float]
=== Configuration

The `uax_url_email` tokenizer accepts the following parameters:

[horizontal]
`max_token_length`::

    The maximum token length. If a token is seen that exceeds this length then
    it is split at `max_token_length` intervals. Defaults to `255`.
[float]
=== Example configuration

In this example, we configure the `uax_url_email` tokenizer to have a
`max_token_length` of 5 (for demonstration purposes):

[source,js]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "uax_url_email",
          "max_token_length": 5
        }
      }
    }
  }
}

GET _cluster/health?wait_for_status=yellow

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "john.smith@global-international.com"
}
----------------------------
// CONSOLE
/////////////////////

[source,js]
----------------------------
{
  "tokens": [
    {
      "token": "john",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "smith",
      "start_offset": 5,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "globa",
      "start_offset": 11,
      "end_offset": 16,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "l",
      "start_offset": 16,
      "end_offset": 17,
      "type": "<ALPHANUM>",
      "position": 3
    },
    {
      "token": "inter",
      "start_offset": 18,
      "end_offset": 23,
      "type": "<ALPHANUM>",
      "position": 4
    },
    {
      "token": "natio",
      "start_offset": 23,
      "end_offset": 28,
      "type": "<ALPHANUM>",
      "position": 5
    },
    {
      "token": "nal.c",
      "start_offset": 28,
      "end_offset": 33,
      "type": "<ALPHANUM>",
      "position": 6
    },
    {
      "token": "om",
      "start_offset": 33,
      "end_offset": 35,
      "type": "<ALPHANUM>",
      "position": 7
    }
  ]
}
----------------------------
// TESTRESPONSE

/////////////////////
The above example produces the following terms:

[source,text]
---------------------------
[ john, smith, globa, l, inter, natio, nal.c, om ]
---------------------------