[[analysis-uaxurlemail-tokenizer]]
=== UAX URL Email Tokenizer

The `uax_url_email` tokenizer is like the <<analysis-standard-tokenizer,`standard` tokenizer>> except that it
recognises URLs and email addresses as single tokens.
[float]
=== Example output

[source,console]
---------------------------
POST _analyze
{
  "tokenizer": "uax_url_email",
  "text": "Email me at john.smith@global-international.com"
}
---------------------------
/////////////////////

[source,console-result]
----------------------------
{
  "tokens": [
    {
      "token": "Email",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "me",
      "start_offset": 6,
      "end_offset": 8,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "at",
      "start_offset": 9,
      "end_offset": 11,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "john.smith@global-international.com",
      "start_offset": 12,
      "end_offset": 47,
      "type": "<EMAIL>",
      "position": 3
    }
  ]
}
----------------------------

/////////////////////
The above sentence would produce the following terms:

[source,text]
---------------------------
[ Email, me, at, john.smith@global-international.com ]
---------------------------

while the `standard` tokenizer would produce:

[source,text]
---------------------------
[ Email, me, at, john.smith, global, international.com ]
---------------------------
[float]
=== Configuration

The `uax_url_email` tokenizer accepts the following parameters:

[horizontal]
`max_token_length`::

    The maximum token length. If a token is seen that exceeds this length then
    it is split at `max_token_length` intervals. Defaults to `255`.
[float]
=== Example configuration

In this example, we configure the `uax_url_email` tokenizer to have a
`max_token_length` of 5 (for demonstration purposes):

[source,console]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "my_tokenizer"
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "uax_url_email",
          "max_token_length": 5
        }
      }
    }
  }
}

POST my_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": "john.smith@global-international.com"
}
----------------------------
/////////////////////

[source,console-result]
----------------------------
{
  "tokens": [
    {
      "token": "john",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "smith",
      "start_offset": 5,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "globa",
      "start_offset": 11,
      "end_offset": 16,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "l",
      "start_offset": 16,
      "end_offset": 17,
      "type": "<ALPHANUM>",
      "position": 3
    },
    {
      "token": "inter",
      "start_offset": 18,
      "end_offset": 23,
      "type": "<ALPHANUM>",
      "position": 4
    },
    {
      "token": "natio",
      "start_offset": 23,
      "end_offset": 28,
      "type": "<ALPHANUM>",
      "position": 5
    },
    {
      "token": "nal.c",
      "start_offset": 28,
      "end_offset": 33,
      "type": "<ALPHANUM>",
      "position": 6
    },
    {
      "token": "om",
      "start_offset": 33,
      "end_offset": 35,
      "type": "<ALPHANUM>",
      "position": 7
    }
  ]
}
----------------------------

/////////////////////
The above example produces the following terms:

[source,text]
---------------------------
[ john, smith, globa, l, inter, natio, nal.c, om ]
---------------------------