[[analysis-letter-tokenizer]]
=== Letter Tokenizer

The `letter` tokenizer breaks text into terms whenever it encounters a
character which is not a letter. It does a reasonable job for most European
languages, but does a terrible job for some Asian languages, where words are
not separated by spaces.

[float]
=== Example output

[source,console]
---------------------------
POST _analyze
{
  "tokenizer": "letter",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
---------------------------
/////////////////////

[source,console-result]
----------------------------
{
  "tokens": [
    {
      "token": "The",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "QUICK",
      "start_offset": 6,
      "end_offset": 11,
      "type": "word",
      "position": 1
    },
    {
      "token": "Brown",
      "start_offset": 12,
      "end_offset": 17,
      "type": "word",
      "position": 2
    },
    {
      "token": "Foxes",
      "start_offset": 18,
      "end_offset": 23,
      "type": "word",
      "position": 3
    },
    {
      "token": "jumped",
      "start_offset": 24,
      "end_offset": 30,
      "type": "word",
      "position": 4
    },
    {
      "token": "over",
      "start_offset": 31,
      "end_offset": 35,
      "type": "word",
      "position": 5
    },
    {
      "token": "the",
      "start_offset": 36,
      "end_offset": 39,
      "type": "word",
      "position": 6
    },
    {
      "token": "lazy",
      "start_offset": 40,
      "end_offset": 44,
      "type": "word",
      "position": 7
    },
    {
      "token": "dog",
      "start_offset": 45,
      "end_offset": 48,
      "type": "word",
      "position": 8
    },
    {
      "token": "s",
      "start_offset": 49,
      "end_offset": 50,
      "type": "word",
      "position": 9
    },
    {
      "token": "bone",
      "start_offset": 51,
      "end_offset": 55,
      "type": "word",
      "position": 10
    }
  ]
}
----------------------------

/////////////////////
The above sentence would produce the following terms:

[source,text]
---------------------------
[ The, QUICK, Brown, Foxes, jumped, over, the, lazy, dog, s, bone ]
---------------------------

[float]
=== Configuration

The `letter` tokenizer is not configurable.
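
Although it takes no parameters, the `letter` tokenizer can still be referenced
from a custom analyzer in index settings. The request below is a minimal sketch
of that pattern, combining it with the built-in `lowercase` token filter; the
index name `my-index-000001` and analyzer name `my_letter_analyzer` are only
illustrative:

[source,console]
---------------------------
PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_letter_analyzer": {
          "type": "custom",
          "tokenizer": "letter",
          "filter": [ "lowercase" ]
        }
      }
    }
  }
}
---------------------------

With such an analyzer, the example sentence above would be indexed as the same
terms shown earlier, but lowercased by the `lowercase` filter.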