
[[analysis-letter-tokenizer]]
=== Letter Tokenizer

The `letter` tokenizer breaks text into terms whenever it encounters a
character which is not a letter. It does a reasonable job for most European
languages, but does a terrible job for some Asian languages, where words are
not separated by spaces.
[float]
=== Example output

[source,js]
---------------------------
POST _analyze
{
  "tokenizer": "letter",
  "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
---------------------------
// CONSOLE

/////////////////////
[source,js]
----------------------------
{
  "tokens": [
    {
      "token": "The",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "QUICK",
      "start_offset": 6,
      "end_offset": 11,
      "type": "word",
      "position": 1
    },
    {
      "token": "Brown",
      "start_offset": 12,
      "end_offset": 17,
      "type": "word",
      "position": 2
    },
    {
      "token": "Foxes",
      "start_offset": 18,
      "end_offset": 23,
      "type": "word",
      "position": 3
    },
    {
      "token": "jumped",
      "start_offset": 24,
      "end_offset": 30,
      "type": "word",
      "position": 4
    },
    {
      "token": "over",
      "start_offset": 31,
      "end_offset": 35,
      "type": "word",
      "position": 5
    },
    {
      "token": "the",
      "start_offset": 36,
      "end_offset": 39,
      "type": "word",
      "position": 6
    },
    {
      "token": "lazy",
      "start_offset": 40,
      "end_offset": 44,
      "type": "word",
      "position": 7
    },
    {
      "token": "dog",
      "start_offset": 45,
      "end_offset": 48,
      "type": "word",
      "position": 8
    },
    {
      "token": "s",
      "start_offset": 49,
      "end_offset": 50,
      "type": "word",
      "position": 9
    },
    {
      "token": "bone",
      "start_offset": 51,
      "end_offset": 55,
      "type": "word",
      "position": 10
    }
  ]
}
----------------------------
// TESTRESPONSE

/////////////////////
The above sentence would produce the following terms:

[source,text]
---------------------------
[ The, QUICK, Brown, Foxes, jumped, over, the, lazy, dog, s, bone ]
---------------------------
[float]
=== Configuration

The `letter` tokenizer is not configurable.
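
It can, however, still be used as the tokenizer of a custom analyzer. The
following is a minimal, illustrative sketch: the index name `my_index` and
the analyzer name `my_letter_analyzer` are placeholders chosen for this
example, and pairing the tokenizer with the `lowercase` token filter is just
one possible combination.

[source,js]
----------------------------
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_letter_analyzer": { <1>
          "type": "custom",
          "tokenizer": "letter",
          "filter": [ "lowercase" ]
        }
      }
    }
  }
}
----------------------------
// CONSOLE

<1> Hypothetical analyzer name used only for this sketch; any valid analyzer
name works here.

You could then check its behaviour by calling `POST my_index/_analyze` with
`"analyzer": "my_letter_analyzer"` and the sample text shown above.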