whitespace-analyzer.asciidoc 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. [[analysis-whitespace-analyzer]]
  2. === Whitespace Analyzer
  3. The `whitespace` analyzer breaks text into terms whenever it encounters a
  4. whitespace character.
  5. [float]
  6. === Definition
  7. The `whitespace` analyzer consists of:
  8. Tokenizer::
  9. * <<analysis-whitespace-tokenizer,Whitespace Tokenizer>>
  10. [float]
  11. === Example output
  12. [source,js]
  13. ---------------------------
  14. POST _analyze
  15. {
  16. "analyzer": "whitespace",
  17. "text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
  18. }
  19. ---------------------------
  20. // CONSOLE
  21. /////////////////////
  22. [source,js]
  23. ----------------------------
  24. {
  25. "tokens": [
  26. {
  27. "token": "The",
  28. "start_offset": 0,
  29. "end_offset": 3,
  30. "type": "word",
  31. "position": 0
  32. },
  33. {
  34. "token": "2",
  35. "start_offset": 4,
  36. "end_offset": 5,
  37. "type": "word",
  38. "position": 1
  39. },
  40. {
  41. "token": "QUICK",
  42. "start_offset": 6,
  43. "end_offset": 11,
  44. "type": "word",
  45. "position": 2
  46. },
  47. {
  48. "token": "Brown-Foxes",
  49. "start_offset": 12,
  50. "end_offset": 23,
  51. "type": "word",
  52. "position": 3
  53. },
  54. {
  55. "token": "jumped",
  56. "start_offset": 24,
  57. "end_offset": 30,
  58. "type": "word",
  59. "position": 4
  60. },
  61. {
  62. "token": "over",
  63. "start_offset": 31,
  64. "end_offset": 35,
  65. "type": "word",
  66. "position": 5
  67. },
  68. {
  69. "token": "the",
  70. "start_offset": 36,
  71. "end_offset": 39,
  72. "type": "word",
  73. "position": 6
  74. },
  75. {
  76. "token": "lazy",
  77. "start_offset": 40,
  78. "end_offset": 44,
  79. "type": "word",
  80. "position": 7
  81. },
  82. {
  83. "token": "dog's",
  84. "start_offset": 45,
  85. "end_offset": 50,
  86. "type": "word",
  87. "position": 8
  88. },
  89. {
  90. "token": "bone.",
  91. "start_offset": 51,
  92. "end_offset": 56,
  93. "type": "word",
  94. "position": 9
  95. }
  96. ]
  97. }
  98. ----------------------------
  99. // TESTRESPONSE
  100. /////////////////////
  101. The above sentence would produce the following terms (case and punctuation are preserved, because the text is split on whitespace only):
  102. [source,text]
  103. ---------------------------
  104. [ The, 2, QUICK, Brown-Foxes, jumped, over, the, lazy, dog's, bone. ]
  105. ---------------------------
  106. [float]
  107. === Configuration
  108. The `whitespace` analyzer is not configurable.