[[analysis-stempel]]
=== Stempel Polish Analysis Plugin

The Stempel Analysis plugin integrates Lucene's Stempel analysis
module for Polish into Elasticsearch.

:plugin_name: analysis-stempel
include::install_remove.asciidoc[]
[[analysis-stempel-tokenizer]]
[discrete]
==== `stempel` tokenizer and token filters

The plugin provides the `polish` analyzer and the `polish_stem` and `polish_stop` token filters,
which are not configurable.

==== Reimplementing and extending the analyzers

The `polish` analyzer could be reimplemented as a `custom` analyzer that can
then be extended and configured differently as follows:
[source,console]
----------------------------------------------------
PUT /stempel_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_stempel": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "polish_stop",
            "polish_stem"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: stempel_example, first: polish, second: rebuilt_stempel}\nendyaml\n/]
[[analysis-polish-stop]]
==== `polish_stop` token filter

The `polish_stop` token filter filters out Polish stopwords (`_polish_`), and
any other custom stopwords specified by the user. This filter only supports
the predefined `_polish_` stopwords list. If you want to use a different
predefined list, then use the
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
[source,console]
--------------------------------------------------
PUT /polish_stop_example
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "analyzer_with_stop": {
            "tokenizer": "standard",
            "filter": [
              "lowercase",
              "polish_stop"
            ]
          }
        },
        "filter": {
          "polish_stop": {
            "type": "polish_stop",
            "stopwords": [
              "_polish_",
              "jeść"
            ]
          }
        }
      }
    }
  }
}

GET polish_stop_example/_analyze
{
  "analyzer": "analyzer_with_stop",
  "text": "Gdzie kucharek sześć, tam nie ma co jeść."
}
--------------------------------------------------
The above request returns:

[source,console-result]
--------------------------------------------------
{
  "tokens" : [
    {
      "token" : "kucharek",
      "start_offset" : 6,
      "end_offset" : 14,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "sześć",
      "start_offset" : 15,
      "end_offset" : 20,
      "type" : "<ALPHANUM>",
      "position" : 2
    }
  ]
}
--------------------------------------------------