// analysis-stempel.asciidoc
[[analysis-stempel]]
=== Stempel Polish Analysis Plugin

The Stempel Analysis plugin integrates Lucene's Stempel analysis
module for Polish into Elasticsearch. It provides high quality
stemming for Polish, based on the
http://www.egothor.org/[Egothor project].

:plugin_name: analysis-stempel
include::install_remove.asciidoc[]
[[analysis-stempel-tokenizer]]
[float]
==== `stempel` tokenizer and token filters

The plugin provides the `polish` analyzer and the `polish_stem` and
`polish_stop` token filters, which are not configurable.

==== Reimplementing and extending the analyzers

The `polish` analyzer could be reimplemented as a `custom` analyzer that can
then be extended and configured differently as follows:
[source,js]
----------------------------------------------------
PUT /stempel_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_stempel": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "polish_stop",
            "polish_stem"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
// CONSOLE
// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: stempel_example, first: polish, second: rebuilt_stempel}\nendyaml\n/]
[[analysis-polish-stop]]
==== `polish_stop` token filter

The `polish_stop` token filter filters out Polish stopwords (`_polish_`), and
any other custom stopwords specified by the user. This filter only supports
the predefined `_polish_` stopwords list. If you want to use a different
predefined list, then use the
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
[source,js]
--------------------------------------------------
PUT /polish_stop_example
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "analyzer_with_stop": {
            "tokenizer": "standard",
            "filter": [
              "lowercase",
              "polish_stop"
            ]
          }
        },
        "filter": {
          "polish_stop": {
            "type": "polish_stop",
            "stopwords": [
              "_polish_",
              "jeść"
            ]
          }
        }
      }
    }
  }
}

GET polish_stop_example/_analyze
{
  "analyzer": "analyzer_with_stop",
  "text": "Gdzie kucharek sześć, tam nie ma co jeść."
}
--------------------------------------------------
// CONSOLE
The above request returns:

[source,js]
--------------------------------------------------
{
  "tokens" : [
    {
      "token" : "kucharek",
      "start_offset" : 6,
      "end_offset" : 14,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "sześć",
      "start_offset" : 15,
      "end_offset" : 20,
      "type" : "<ALPHANUM>",
      "position" : 2
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE