analysis-stempel.asciidoc 2.8 KB

[[analysis-stempel]]
=== Stempel Polish Analysis Plugin

The Stempel Analysis plugin integrates Lucene's Stempel analysis
module for Polish into elasticsearch.

It provides high quality stemming for Polish, based on the
http://www.egothor.org/[Egothor project].

:plugin_name: analysis-stempel
include::install_remove.asciidoc[]
[[analysis-stempel-tokenizer]]
[float]
==== `stempel` tokenizer and token filters

The plugin provides the `polish` analyzer and the `polish_stem` and `polish_stop` token filters,
which are not configurable.

==== Reimplementing and extending the analyzers

The `polish` analyzer could be reimplemented as a `custom` analyzer that can
then be extended and configured differently as follows:
[source,console]
----------------------------------------------------
PUT /stempel_example
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_stempel": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "polish_stop",
            "polish_stem"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: stempel_example, first: polish, second: rebuilt_stempel}\nendyaml\n/]
[[analysis-polish-stop]]
==== `polish_stop` token filter

The `polish_stop` token filter filters out Polish stopwords (`_polish_`), and
any other custom stopwords specified by the user. This filter only supports
the predefined `_polish_` stopwords list. If you want to use a different
predefined list, then use the
{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
[source,console]
--------------------------------------------------
PUT /polish_stop_example
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "analyzer_with_stop": {
            "tokenizer": "standard",
            "filter": [
              "lowercase",
              "polish_stop"
            ]
          }
        },
        "filter": {
          "polish_stop": {
            "type": "polish_stop",
            "stopwords": [
              "_polish_",
              "jeść"
            ]
          }
        }
      }
    }
  }
}

GET polish_stop_example/_analyze
{
  "analyzer": "analyzer_with_stop",
  "text": "Gdzie kucharek sześć, tam nie ma co jeść."
}
--------------------------------------------------
The above request returns:

[source,console-result]
--------------------------------------------------
{
  "tokens" : [
    {
      "token" : "kucharek",
      "start_offset" : 6,
      "end_offset" : 14,
      "type" : "<ALPHANUM>",
      "position" : 1
    },
    {
      "token" : "sześć",
      "start_offset" : 15,
      "end_offset" : 20,
      "type" : "<ALPHANUM>",
      "position" : 2
    }
  ]
}
--------------------------------------------------