Browse Source

Expose the Lucene Korean analyzer module in a plugin (#30397)

This change adds a new plugin called `analysis-nori` that exposes
Korean text analysis in es using the new Lucene Korean analyzer module named (`nori`).
The plugin adds:
* a Korean analyzer: `nori`
* a Korean tokenizer: `nori_tokenizer`
* a part of speech stop filter: `nori_part_of_speech`
* a filter that can replace Hanja characters with their Hangul transcription: `nori_readingform`
Jim Ferenczi 7 years ago
parent
commit
891d3bd9c3
23 changed files with 1731 additions and 1 deletions
  1. 3 0
      docs/CHANGELOG.asciidoc
  2. 1 0
      docs/build.gradle
  3. 408 0
      docs/plugins/analysis-nori.asciidoc
  4. 6 0
      docs/plugins/analysis.asciidoc
  5. 2 1
      docs/reference/cat/plugins.asciidoc
  6. 5 0
      docs/src/test/cluster/config/userdict_ko.txt
  7. 0 0
      plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml
  8. 0 0
      plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml
  9. 32 0
      plugins/analysis-nori/build.gradle
  10. 475 0
      plugins/analysis-nori/licenses/lucene-LICENSE.txt
  11. 204 0
      plugins/analysis-nori/licenses/lucene-NOTICE.txt
  12. 1 0
      plugins/analysis-nori/licenses/lucene-analyzers-nori-7.4.0-snapshot-1ed95c097b.jar.sha1
  13. 54 0
      plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriAnalyzerProvider.java
  14. 55 0
      plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java
  15. 37 0
      plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriReadingFormFilterFactory.java
  16. 72 0
      plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java
  17. 57 0
      plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java
  18. 48 0
      plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java
  19. 147 0
      plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java
  20. 39 0
      plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriClientYamlTestSuiteIT.java
  21. 5 0
      plugins/analysis-nori/src/test/resources/org/elasticsearch/index/analysis/user_dict.txt
  22. 48 0
      plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml
  23. 32 0
      plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml

+ 3 - 0
docs/CHANGELOG.asciidoc

@@ -146,6 +146,9 @@ The new <<mapping-ignored-field,`_ignored`>> field allows to know which fields
 got ignored at index time because of the <<ignore-malformed,`ignore_malformed`>>
 option. ({pull}30140[#29658])
 
+A new analysis plugin called `analysis-nori` that exposes the Lucene Korean
+analysis module. ({pull}30397[#30397])
+
 [float]
 === Enhancements
 

+ 1 - 0
docs/build.gradle

@@ -32,6 +32,7 @@ integTestCluster {
   configFile 'analysis/synonym.txt'
   configFile 'analysis/stemmer_override.txt'
   configFile 'userdict_ja.txt'
+  configFile 'userdict_ko.txt'
   configFile 'KeywordTokenizer.rbbi'
   extraConfigFile 'hunspell/en_US/en_US.aff', '../server/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff'
   extraConfigFile 'hunspell/en_US/en_US.dic', '../server/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic'

+ 408 - 0
docs/plugins/analysis-nori.asciidoc

@@ -0,0 +1,408 @@
+[[analysis-nori]]
+=== Korean (nori) Analysis Plugin
+
+The Korean (nori) Analysis plugin integrates the Lucene nori analysis
+module into elasticsearch. It uses the https://bitbucket.org/eunjeon/mecab-ko-dic[mecab-ko-dic dictionary]
+to perform morphological analysis of Korean texts.
+
+:plugin_name: analysis-nori
+include::install_remove.asciidoc[]
+
+[[analysis-nori-analyzer]]
+==== `nori` analyzer
+
+The `nori` analyzer consists of the following tokenizer and token filters:
+
+* <<analysis-nori-tokenizer,`nori_tokenizer`>>
+* <<analysis-nori-speech,`nori_part_of_speech`>> token filter
+* <<analysis-nori-readingform,`nori_readingform`>> token filter
+* {ref}/analysis-lowercase-tokenfilter.html[`lowercase`] token filter
+
+It supports the `decompound_mode` and `user_dictionary` settings from
+<<analysis-nori-tokenizer,`nori_tokenizer`>> and the `stoptags` setting from
+<<analysis-nori-speech,`nori_part_of_speech`>>.
+
+[[analysis-nori-tokenizer]]
+==== `nori_tokenizer`
+
+The `nori_tokenizer` accepts the following settings:
+
+`decompound_mode`::
++
+--
+
+The decompound mode determines how the tokenizer handles compound tokens.
+It can be set to:
+
+`none`::
+
+    No decomposition for compounds. Example output:
+
+    가거도항
+    가곡역
+
+`discard`::
+
+    Decomposes compounds and discards the original form (*default*). Example output:
+
+    가곡역 => 가곡, 역
+
+`mixed`::
+
+    Decomposes compounds and keeps the original form. Example output:
+
+    가곡역 => 가곡역, 가곡, 역
+--
+
+`user_dictionary`::
++
+--
+The Nori tokenizer uses the https://bitbucket.org/eunjeon/mecab-ko-dic[mecab-ko-dic dictionary] by default.
+A `user_dictionary` with custom nouns (`NNG`) may be appended to the default dictionary.
+The dictionary should have the following format:
+
+[source,txt]
+-----------------------
+<token> [<token 1> ... <token n>]
+-----------------------
+
+The first token is mandatory and represents the custom noun that should be added to
+the dictionary. For compound nouns the custom segmentation can be provided
+after the first token (`[<token 1> ... <token n>]`). The segmentation of the
+custom compound nouns is controlled by the `decompound_mode` setting.
+--
+
+As a demonstration of how the user dictionary can be used, save the following
+dictionary to `$ES_HOME/config/userdict_ko.txt`:
+
+[source,txt]
+-----------------------
+c++                 <1>
+C샤프
+세종
+세종시 세종 시        <2>
+-----------------------
+
+
+<1> A simple noun
+<2> A compound noun (`세종시`) followed by its decomposition: `세종` and `시`.
+
+
+Then create an analyzer as follows:
+
+[source,js]
+--------------------------------------------------
+PUT nori_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "tokenizer": {
+          "nori_user_dict": {
+            "type": "nori_tokenizer",
+            "decompound_mode": "mixed",
+            "user_dictionary": "userdict_ko.txt"
+          }
+        },
+        "analyzer": {
+          "my_analyzer": {
+            "type": "custom",
+            "tokenizer": "nori_user_dict"
+          }
+        }
+      }
+    }
+  }
+}
+
+GET nori_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "세종시"  <1>
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> Sejong city
+
+The above `analyze` request returns the following:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens" : [ {
+    "token" : "세종시",
+    "start_offset" : 0,
+    "end_offset" : 3,
+    "type" : "word",
+    "position" : 0,
+    "positionLength" : 2    <1>
+  }, {
+    "token" : "세종",
+    "start_offset" : 0,
+    "end_offset" : 2,
+    "type" : "word",
+    "position" : 0
+  }, {
+    "token" : "시",
+    "start_offset" : 2,
+    "end_offset" : 3,
+    "type" : "word",
+    "position" : 1
+   }]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+<1> This is a compound token that spans two positions (`mixed` mode).
+
+The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters
+to modify the stream.
+You can view all these additional attributes with the following request:
+
+[source,js]
+--------------------------------------------------
+GET _analyze
+{
+  "tokenizer": "nori_tokenizer",
+  "text": "뿌리가 깊은 나무는",   <1>
+  "attributes" : ["posType", "leftPOS", "rightPOS", "morphemes", "reading"],
+  "explain": true
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> A tree with deep roots
+
+Which responds with:
+
+[source,js]
+--------------------------------------------------
+{
+    "detail": {
+        "custom_analyzer": true,
+        "charfilters": [],
+        "tokenizer": {
+            "name": "nori_tokenizer",
+            "tokens": [
+                {
+                    "token": "뿌리",
+                    "start_offset": 0,
+                    "end_offset": 2,
+                    "type": "word",
+                    "position": 0,
+                    "leftPOS": "NNG(General Noun)",
+                    "morphemes": null,
+                    "posType": "MORPHEME",
+                    "reading": null,
+                    "rightPOS": "NNG(General Noun)"
+                },
+                {
+                    "token": "가",
+                    "start_offset": 2,
+                    "end_offset": 3,
+                    "type": "word",
+                    "position": 1,
+                    "leftPOS": "J(Ending Particle)",
+                    "morphemes": null,
+                    "posType": "MORPHEME",
+                    "reading": null,
+                    "rightPOS": "J(Ending Particle)"
+                },
+                {
+                    "token": "깊",
+                    "start_offset": 4,
+                    "end_offset": 5,
+                    "type": "word",
+                    "position": 2,
+                    "leftPOS": "VA(Adjective)",
+                    "morphemes": null,
+                    "posType": "MORPHEME",
+                    "reading": null,
+                    "rightPOS": "VA(Adjective)"
+                },
+                {
+                    "token": "은",
+                    "start_offset": 5,
+                    "end_offset": 6,
+                    "type": "word",
+                    "position": 3,
+                    "leftPOS": "E(Verbal endings)",
+                    "morphemes": null,
+                    "posType": "MORPHEME",
+                    "reading": null,
+                    "rightPOS": "E(Verbal endings)"
+                },
+                {
+                    "token": "나무",
+                    "start_offset": 7,
+                    "end_offset": 9,
+                    "type": "word",
+                    "position": 4,
+                    "leftPOS": "NNG(General Noun)",
+                    "morphemes": null,
+                    "posType": "MORPHEME",
+                    "reading": null,
+                    "rightPOS": "NNG(General Noun)"
+                },
+                {
+                    "token": "는",
+                    "start_offset": 9,
+                    "end_offset": 10,
+                    "type": "word",
+                    "position": 5,
+                    "leftPOS": "J(Ending Particle)",
+                    "morphemes": null,
+                    "posType": "MORPHEME",
+                    "reading": null,
+                    "rightPOS": "J(Ending Particle)"
+                }
+            ]
+        },
+        "tokenfilters": []
+    }
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+[[analysis-nori-speech]]
+==== `nori_part_of_speech` token filter
+
+The `nori_part_of_speech` token filter removes tokens that match a set of
+part-of-speech tags. The list of supported tags and their meanings can be found here:
+{lucene_version_path}/org/apache/lucene/analysis/ko/POS.Tag.html[Part of speech tags]
+
+It accepts the following setting:
+
+`stoptags`::
+
+    An array of part-of-speech tags that should be removed.
+
+and defaults to:
+
+```
+"stoptags": [
+    "E",
+    "IC",
+    "J",
+    "MAG", "MAJ", "MM",
+    "SP", "SSC", "SSO", "SC", "SE",
+    "XPN", "XSA", "XSN", "XSV",
+    "UNA", "NA", "VSV"
+]
+```
+
+For example:
+
+[source,js]
+--------------------------------------------------
+PUT nori_sample
+{
+  "settings": {
+    "index": {
+      "analysis": {
+        "analyzer": {
+          "my_analyzer": {
+            "tokenizer": "nori_tokenizer",
+            "filter": [
+              "my_posfilter"
+            ]
+          }
+        },
+        "filter": {
+          "my_posfilter": {
+            "type": "nori_part_of_speech",
+            "stoptags": [
+              "NR"   <1>
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+
+GET nori_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "여섯 용이"  <2>
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> Korean numerals should be removed (`NR`)
+<2> Six dragons
+
+Which responds with:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens" : [ {
+    "token" : "용",
+    "start_offset" : 3,
+    "end_offset" : 4,
+    "type" : "word",
+    "position" : 1
+  }, {
+    "token" : "이",
+    "start_offset" : 4,
+    "end_offset" : 5,
+    "type" : "word",
+    "position" : 2
+  } ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+[[analysis-nori-readingform]]
+==== `nori_readingform` token filter
+
+The `nori_readingform` token filter rewrites tokens written in Hanja to their Hangul form.
+
+[source,js]
+--------------------------------------------------
+PUT nori_sample
+{
+    "settings": {
+        "index":{
+            "analysis":{
+                "analyzer" : {
+                    "my_analyzer" : {
+                        "tokenizer" : "nori_tokenizer",
+                        "filter" : ["nori_readingform"]
+                    }
+                }
+            }
+        }
+    }
+}
+
+GET nori_sample/_analyze
+{
+  "analyzer": "my_analyzer",
+  "text": "鄕歌" <1>
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> Hyangga
+
+Which responds with:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens" : [ {
+    "token" : "향가", <2>
+    "start_offset" : 0,
+    "end_offset" : 2,
+    "type" : "word",
+    "position" : 0
+  }]
+}
+--------------------------------------------------
+// TESTRESPONSE
+
+<1> A token written in Hanja.
+<2> The Hanja form is replaced by the Hangul transcription.

+ 6 - 0
docs/plugins/analysis.asciidoc

@@ -20,6 +20,10 @@ transliteration.
 
 Advanced analysis of Japanese using the http://www.atilika.org/[Kuromoji analyzer].
 
+<<analysis-nori,Nori>>::
+
+Morphological analysis of Korean using the Lucene Nori analyzer.
+
 <<analysis-phonetic,Phonetic>>::
 
 Analyzes tokens into their phonetic equivalent using Soundex, Metaphone,
@@ -59,6 +63,8 @@ include::analysis-icu.asciidoc[]
 
 include::analysis-kuromoji.asciidoc[]
 
+include::analysis-nori.asciidoc[]
+
 include::analysis-phonetic.asciidoc[]
 
 include::analysis-smartcn.asciidoc[]

+ 2 - 1
docs/reference/cat/plugins.asciidoc

@@ -16,10 +16,11 @@ Might look like:
 name    component               version   description
 U7321H6 analysis-icu            {version} The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components.
 U7321H6 analysis-kuromoji       {version} The Japanese (kuromoji) Analysis plugin integrates Lucene kuromoji analysis module into elasticsearch.
+U7321H6 analysis-nori           {version} The Korean (nori) Analysis plugin integrates Lucene nori analysis module into elasticsearch.
 U7321H6 analysis-phonetic       {version} The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch.
 U7321H6 analysis-smartcn        {version} Smart Chinese Analysis plugin integrates Lucene Smart Chinese analysis module into elasticsearch.
 U7321H6 analysis-stempel        {version} The Stempel (Polish) Analysis plugin integrates Lucene stempel (polish) analysis module into elasticsearch.
-U7321H6 analysis-ukrainian        {version} The Ukrainian Analysis plugin integrates the Lucene UkrainianMorfologikAnalyzer into elasticsearch.
+U7321H6 analysis-ukrainian      {version} The Ukrainian Analysis plugin integrates the Lucene UkrainianMorfologikAnalyzer into elasticsearch.
 U7321H6 discovery-azure-classic {version} The Azure Classic Discovery plugin allows to use Azure Classic API for the unicast discovery mechanism
 U7321H6 discovery-ec2           {version} The EC2 discovery plugin allows to use AWS API for the unicast discovery mechanism.
 U7321H6 discovery-file          {version} Discovery file plugin enables unicast discovery from hosts stored in a file.

+ 5 - 0
docs/src/test/cluster/config/userdict_ko.txt

@@ -0,0 +1,5 @@
+# Additional nouns
+c++
+C샤프
+세종
+세종시 세종 시

+ 0 - 0
plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml → plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml


+ 0 - 0
plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/20_search.yml → plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml


+ 32 - 0
plugins/analysis-nori/build.gradle

@@ -0,0 +1,32 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+esplugin {
+  description 'The Korean (nori) Analysis plugin integrates Lucene nori analysis module into elasticsearch.'
+  classname 'org.elasticsearch.plugin.analysis.nori.AnalysisNoriPlugin'
+}
+
+dependencies {
+  compile "org.apache.lucene:lucene-analyzers-nori:${versions.lucene}"
+}
+
+dependencyLicenses {
+  mapping from: /lucene-.*/, to: 'lucene'
+}
+

+ 475 - 0
plugins/analysis-nori/licenses/lucene-LICENSE.txt

@@ -0,0 +1,475 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+
+Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
+derived from unicode conversion examples available at
+http://www.unicode.org/Public/PROGRAMS/CVTUTF.  Here is the copyright
+from those sources:
+
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ * 
+ * Disclaimer
+ * 
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ * 
+ * Limitations on Rights to Redistribute This Code
+ * 
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+
+Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was
+derived from Python 2.4.2 sources available at
+http://www.python.org. Full license is here:
+
+  http://www.python.org/download/releases/2.4.2/license/
+
+Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
+derived from Python 3.1.2 sources available at
+http://www.python.org. Full license is here:
+
+  http://www.python.org/download/releases/3.1.2/license/
+
+Some code in core/src/java/org/apache/lucene/util/automaton was
+derived from Brics automaton sources available at
+www.brics.dk/automaton/. Here is the copyright from those sources:
+
+/*
+ * Copyright (c) 2001-2009 Anders Moeller
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+ 
+The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton 
+were automatically generated with the moman/finenight FSA package.
+Here is the copyright for those sources:
+
+# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, <jpb@rrette.com>
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
+derived from ICU (http://www.icu-project.org)
+The full license is available here: 
+  http://source.icu-project.org/repos/icu/icu/trunk/license.html
+
+/*
+ * Copyright (C) 1999-2010, International Business Machines
+ * Corporation and others.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy 
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights 
+ * to use, copy, modify, merge, publish, distribute, and/or sell copies of the 
+ * Software, and to permit persons to whom the Software is furnished to do so, 
+ * provided that the above copyright notice(s) and this permission notice appear 
+ * in all copies of the Software and that both the above copyright notice(s) and
+ * this permission notice appear in supporting documentation.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE 
+ * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR 
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 
+ * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall not 
+ * be used in advertising or otherwise to promote the sale, use or other 
+ * dealings in this Software without prior written authorization of the 
+ * copyright holder.
+ */
+ 
+The following license applies to the Snowball stemmers:
+
+Copyright (c) 2001, Dr Martin Porter
+Copyright (c) 2002, Richard Boulton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+    * notice, this list of conditions and the following disclaimer in the
+    * documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holders nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The following license applies to the KStemmer:
+
+Copyright © 2003,
+Center for Intelligent Information Retrieval,
+University of Massachusetts, Amherst.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. The names "Center for Intelligent Information Retrieval" and
+"University of Massachusetts" must not be used to endorse or promote products
+derived from this software without prior written permission. To obtain
+permission, contact info@ciir.cs.umass.edu.
+
+THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+The following license applies to the Morfologik project:
+
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer.
+    
+    * Redistributions in binary form must reproduce the above copyright notice, 
+    this list of conditions and the following disclaimer in the documentation 
+    and/or other materials provided with the distribution.
+    
+    * Neither the name of Morfologik nor the names of its contributors 
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+---
+
+The dictionary comes from Morfologik project. Morfologik uses data from 
+Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and 
+is licenced on the terms of (inter alia) LGPL and Creative Commons 
+ShareAlike. The part-of-speech tags were added in Morfologik project and
+are not found in the data from sjp.pl. The tagset is similar to IPI PAN
+tagset.
+
+---
+
+The following license applies to the Morfeusz project,
+used by org.apache.lucene.analysis.morfologik.
+
+BSD-licensed dictionary of Polish (SGJP)
+http://sgjp.pl/morfeusz/
+
+Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, 
+             Marcin Woliński, Robert Wołosz
+
+All rights reserved.
+
+Redistribution and  use in  source and binary  forms, with  or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
+OR  IMPLIED WARRANTIES,  INCLUDING, BUT  NOT LIMITED  TO,  THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.  IN NO EVENT  SHALL COPYRIGHT  HOLDERS OR  CONTRIBUTORS BE
+LIABLE FOR  ANY DIRECT,  INDIRECT, INCIDENTAL, SPECIAL,  EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES  (INCLUDING, BUT NOT LIMITED  TO, PROCUREMENT OF
+SUBSTITUTE  GOODS OR  SERVICES;  LOSS  OF USE,  DATA,  OR PROFITS;  OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF LIABILITY,
+WHETHER IN  CONTRACT, STRICT LIABILITY, OR  TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+ 204 - 0
plugins/analysis-nori/licenses/lucene-NOTICE.txt

@@ -0,0 +1,204 @@
+Apache Lucene
+Copyright 2001-2018 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+Includes software from other Apache Software Foundation projects,
+including, but not limited to:
+ - Apache Ant
+ - Apache Jakarta Regexp
+ - Apache Commons
+ - Apache Xerces
+
+ICU4J (under analysis/icu) is licensed under an MIT-style license
+and Copyright (c) 1995-2008 International Business Machines Corporation and others
+
+Some data files (under analysis/icu/src/data) are derived from Unicode data such
+as the Unicode Character Database. See http://unicode.org/copyright.html for more
+details.
+
+Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
+BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
+
+The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
+automatically generated with the moman/finenight FSA library, created by
+Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
+see http://sites.google.com/site/rrettesite/moman and
+http://bitbucket.org/jpbarrette/moman/overview/
+
+The class org.apache.lucene.util.WeakIdentityMap was derived from
+the Apache CXF project and is Apache License 2.0.
+
+The Google Code Prettify is Apache License 2.0.
+See http://code.google.com/p/google-code-prettify/
+
+JUnit (junit-4.10) is licensed under the Common Public License v. 1.0
+See http://junit.sourceforge.net/cpl-v10.html
+
+This product includes code (JaspellTernarySearchTrie) from Java Spelling Checking
+Package (jaspell): http://jaspell.sourceforge.net/
+License: The BSD License (http://www.opensource.org/licenses/bsd-license.php)
+
+The snowball stemmers in
+  analysis/common/src/java/net/sf/snowball
+were developed by Martin Porter and Richard Boulton.
+The snowball stopword lists in
+  analysis/common/src/resources/org/apache/lucene/analysis/snowball
+were developed by Martin Porter and Richard Boulton.
+The full snowball package is available from
+  http://snowball.tartarus.org/
+
+The KStem stemmer in
+  analysis/common/src/org/apache/lucene/analysis/en
+was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
+under the BSD-license.
+
+The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
+stopword list that is BSD-licensed created by Jacques Savoy.  These files reside in:
+analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
+analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
+See http://members.unine.ch/jacques.savoy/clef/index.html.
+
+The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
+(common) are based on BSD-licensed reference implementations created by Jacques Savoy and
+Ljiljana Dolamic. These files reside in:
+analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
+analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
+
+The Stempel analyzer (stempel) includes BSD-licensed software developed
+by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
+and Edmond Nolan.
+
+The Polish analyzer (stempel) comes with a default
+stopword list that is BSD-licensed created by the Carrot2 project. The file resides
+in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
+See http://project.carrot2.org/license.html.
+
+The SmartChineseAnalyzer source code (smartcn) was
+provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
+
+WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
+is derived from Unicode data such as the Unicode Character Database.
+See http://unicode.org/copyright.html for more details.
+
+The Morfologik analyzer (morfologik) includes BSD-licensed software
+developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
+
+Morfologik uses data from Polish ispell/myspell dictionary
+(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
+LGPL and Creative Commons ShareAlike.
+
+Morfologik includes data from BSD-licensed dictionary of Polish (SGJP)
+(http://sgjp.pl/morfeusz/)
+
+Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original
+source code for this can be found at http://www.eclipse.org/jetty/downloads.php
+
+===========================================================================
+Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
+===========================================================================
+
+This software includes a binary and/or source version of data from
+
+  mecab-ipadic-2.7.0-20070801
+
+which can be obtained from
+
+  http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
+
+or
+
+  http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
+
+===========================================================================
+mecab-ipadic-2.7.0-20070801 Notice
+===========================================================================
+
+Nara Institute of Science and Technology (NAIST),
+the copyright holders, disclaims all warranties with regard to this
+software, including all implied warranties of merchantability and
+fitness, in no event shall NAIST be liable for
+any special, indirect or consequential damages or any damages
+whatsoever resulting from loss of use, data or profits, whether in an
+action of contract, negligence or other tortuous action, arising out
+of or in connection with the use or performance of this software.
+
+A large portion of the dictionary entries
+originate from ICOT Free Software.  The following conditions for ICOT
+Free Software applies to the current dictionary as well.
+
+Each User may also freely distribute the Program, whether in its
+original form or modified, to any third party or parties, PROVIDED
+that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+on, or be attached to, the Program, which is distributed substantially
+in the same form as set out herein and that such intended
+distribution, if actually made, will neither violate or otherwise
+contravene any of the laws and regulations of the countries having
+jurisdiction over the User or the intended distribution itself.
+
+NO WARRANTY
+
+The program was produced on an experimental basis in the course of the
+research and development conducted during the project and is provided
+to users as so produced on an experimental basis.  Accordingly, the
+program is provided without any warranty whatsoever, whether express,
+implied, statutory or otherwise.  The term "warranty" used herein
+includes, but is not limited to, any warranty of the quality,
+performance, merchantability and fitness for a particular purpose of
+the program and the nonexistence of any infringement or violation of
+any right of any third party.
+
+Each user of the program will agree and understand, and be deemed to
+have agreed and understood, that there is no warranty whatsoever for
+the program and, accordingly, the entire risk arising from or
+otherwise connected with the program is assumed by the user.
+
+Therefore, neither ICOT, the copyright holder, or any other
+organization that participated in or was otherwise related to the
+development of the program and their respective officials, directors,
+officers and other employees shall be held liable for any and all
+damages, including, without limitation, general, special, incidental
+and consequential damages, arising out of or otherwise in connection
+with the use or inability to use the program or any product, material
+or result produced or otherwise obtained by using the program,
+regardless of whether they have been advised of, or otherwise had
+knowledge of, the possibility of such damages at any time during the
+project or thereafter.  Each user will be deemed to have agreed to the
+foregoing by his or her commencement of use of the program.  The term
+"use" as used herein includes, but is not limited to, the use,
+modification, copying and distribution of the program and the
+production of secondary products from the program.
+
+In the case where the program, whether in its original form or
+modified, was distributed or delivered to or received by a user from
+any person, organization or entity other than ICOT, unless it makes or
+grants independently of ICOT any specific warranty to the user in
+writing, such person, organization or entity, will also be exempted
+from and not be held liable to the user for any such damages as noted
+above as far as the program is concerned.
+
+===========================================================================
+Nori Korean Morphological Analyzer - Apache Lucene Integration
+===========================================================================
+
+This software includes a binary and/or source version of data from
+
+  mecab-ko-dic-2.0.3-20170922
+
+which can be obtained from
+
+  https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz

+ 1 - 0
plugins/analysis-nori/licenses/lucene-analyzers-nori-7.4.0-snapshot-1ed95c097b.jar.sha1

@@ -0,0 +1 @@
+a7daed3dc3a67674862002f315cd9193944de783

+ 54 - 0
plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriAnalyzerProvider.java

@@ -0,0 +1,54 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import java.util.List;
+import java.util.Set;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.apache.lucene.analysis.ko.POS;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+import static org.elasticsearch.index.analysis.NoriPartOfSpeechStopFilterFactory.resolvePOSList;
+
+
+/**
+ * Provider for the {@code nori} analyzer: builds a single Lucene {@link KoreanAnalyzer}
+ * from the index-level analyzer settings and hands the same instance out on every
+ * {@link #get()} call.
+ */
+public class NoriAnalyzerProvider extends AbstractIndexAnalyzerProvider<KoreanAnalyzer> {
+    private final KoreanAnalyzer analyzer;
+
+    /**
+     * Reads the decompound mode, the optional user dictionary and the optional
+     * {@code stoptags} list from {@code settings} and creates the analyzer.
+     *
+     * @param indexSettings settings of the index this analyzer belongs to
+     * @param env           environment passed to the helpers that load the user dictionary and the tag list
+     * @param name          name under which the analyzer is registered
+     * @param settings      analyzer-specific settings
+     */
+    public NoriAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        // The tokenizer factory owns the parsing of the shared tokenizer options.
+        final KoreanTokenizer.DecompoundMode decompoundMode = NoriTokenizerFactory.getMode(settings);
+        final UserDictionary dictionary = NoriTokenizerFactory.getUserDictionary(env, settings);
+
+        // No "stoptags" configured -> fall back to Lucene's default part-of-speech stop tags.
+        final List<String> configuredTags = Analysis.getWordList(env, settings, "stoptags");
+        final Set<POS.Tag> tagsToDrop;
+        if (configuredTags == null) {
+            tagsToDrop = KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+        } else {
+            tagsToDrop = resolvePOSList(configuredTags);
+        }
+
+        // NOTE(review): the trailing `false` is presumably Lucene's outputUnknownUnigrams flag -
+        // confirm against the KoreanAnalyzer constructor Javadoc.
+        this.analyzer = new KoreanAnalyzer(dictionary, decompoundMode, tagsToDrop, false);
+    }
+
+    /**
+     * @return the analyzer instance created in the constructor
+     */
+    @Override
+    public KoreanAnalyzer get() {
+        return this.analyzer;
+    }
+}

+ 55 - 0
plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java

@@ -0,0 +1,55 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter;
+import org.apache.lucene.analysis.ko.POS;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Factory for the {@code nori_part_of_speech} token filter, which wraps a token stream
+ * in a Lucene {@link KoreanPartOfSpeechStopFilter} configured with a set of stop tags.
+ */
+public class NoriPartOfSpeechStopFilterFactory extends AbstractTokenFilterFactory {
+    private final Set<POS.Tag> stopTags;
+
+    /**
+     * Resolves the stop tags from the {@code stoptags} setting; when the setting yields
+     * no list, {@link KoreanPartOfSpeechStopFilter#DEFAULT_STOP_TAGS} is used instead.
+     *
+     * @param indexSettings settings of the index this filter belongs to
+     * @param env           environment passed to the word-list loader
+     * @param name          name under which the filter is registered
+     * @param settings      filter-specific settings
+     */
+    public NoriPartOfSpeechStopFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        final List<String> configuredTags = Analysis.getWordList(env, settings, "stoptags");
+        if (configuredTags == null) {
+            this.stopTags = KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
+        } else {
+            this.stopTags = resolvePOSList(configuredTags);
+        }
+    }
+
+    /**
+     * @param tokenStream the stream to filter
+     * @return {@code tokenStream} wrapped in a part-of-speech stop filter using this factory's tags
+     */
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        final TokenStream filtered = new KoreanPartOfSpeechStopFilter(tokenStream, stopTags);
+        return filtered;
+    }
+
+    /**
+     * Converts tag names into {@link POS.Tag} values via {@link POS#resolveTag(String)}.
+     * Duplicate names collapse because the result is a set.
+     *
+     * @param tagList tag names to resolve
+     * @return a new mutable set holding the resolved tags
+     */
+    static Set<POS.Tag> resolvePOSList(List<String> tagList) {
+        final Set<POS.Tag> resolved = new HashSet<>();
+        tagList.forEach(tagName -> resolved.add(POS.resolveTag(tagName)));
+        return resolved;
+    }
+}

+ 37 - 0
plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriReadingFormFilterFactory.java

@@ -0,0 +1,37 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ko.KoreanReadingFormFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+/**
+ * Factory for the {@code nori_readingform} token filter. It takes no options of its own
+ * and simply wraps the incoming stream in a Lucene {@link KoreanReadingFormFilter}.
+ */
+public class NoriReadingFormFilterFactory extends AbstractTokenFilterFactory {
+
+    /**
+     * @param indexSettings settings of the index this filter belongs to
+     * @param environment   unused here; part of the constructor shape shared with the other Nori factories
+     * @param name          name under which the filter is registered
+     * @param settings      filter-specific settings, forwarded to the base class
+     */
+    public NoriReadingFormFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+    }
+
+    /**
+     * @param tokenStream the stream to wrap
+     * @return a {@link KoreanReadingFormFilter} reading from {@code tokenStream}
+     */
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        final TokenStream readingForm = new KoreanReadingFormFilter(tokenStream);
+        return readingForm;
+    }
+}

+ 72 - 0
plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java

@@ -0,0 +1,72 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.apache.lucene.analysis.ko.dict.UserDictionary;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Locale;
+
+/**
+ * Factory for the {@code nori_tokenizer}: creates Lucene {@link KoreanTokenizer} instances
+ * configured from the {@code decompound_mode} and {@code user_dictionary} settings.
+ * The two static helpers are public so that other components can parse the same options.
+ */
+public class NoriTokenizerFactory extends AbstractTokenizerFactory {
+    private static final String USER_DICT_OPTION = "user_dictionary";
+
+    private final UserDictionary userDictionary;
+    private final KoreanTokenizer.DecompoundMode decompoundMode;
+
+    /**
+     * @param indexSettings settings of the index this tokenizer belongs to
+     * @param env           environment passed to the user-dictionary loader
+     * @param name          name under which the tokenizer is registered
+     * @param settings      tokenizer-specific settings
+     */
+    public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        this.decompoundMode = getMode(settings);
+        this.userDictionary = getUserDictionary(env, settings);
+    }
+
+    /**
+     * Loads the user dictionary referenced by the {@code user_dictionary} setting.
+     *
+     * @param env      environment handed to {@code Analysis.getReaderFromFile}
+     * @param settings settings that may carry the {@code user_dictionary} option
+     * @return the parsed dictionary, or {@code null} when no reader is available for the option
+     * @throws ElasticsearchException wrapping any {@link IOException} raised while reading or closing
+     */
+    public static UserDictionary getUserDictionary(Environment env, Settings settings) {
+        // A null resource is legal in try-with-resources; it is simply not closed.
+        try (Reader dictReader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) {
+            return dictReader == null ? null : UserDictionary.open(dictReader);
+        } catch (IOException ioe) {
+            throw new ElasticsearchException("failed to load nori user dictionary", ioe);
+        }
+    }
+
+    /**
+     * Parses the {@code decompound_mode} setting.
+     *
+     * @param settings settings that may carry the {@code decompound_mode} option
+     * @return the configured mode (matched case-insensitively against the enum constant names),
+     *         or {@link KoreanTokenizer#DEFAULT_DECOMPOUND} when the option is absent
+     * @throws IllegalArgumentException if the value does not name a {@code DecompoundMode} constant
+     */
+    public static KoreanTokenizer.DecompoundMode getMode(Settings settings) {
+        final String configured = settings.get("decompound_mode", null);
+        if (configured == null) {
+            return KoreanTokenizer.DEFAULT_DECOMPOUND;
+        }
+        // Locale.ENGLISH keeps the upper-casing independent of the JVM default locale.
+        final String constantName = configured.toUpperCase(Locale.ENGLISH);
+        return KoreanTokenizer.DecompoundMode.valueOf(constantName);
+    }
+
+    /**
+     * @return a new tokenizer using this factory's user dictionary and decompound mode
+     */
+    @Override
+    public Tokenizer create() {
+        // NOTE(review): the trailing `false` is presumably Lucene's outputUnknownUnigrams flag -
+        // confirm against the KoreanTokenizer constructor Javadoc.
+        return new KoreanTokenizer(
+            KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY,
+            userDictionary,
+            decompoundMode,
+            false);
+    }
+}

+ 57 - 0
plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java

@@ -0,0 +1,57 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.plugin.analysis.nori;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.elasticsearch.index.analysis.AnalyzerProvider;
+import org.elasticsearch.index.analysis.NoriAnalyzerProvider;
+import org.elasticsearch.index.analysis.NoriPartOfSpeechStopFilterFactory;
+import org.elasticsearch.index.analysis.NoriReadingFormFilterFactory;
+import org.elasticsearch.index.analysis.NoriTokenizerFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static java.util.Collections.singletonMap;
+
+/**
+ * Analysis plugin that registers the nori components: the {@code nori_part_of_speech} and
+ * {@code nori_readingform} token filters, the {@code nori_tokenizer} tokenizer and the
+ * {@code nori} analyzer.
+ */
+public class AnalysisNoriPlugin extends Plugin implements AnalysisPlugin {
+    /** Returns a fresh mutable map holding the two nori token filter providers. */
+    @Override
+    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+        final Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
+        filters.put("nori_readingform", NoriReadingFormFilterFactory::new);
+        filters.put("nori_part_of_speech", NoriPartOfSpeechStopFilterFactory::new);
+        return filters;
+    }
+
+    /** Returns a single-entry map exposing {@link NoriTokenizerFactory} as {@code nori_tokenizer}. */
+    @Override
+    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
+        final AnalysisProvider<TokenizerFactory> tokenizerProvider = NoriTokenizerFactory::new;
+        return singletonMap("nori_tokenizer", tokenizerProvider);
+    }
+
+    /** Returns a single-entry map exposing {@link NoriAnalyzerProvider} as {@code nori}. */
+    @Override
+    public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
+        final AnalysisProvider<AnalyzerProvider<? extends Analyzer>> analyzerProvider = NoriAnalyzerProvider::new;
+        return singletonMap("nori", analyzerProvider);
+    }
+}

+ 48 - 0
plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java

@@ -0,0 +1,48 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.ko.KoreanTokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
+import org.elasticsearch.plugin.analysis.nori.AnalysisNoriPlugin;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Runs the shared {@link AnalysisFactoryTestCase} checks against {@link AnalysisNoriPlugin},
+ * extending the inherited tokenizer and token filter maps with the Korean entries.
+ */
+public class AnalysisNoriFactoryTests extends AnalysisFactoryTestCase {
+    public AnalysisNoriFactoryTests() {
+        super(new AnalysisNoriPlugin());
+    }
+
+    /** Returns the inherited tokenizer map plus an entry for {@code korean}. */
+    @Override
+    protected Map<String, Class<?>> getTokenizers() {
+        final Map<String, Class<?>> known = new HashMap<>(super.getTokenizers());
+        // NOTE(review): this entry maps to the Lucene KoreanTokenizerFactory, whereas the filter
+        // entries in getTokenFilters() map to the Nori*FilterFactory classes - confirm this is intended.
+        known.put("korean", KoreanTokenizerFactory.class);
+        return known;
+    }
+
+    /** Returns the inherited token filter map plus the two Korean filter entries. */
+    @Override
+    protected Map<String, Class<?>> getTokenFilters() {
+        final Map<String, Class<?>> known = new HashMap<>(super.getTokenFilters());
+        known.put("koreanreadingform", NoriReadingFormFilterFactory.class);
+        known.put("koreanpartofspeechstop", NoriPartOfSpeechStopFilterFactory.class);
+        return known;
+    }
+}

+ 147 - 0
plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java

@@ -0,0 +1,147 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.plugin.analysis.nori.AnalysisNoriPlugin;
+import org.elasticsearch.test.ESTestCase.TestAnalysis;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.hamcrest.Matchers.instanceOf;
+
+/**
+ * Unit tests for the analysis components registered by {@link AnalysisNoriPlugin}:
+ * the {@code nori} analyzer, the {@code nori_tokenizer} tokenizer and the
+ * {@code nori_part_of_speech} / {@code nori_readingform} token filters.
+ */
+public class NoriAnalysisTests extends ESTokenStreamTestCase {
+    /** With no analysis settings, every nori component name resolves to its expected implementation class. */
+    public void testDefaultsNoriAnalysis() throws IOException {
+        TestAnalysis analysis = createTestAnalysis(Settings.EMPTY);
+
+        TokenizerFactory tokenizerFactory = analysis.tokenizer.get("nori_tokenizer");
+        assertThat(tokenizerFactory, instanceOf(NoriTokenizerFactory.class));
+
+        TokenFilterFactory filterFactory = analysis.tokenFilter.get("nori_part_of_speech");
+        assertThat(filterFactory, instanceOf(NoriPartOfSpeechStopFilterFactory.class));
+
+        filterFactory = analysis.tokenFilter.get("nori_readingform");
+        assertThat(filterFactory, instanceOf(NoriReadingFormFilterFactory.class));
+
+        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
+        NamedAnalyzer analyzer = indexAnalyzers.get("nori");
+        assertThat(analyzer.analyzer(), instanceOf(KoreanAnalyzer.class));
+    }
+
+    /**
+     * A {@code nori} analyzer configured with stop tags {@code NR, SP} and {@code mixed} decompound mode:
+     * the first input yields only the tokens of its second word, and the compound input yields the
+     * compound followed by its two parts.
+     */
+    public void testNoriAnalyzer() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "nori")
+            .put("index.analysis.analyzer.my_analyzer.stoptags", "NR, SP")
+            .put("index.analysis.analyzer.my_analyzer.decompound_mode", "mixed")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        try (TokenStream stream = analyzer.tokenStream("", "여섯 용이")) {
+            assertTokenStreamContents(stream, new String[] {"용", "이"});
+        }
+
+        try (TokenStream stream = analyzer.tokenStream("", "가늠표")) {
+            assertTokenStreamContents(stream, new String[] {"가늠표", "가늠", "표"});
+        }
+    }
+
+    /**
+     * A {@code nori} analyzer using the {@code user_dict.txt} file that {@link #createTestAnalysis}
+     * copies into the config directory; checks the segmentation of two inputs that have dictionary entries.
+     */
+    public void testNoriAnalyzerUserDict() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "nori")
+            .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
+            assertTokenStreamContents(stream, new String[] {"세종", "시"});
+        }
+
+        try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
+            assertTokenStreamContents(stream, new String[] {"c++", "world"});
+        }
+    }
+
+    /** A {@code nori_tokenizer} in {@code mixed} decompound mode, reused across two inputs via {@code setReader}. */
+    public void testNoriTokenizer() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")
+            .put("index.analysis.tokenizer.my_tokenizer.decompound_mode", "mixed")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create();
+        tokenizer.setReader(new StringReader("뿌리가 깊은 나무"));
+        assertTokenStreamContents(tokenizer, new String[] {"뿌리", "가", "깊", "은", "나무"});
+        tokenizer.setReader(new StringReader("가늠표"));
+        assertTokenStreamContents(tokenizer, new String[] {"가늠표", "가늠", "표"});
+    }
+
+    /** A {@code nori_part_of_speech} filter with stop tags {@code NR, SP} applied on top of a default {@link KoreanTokenizer}. */
+    public void testNoriPartOfSpeech() throws IOException {
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_filter.type", "nori_part_of_speech")
+            .put("index.analysis.filter.my_filter.stoptags", "NR, SP")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        TokenFilterFactory factory = analysis.tokenFilter.get("my_filter");
+        Tokenizer tokenizer = new KoreanTokenizer();
+        tokenizer.setReader(new StringReader("여섯 용이"));
+        TokenStream stream = factory.create(tokenizer);
+        assertTokenStreamContents(stream, new String[] {"용", "이"});
+    }
+
+    /**
+     * A {@code nori_readingform} filter applied on top of a default {@link KoreanTokenizer}.
+     * This test builds its settings directly instead of going through {@link #createTestAnalysis},
+     * so no user dictionary file is copied.
+     */
+    public void testNoriReadingForm() throws IOException {
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .put("index.analysis.filter.my_filter.type", "nori_readingform")
+            .build();
+        TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisNoriPlugin());
+        TokenFilterFactory factory = analysis.tokenFilter.get("my_filter");
+        Tokenizer tokenizer = new KoreanTokenizer();
+        tokenizer.setReader(new StringReader("鄕歌"));
+        TokenStream stream = factory.create(tokenizer);
+        assertTokenStreamContents(stream, new String[] {"향가"});
+    }
+
+    /**
+     * Builds a {@link TestAnalysis} for the nori plugin on top of a temporary home directory whose
+     * {@code config} folder contains a copy of the {@code user_dict.txt} test resource.
+     *
+     * @param analysisSettings settings added after the version and home path settings
+     * @return the analysis registry created from the merged settings
+     * @throws IOException if the config directory or dictionary copy cannot be created
+     */
+    private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
+        Path home = createTempDir();
+        Path config = home.resolve("config");
+        Files.createDirectory(config);
+        // Files.copy(InputStream, Path) leaves its source open, so close the resource stream here.
+        try (InputStream dict = NoriAnalysisTests.class.getResourceAsStream("user_dict.txt")) {
+            Files.copy(dict, config.resolve("user_dict.txt"));
+        }
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put(Environment.PATH_HOME_SETTING.getKey(), home)
+            .put(analysisSettings)
+            .build();
+        return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisNoriPlugin());
+    }
+}

+ 39 - 0
plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriClientYamlTestSuiteIT.java

@@ -0,0 +1,39 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+
+import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
+import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
+
+/**
+ * YAML REST test suite for the nori plugin; each instance runs one candidate
+ * produced by {@link ESClientYamlSuiteTestCase#createParameters()}.
+ */
+public class NoriClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
+
+    public NoriClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate candidate) {
+        super(candidate);
+    }
+
+    /** Supplies the test candidates by delegating to the inherited static factory. */
+    @ParametersFactory
+    public static Iterable<Object[]> parameters() throws Exception {
+        final Iterable<Object[]> candidates = createParameters();
+        return candidates;
+    }
+}
+

+ 5 - 0
plugins/analysis-nori/src/test/resources/org/elasticsearch/index/analysis/user_dict.txt

@@ -0,0 +1,5 @@
+# Additional nouns
+c++
+C샤프
+세종
+세종시 세종 시

+ 48 - 0
plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml

@@ -0,0 +1,48 @@
+# Integration tests for Korean analysis components
+#
+# Each test below sends one request to the analyze API and checks the number
+# and text of the returned tokens.
+---
+# Full `nori` analyzer: three tokens are expected. The tokens 가 and 은 seen in
+# the bare tokenizer test are absent here (presumably dropped by the analyzer's
+# default part-of-speech stop tags - confirm).
+"Analyzer":
+    - do:
+        indices.analyze:
+          body:
+            text:         뿌리가 깊은 나무
+            analyzer:     nori
+    - length: { tokens: 3 }
+    - match:  { tokens.0.token: 뿌리 }
+    - match:  { tokens.1.token: 깊 }
+    - match:  { tokens.2.token: 나무 }
+---
+# Bare `nori_tokenizer` without filters: five tokens are expected for the same text.
+"Tokenizer":
+    - do:
+        indices.analyze:
+          body:
+            text:         뿌리가 깊은 나무
+            tokenizer:    nori_tokenizer
+    - length: { tokens: 5 }
+    - match:  { tokens.0.token: 뿌리 }
+    - match:  { tokens.1.token: 가  }
+    - match:  { tokens.2.token: 깊  }
+    - match:  { tokens.3.token: 은  }
+    - match:  { tokens.4.token: 나무 }
+---
+# `nori_tokenizer` followed by `nori_part_of_speech` with no explicit settings:
+# the output matches the three tokens of the full analyzer test.
+"Part of speech filter":
+    - do:
+        indices.analyze:
+          body:
+            text:         뿌리가 깊은 나무
+            tokenizer:    nori_tokenizer
+            filter:       [nori_part_of_speech]
+    - length: { tokens: 3 }
+    - match:  { tokens.0.token: 뿌리 }
+    - match:  { tokens.1.token: 깊  }
+    - match:  { tokens.2.token: 나무 }
+---
+# `nori_tokenizer` followed by `nori_readingform`: the Hanja input is expected
+# to come back as the single Hangul token 향가.
+"Reading filter":
+    - do:
+        indices.analyze:
+          body:
+            text:         鄕歌
+            tokenizer:    nori_tokenizer
+            filter:       [nori_readingform]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: 향가 }

+ 32 - 0
plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml

@@ -0,0 +1,32 @@
+# Integration tests for Korean analysis components
+#
+# Creates an index whose `text` field uses the `nori` analyzer, indexes one
+# document, refreshes, and expects a match query for 나무 to return one hit.
+---
+"Index Korean content":
+  # Map the `text` field to the `nori` analyzer.
+  - do:
+      indices.create:
+        index: test
+        body:
+          mappings:
+            type:
+              properties:
+                text:
+                  type:     text
+                  analyzer: nori
+
+  # Index a single document; the indexed text ends in 나무는 rather than the bare query term 나무.
+  - do:
+      index:
+        index:  test
+        type:   type
+        id:     1
+        body:   { "text": "뿌리가 깊은 나무는" }
+  # Refresh so the document is visible to the search below.
+  - do:
+      indices.refresh: {}
+
+  # The match query on 나무 must find the document indexed above.
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            match:
+              text: 나무
+  - match: { hits.total: 1 }