9 years ago · a9a0f262af
--- a/docs/plugins/analysis-kuromoji.asciidoc
+++ b/docs/plugins/analysis-kuromoji.asciidoc
@@ -122,6 +122,28 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
 
				 東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
			
 
				 -----------------------
			
 
				 
			
 
				+`nbest_cost`/`nbest_examples`::
			
 
				++
			
 
				+--
			
 
				+Additional expert user parameters `nbest_cost` and `nbest_examples` can be used
			
 
				+to include additional tokens that are most likely according to the statistical model.
			
 
				+If both parameters are used, the larger of the two values is applied.
			
 
				+
			
 
				+`nbest_cost`::
			
 
				+
			
 
				+    The `nbest_cost` parameter specifies an additional Viterbi cost.
			
 
				+    The KuromojiTokenizer will include all tokens in Viterbi paths that are
			
 
				+    within the `nbest_cost` value of the best path.
			
 
				+
			
 
				+`nbest_examples`::
			
 
				+
			
 
				+    The `nbest_examples` parameter can be used to find an `nbest_cost` value based on examples.
			
 
				+    For example, a value of /箱根山-箱根/成田空港-成田/ indicates that in the texts,
			
 
				+    箱根山 (Mt. Hakone) and 成田空港 (Narita Airport) we'd like a cost that gives us
			
 
				+    箱根 (Hakone) and 成田 (Narita).
			
 
				+--
			
 
				+
			
 
				+
			
 
				 Then create an analyzer as follows:
			
 
				 
			
 
				 [source,json]
			
@@ -452,3 +474,48 @@ The above request returns:
 
				 }
			
 
				 --------------------------------------------------
			
 
				 
			
 
				+[[analysis-kuromoji-number]]
			
 
				+===== `kuromoji_number` token filter
			
 
				+
			
 
				+The `kuromoji_number` token filter normalizes Japanese numbers (kansūji)
			
 
				+to regular Arabic decimal numbers in half-width characters.
			
 
				+
			
 
				+[source,json]
			
 
				+--------------------------------------------------
			
 
				+PUT kuromoji_sample
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "index": {
			
 
				+      "analysis": {
			
 
				+        "analyzer": {
			
 
				+          "my_analyzer": {
			
 
				+            "tokenizer": "kuromoji_tokenizer",
			
 
				+            "filter": [
			
 
				+              "kuromoji_number"
			
 
				+            ]
			
 
				+          }
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+POST kuromoji_sample/_analyze?analyzer=my_analyzer&text=一〇〇〇
			
 
				+
			
 
				+--------------------------------------------------
			
 
				+// AUTOSENSE
			
 
				+
			
 
				+[source,text]
			
 
				+--------------------------------------------------
			
 
				+# Result
			
 
				+{
			
 
				+  "tokens" : [ {
			
 
				+    "token" : "1000",
			
 
				+    "start_offset" : 0,
			
 
				+    "end_offset" : 4,
			
 
				+    "type" : "word",
			
 
				+    "position" : 1
			
 
				+  } ]
			
 
				+}
			
 
				+--------------------------------------------------
			
 
				+
			
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java
@@ -0,0 +1,37 @@
 
				+/*
			
 
				+ * Licensed to Elasticsearch under one or more contributor
			
 
				+ * license agreements. See the NOTICE file distributed with
			
 
				+ * this work for additional information regarding copyright
			
 
				+ * ownership. Elasticsearch licenses this file to you under
			
 
				+ * the Apache License, Version 2.0 (the "License"); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing,
			
 
				+ * software distributed under the License is distributed on an
			
 
				+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
			
 
				+ * KIND, either express or implied.  See the License for the
			
 
				+ * specific language governing permissions and limitations
			
 
				+ * under the License.
			
 
				+ */
			
 
				+package org.elasticsearch.index.analysis;
			
 
				+
			
 
				+import org.apache.lucene.analysis.TokenStream;
			
 
				+import org.apache.lucene.analysis.ja.JapaneseNumberFilter;
			
 
				+import org.elasticsearch.common.settings.Settings;
			
 
				+import org.elasticsearch.env.Environment;
			
 
				+import org.elasticsearch.index.IndexSettings;
			
 
				+
			
 
				+public class KuromojiNumberFilterFactory extends AbstractTokenFilterFactory {
			
 
				+
			
 
				+    public KuromojiNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
			
 
				+        super(indexSettings, name, settings);
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public TokenStream create(TokenStream tokenStream) {
			
 
				+        return new JapaneseNumberFilter(tokenStream);
			
 
				+    }
			
 
				+}
			
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
@@ -36,9 +36,13 @@ import java.io.Reader;
 
				 public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
			
 
				 
			
 
				     private static final String USER_DICT_OPTION = "user_dictionary";
			
 
				+    private static final String NBEST_COST = "nbest_cost";
			
 
				+    private static final String NBEST_EXAMPLES = "nbest_examples";
			
 
				 
			
 
				     private final UserDictionary userDictionary;
			
 
				     private final Mode mode;
			
 
				+    private final String nBestExamples;
			
 
				+    private final int nBestCost;
			
 
				 
			
 
				     private boolean discartPunctuation;
			
 
				 
			
@@ -47,6 +51,8 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
 
				         mode = getMode(settings);
			
 
				         userDictionary = getUserDictionary(env, settings);
			
 
				         discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
			
 
				+        nBestCost = settings.getAsInt(NBEST_COST, -1);
			
 
				+        nBestExamples = settings.get(NBEST_EXAMPLES);
			
 
				     }
			
 
				 
			
 
				     public static UserDictionary getUserDictionary(Environment env, Settings settings) {
			
@@ -83,7 +89,13 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
 
				 
			
 
				     @Override
			
 
				     public Tokenizer create() {
			
 
				-        return new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
			
 
				+        JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
			
 
				+        int nBestCost = this.nBestCost;
			
 
				+        if (nBestExamples != null) {
			
 
				+            nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
			
 
				+        }
			
 
				+        t.setNBestCost(nBestCost);
			
 
				+        return t;
			
 
				     }
			
 
				 
			
 
				 }
			
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
 
				 import org.elasticsearch.index.analysis.KuromojiBaseFormFilterFactory;
			
 
				 import org.elasticsearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
			
 
				 import org.elasticsearch.index.analysis.KuromojiKatakanaStemmerFactory;
			
 
				+import org.elasticsearch.index.analysis.KuromojiNumberFilterFactory;
			
 
				 import org.elasticsearch.index.analysis.KuromojiPartOfSpeechFilterFactory;
			
 
				 import org.elasticsearch.index.analysis.KuromojiReadingFormFilterFactory;
			
 
				 import org.elasticsearch.index.analysis.KuromojiTokenizerFactory;
			
@@ -55,5 +56,6 @@ public class AnalysisKuromojiPlugin extends Plugin {
 
				         module.registerTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory::new);
			
 
				         module.registerTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
			
 
				         module.registerTokenFilter("ja_stop", JapaneseStopTokenFilterFactory::new);
			
 
				+        module.registerTokenFilter("kuromoji_number", KuromojiNumberFilterFactory::new);
			
 
				     }
			
 
				 }
			
--- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
+++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@@ -24,7 +24,11 @@ import org.apache.lucene.analysis.Tokenizer;
 
				 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
			
 
				 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
			
 
				 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
			
 
				+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
			
 
				+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
			
 
				+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
			
 
				 import org.elasticsearch.Version;
			
 
				+import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
			
 
				 import org.elasticsearch.cluster.metadata.IndexMetaData;
			
 
				 import org.elasticsearch.common.inject.Injector;
			
 
				 import org.elasticsearch.common.inject.ModulesBuilder;
			
@@ -75,6 +79,9 @@ public class KuromojiAnalysisTests extends ESTestCase {
 
				         filterFactory = analysisService.tokenFilter("ja_stop");
			
 
				         assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
			
 
				 
			
 
				+        filterFactory = analysisService.tokenFilter("kuromoji_number");
			
 
				+        assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
			
 
				+
			
 
				         NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
			
 
				         assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
			
 
				 
			
@@ -262,4 +269,49 @@ public class KuromojiAnalysisTests extends ESTestCase {
 
				         TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict");
			
 
				         assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));
			
 
				     }
			
 
				+
			
 
				+    public void testNbestCost() throws IOException {
			
 
				+        AnalysisService analysisService = createAnalysisService();
			
 
				+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_cost");
			
 
				+        String source = "鳩山積み";
			
 
				+        String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
			
 
				+
			
 
				+        Tokenizer tokenizer = tokenizerFactory.create();
			
 
				+        tokenizer.setReader(new StringReader(source));
			
 
				+        assertSimpleTSOutput(tokenizer, expected);
			
 
				+    }
			
 
				+
			
 
				+    public void testNbestExample() throws IOException {
			
 
				+        AnalysisService analysisService = createAnalysisService();
			
 
				+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_examples");
			
 
				+        String source = "鳩山積み";
			
 
				+        String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
			
 
				+
			
 
				+        Tokenizer tokenizer = tokenizerFactory.create();
			
 
				+        tokenizer.setReader(new StringReader(source));
			
 
				+        assertSimpleTSOutput(tokenizer, expected);
			
 
				+    }
			
 
				+
			
 
				+    public void testNbestBothOptions() throws IOException {
			
 
				+        AnalysisService analysisService = createAnalysisService();
			
 
				+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_both");
			
 
				+        String source = "鳩山積み";
			
 
				+        String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
			
 
				+
			
 
				+        Tokenizer tokenizer = tokenizerFactory.create();
			
 
				+        tokenizer.setReader(new StringReader(source));
			
 
				+        assertSimpleTSOutput(tokenizer, expected);
			
 
				+
			
 
				+    }
			
 
				+
			
 
				+    public void testNumberFilterFactory() throws Exception {
			
 
				+        AnalysisService analysisService = createAnalysisService();
			
 
				+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_number");
			
 
				+        assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class));
			
 
				+        String source = "本日十万二千五百円のワインを買った";
			
 
				+        String[] expected = new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"};
			
 
				+        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
			
 
				+        tokenizer.setReader(new StringReader(source));
			
 
				+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
			
 
				+    }
			
 
				 }
			
--- a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
+++ b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
@@ -18,7 +18,6 @@
 
				                     "type": "ja_stop",
			
 
				                     "stopwords": ["_japanese_", "スピード"]
			
 
				                 }
			
 
				-                
			
 
				             },
			
 
				 
			
 
				             "char_filter":{
			
@@ -48,6 +47,19 @@
 
				                 "kuromoji_user_dict" : {
			
 
				                     "type":"kuromoji_tokenizer",
			
 
				                     "user_dictionary":"user_dict.txt"
			
 
				+                },
			
 
				+                "kuromoji_nbest_cost" : {
			
 
				+                    "type": "kuromoji_tokenizer",
			
 
				+                    "nbest_cost" : "2000"
			
 
				+                },
			
 
				+                "kuromoji_nbest_examples" : {
			
 
				+                    "type": "kuromoji_tokenizer",
			
 
				+                    "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/"
			
 
				+                },
			
 
				+                "kuromoji_nbest_both" : {
			
 
				+                    "type": "kuromoji_tokenizer",
			
 
				+                    "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
			
 
				+                    "nbest_cost" : "1000"
			
 
				                 }
			
 
				             },
			
 
				             "analyzer" : {