6 سال پیش · 2eef9707cc
--- a/docs/plugins/analysis-smartcn.asciidoc
+++ b/docs/plugins/analysis-smartcn.asciidoc
@@ -17,7 +17,415 @@ include::install_remove.asciidoc[]
 
				 [float]
			
 
				 ==== `smartcn` tokenizer and token filter
			
 
				 
			
 
				-The plugin provides the `smartcn` analyzer and `smartcn_tokenizer` tokenizer,
			
 
				-which are not configurable.
			
 
				+The plugin provides the `smartcn` analyzer and `smartcn_tokenizer` tokenizer,
			
 
				+which are not configurable, and the configurable `smartcn_stop` token filter.
			
 
				 
			
 
				 NOTE: The `smartcn_word` token filter and `smartcn_sentence` tokenizer have been deprecated.
			
 
				+
			
 
				+==== Reimplementing and extending the analyzers
			
 
				+
			
 
				+The `smartcn` analyzer could be reimplemented as a `custom` analyzer that can
			
 
				+then be extended and configured as follows:
			
 
				+
			
 
				+[source,js]
			
 
				+----------------------------------------------------
			
 
				+PUT smartcn_example
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "rebuilt_smartcn": {
			
 
				+          "tokenizer":  "smartcn_tokenizer",
			
 
				+          "filter": [
			
 
				+            "porter_stem",
			
 
				+            "smartcn_stop"
			
 
				+          ]
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+----------------------------------------------------
			
 
				+// CONSOLE
			
 
				+// TEST[s/\n$/\nstartyaml\n  - compare_analyzers: {index: smartcn_example, first: smartcn, second: rebuilt_smartcn}\nendyaml\n/]
			
 
				+
			
 
				+[[analysis-smartcn_stop]]
			
 
				+==== `smartcn_stop` token filter
			
 
				+
			
 
				+The `smartcn_stop` token filter filters out stopwords defined by the `smartcn`
			
 
				+analyzer (`_smartcn_`), and any other custom stopwords specified by the user.
			
 
				+This filter only supports the predefined `_smartcn_` stopwords list.
			
 
				+If you want to use a different predefined list, then use the
			
 
				+{ref}/analysis-stop-tokenfilter.html[`stop` token filter] instead.
			
 
				+
			
 
				+[source,js]
			
 
				+--------------------------------------------------
			
 
				+PUT smartcn_example
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "index": {
			
 
				+      "analysis": {
			
 
				+        "analyzer": {
			
 
				+          "smartcn_with_stop": {
			
 
				+            "tokenizer": "smartcn_tokenizer",
			
 
				+            "filter": [
			
 
				+              "porter_stem",
			
 
				+              "my_smartcn_stop"
			
 
				+            ]
			
 
				+          }
			
 
				+        },
			
 
				+        "filter": {
			
 
				+          "my_smartcn_stop": {
			
 
				+            "type": "smartcn_stop",
			
 
				+            "stopwords": [
			
 
				+              "_smartcn_",
			
 
				+              "stack",
			
 
				+              "的"
			
 
				+            ]
			
 
				+          }
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+GET smartcn_example/_analyze
			
 
				+{
			
 
				+  "analyzer": "smartcn_with_stop",
			
 
				+  "text": "哈喽，我们是 Elastic   我们是 Elastic Stack（Elasticsearch、Kibana、Beats 和 Logstash）的开发公司。从股票行情到 Twitter 消息流，从 Apache 日志到 WordPress 博文，我们可以帮助人们体验搜索的强大力量，帮助他们以截然不同的方式探索和分析数据"
			
 
				+}
			
 
				+--------------------------------------------------
			
 
				+// CONSOLE
			
 
				+
			
 
				+The above request returns:
			
 
				+
			
 
				+[source,js]
			
 
				+--------------------------------------------------
			
 
				+{
			
 
				+    "tokens": [
			
 
				+        {
			
 
				+            "token": "哈",
			
 
				+            "start_offset": 0,
			
 
				+            "end_offset": 1,
			
 
				+            "type": "word",
			
 
				+            "position": 0
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "喽",
			
 
				+            "start_offset": 1,
			
 
				+            "end_offset": 2,
			
 
				+            "type": "word",
			
 
				+            "position": 1
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "我们",
			
 
				+            "start_offset": 3,
			
 
				+            "end_offset": 5,
			
 
				+            "type": "word",
			
 
				+            "position": 3
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "是",
			
 
				+            "start_offset": 5,
			
 
				+            "end_offset": 6,
			
 
				+            "type": "word",
			
 
				+            "position": 4
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "elast",
			
 
				+            "start_offset": 7,
			
 
				+            "end_offset": 14,
			
 
				+            "type": "word",
			
 
				+            "position": 5
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "我们",
			
 
				+            "start_offset": 17,
			
 
				+            "end_offset": 19,
			
 
				+            "type": "word",
			
 
				+            "position": 6
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "是",
			
 
				+            "start_offset": 19,
			
 
				+            "end_offset": 20,
			
 
				+            "type": "word",
			
 
				+            "position": 7
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "elast",
			
 
				+            "start_offset": 21,
			
 
				+            "end_offset": 28,
			
 
				+            "type": "word",
			
 
				+            "position": 8
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "elasticsearch",
			
 
				+            "start_offset": 35,
			
 
				+            "end_offset": 48,
			
 
				+            "type": "word",
			
 
				+            "position": 11
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "kibana",
			
 
				+            "start_offset": 49,
			
 
				+            "end_offset": 55,
			
 
				+            "type": "word",
			
 
				+            "position": 13
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "beat",
			
 
				+            "start_offset": 56,
			
 
				+            "end_offset": 61,
			
 
				+            "type": "word",
			
 
				+            "position": 15
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "和",
			
 
				+            "start_offset": 62,
			
 
				+            "end_offset": 63,
			
 
				+            "type": "word",
			
 
				+            "position": 16
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "logstash",
			
 
				+            "start_offset": 64,
			
 
				+            "end_offset": 72,
			
 
				+            "type": "word",
			
 
				+            "position": 17
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "开发",
			
 
				+            "start_offset": 74,
			
 
				+            "end_offset": 76,
			
 
				+            "type": "word",
			
 
				+            "position": 20
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "公司",
			
 
				+            "start_offset": 76,
			
 
				+            "end_offset": 78,
			
 
				+            "type": "word",
			
 
				+            "position": 21
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "从",
			
 
				+            "start_offset": 79,
			
 
				+            "end_offset": 80,
			
 
				+            "type": "word",
			
 
				+            "position": 23
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "股票",
			
 
				+            "start_offset": 80,
			
 
				+            "end_offset": 82,
			
 
				+            "type": "word",
			
 
				+            "position": 24
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "行情",
			
 
				+            "start_offset": 82,
			
 
				+            "end_offset": 84,
			
 
				+            "type": "word",
			
 
				+            "position": 25
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "到",
			
 
				+            "start_offset": 84,
			
 
				+            "end_offset": 85,
			
 
				+            "type": "word",
			
 
				+            "position": 26
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "twitter",
			
 
				+            "start_offset": 86,
			
 
				+            "end_offset": 93,
			
 
				+            "type": "word",
			
 
				+            "position": 27
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "消息",
			
 
				+            "start_offset": 94,
			
 
				+            "end_offset": 96,
			
 
				+            "type": "word",
			
 
				+            "position": 28
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "流",
			
 
				+            "start_offset": 96,
			
 
				+            "end_offset": 97,
			
 
				+            "type": "word",
			
 
				+            "position": 29
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "从",
			
 
				+            "start_offset": 98,
			
 
				+            "end_offset": 99,
			
 
				+            "type": "word",
			
 
				+            "position": 31
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "apach",
			
 
				+            "start_offset": 100,
			
 
				+            "end_offset": 106,
			
 
				+            "type": "word",
			
 
				+            "position": 32
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "日志",
			
 
				+            "start_offset": 107,
			
 
				+            "end_offset": 109,
			
 
				+            "type": "word",
			
 
				+            "position": 33
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "到",
			
 
				+            "start_offset": 109,
			
 
				+            "end_offset": 110,
			
 
				+            "type": "word",
			
 
				+            "position": 34
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "wordpress",
			
 
				+            "start_offset": 111,
			
 
				+            "end_offset": 120,
			
 
				+            "type": "word",
			
 
				+            "position": 35
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "博",
			
 
				+            "start_offset": 121,
			
 
				+            "end_offset": 122,
			
 
				+            "type": "word",
			
 
				+            "position": 36
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "文",
			
 
				+            "start_offset": 122,
			
 
				+            "end_offset": 123,
			
 
				+            "type": "word",
			
 
				+            "position": 37
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "我们",
			
 
				+            "start_offset": 124,
			
 
				+            "end_offset": 126,
			
 
				+            "type": "word",
			
 
				+            "position": 39
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "可以",
			
 
				+            "start_offset": 126,
			
 
				+            "end_offset": 128,
			
 
				+            "type": "word",
			
 
				+            "position": 40
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "帮助",
			
 
				+            "start_offset": 128,
			
 
				+            "end_offset": 130,
			
 
				+            "type": "word",
			
 
				+            "position": 41
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "人们",
			
 
				+            "start_offset": 130,
			
 
				+            "end_offset": 132,
			
 
				+            "type": "word",
			
 
				+            "position": 42
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "体验",
			
 
				+            "start_offset": 132,
			
 
				+            "end_offset": 134,
			
 
				+            "type": "word",
			
 
				+            "position": 43
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "搜索",
			
 
				+            "start_offset": 134,
			
 
				+            "end_offset": 136,
			
 
				+            "type": "word",
			
 
				+            "position": 44
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "强大",
			
 
				+            "start_offset": 137,
			
 
				+            "end_offset": 139,
			
 
				+            "type": "word",
			
 
				+            "position": 46
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "力量",
			
 
				+            "start_offset": 139,
			
 
				+            "end_offset": 141,
			
 
				+            "type": "word",
			
 
				+            "position": 47
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "帮助",
			
 
				+            "start_offset": 142,
			
 
				+            "end_offset": 144,
			
 
				+            "type": "word",
			
 
				+            "position": 49
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "他们",
			
 
				+            "start_offset": 144,
			
 
				+            "end_offset": 146,
			
 
				+            "type": "word",
			
 
				+            "position": 50
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "以",
			
 
				+            "start_offset": 146,
			
 
				+            "end_offset": 147,
			
 
				+            "type": "word",
			
 
				+            "position": 51
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "截然不同",
			
 
				+            "start_offset": 147,
			
 
				+            "end_offset": 151,
			
 
				+            "type": "word",
			
 
				+            "position": 52
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "方式",
			
 
				+            "start_offset": 152,
			
 
				+            "end_offset": 154,
			
 
				+            "type": "word",
			
 
				+            "position": 54
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "探索",
			
 
				+            "start_offset": 154,
			
 
				+            "end_offset": 156,
			
 
				+            "type": "word",
			
 
				+            "position": 55
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "和",
			
 
				+            "start_offset": 156,
			
 
				+            "end_offset": 157,
			
 
				+            "type": "word",
			
 
				+            "position": 56
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "分析",
			
 
				+            "start_offset": 157,
			
 
				+            "end_offset": 159,
			
 
				+            "type": "word",
			
 
				+            "position": 57
			
 
				+        },
			
 
				+        {
			
 
				+            "token": "数据",
			
 
				+            "start_offset": 159,
			
 
				+            "end_offset": 161,
			
 
				+            "type": "word",
			
 
				+            "position": 58
			
 
				+        }
			
 
				+    ]
			
 
				+}
			
 
				+--------------------------------------------------
			
 
				+// TESTRESPONSE
			
--- a/plugins/analysis-smartcn/src/main/java/org/elasticsearch/index/analysis/SmartChineseStopTokenFilterFactory.java
+++ b/plugins/analysis-smartcn/src/main/java/org/elasticsearch/index/analysis/SmartChineseStopTokenFilterFactory.java
@@ -0,0 +1,61 @@
 
				+/*
			
 
				+ * Licensed to Elasticsearch under one or more contributor
			
 
				+ * license agreements. See the NOTICE file distributed with
			
 
				+ * this work for additional information regarding copyright
			
 
				+ * ownership. Elasticsearch licenses this file to you under
			
 
				+ * the Apache License, Version 2.0 (the "License"); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing,
			
 
				+ * software distributed under the License is distributed on an
			
 
				+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
			
 
				+ * KIND, either express or implied.  See the License for the
			
 
				+ * specific language governing permissions and limitations
			
 
				+ * under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.elasticsearch.index.analysis;
			
 
				+
			
 
				+import org.apache.lucene.analysis.CharArraySet;
			
 
				+import org.apache.lucene.analysis.StopFilter;
			
 
				+import org.apache.lucene.analysis.TokenStream;
			
 
				+import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
			
 
				+import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;
			
 
				+import org.elasticsearch.common.settings.Settings;
			
 
				+import org.elasticsearch.env.Environment;
			
 
				+import org.elasticsearch.index.IndexSettings;
			
 
				+import java.util.Map;
			
 
				+import java.util.Set;
			
 
				+
			
 
				+import static java.util.Collections.singletonMap;
			
 
				+
			
 
				+public class SmartChineseStopTokenFilterFactory extends AbstractTokenFilterFactory { // factory for the stop-word token filter the plugin registers as "smartcn_stop"
			
 
				+    private static final Map<String, Set<?>> NAMED_STOP_WORDS = singletonMap("_smartcn_", SmartChineseAnalyzer.getDefaultStopSet()); // named list "_smartcn_" -> SmartChineseAnalyzer's default stop set
			
 
				+
			
 
				+    private final CharArraySet stopWords; // built by Analysis.parseWords from the "stopwords" setting
			
 
				+
			
 
				+    private final boolean ignoreCase; // "ignore_case" setting (default false); forwarded to Analysis.parseWords
			
 
				+
			
 
				+    private final boolean removeTrailing; // "remove_trailing" setting (default true); chooses the filter class in create()
			
 
				+
			
 
				+    public SmartChineseStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
			
 
				+        super(indexSettings, name, settings);
			
 
				+        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
			
 
				+        this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
			
 
				+        this.stopWords = Analysis.parseWords(env, settings, "stopwords",
			
 
				+            SmartChineseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public TokenStream create(TokenStream tokenStream) {
			
 
				+        if (removeTrailing) { // remove_trailing=true (the default): wrap the stream in a plain StopFilter
			
 
				+            return new StopFilter(tokenStream, stopWords);
			
 
				+        } else { // remove_trailing=false: SuggestStopFilter -- NOTE(review): presumably keeps a trailing stop word; confirm in Lucene docs
			
 
				+            return new SuggestStopFilter(tokenStream, stopWords);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+}
			
--- a/plugins/analysis-smartcn/src/main/java/org/elasticsearch/plugin/analysis/smartcn/AnalysisSmartChinesePlugin.java
+++ b/plugins/analysis-smartcn/src/main/java/org/elasticsearch/plugin/analysis/smartcn/AnalysisSmartChinesePlugin.java
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
 
				 import org.elasticsearch.index.analysis.AnalyzerProvider;
			
 
				 import org.elasticsearch.index.analysis.SmartChineseAnalyzerProvider;
			
 
				 import org.elasticsearch.index.analysis.SmartChineseNoOpTokenFilterFactory;
			
 
				+import org.elasticsearch.index.analysis.SmartChineseStopTokenFilterFactory;
			
 
				 import org.elasticsearch.index.analysis.SmartChineseTokenizerTokenizerFactory;
			
 
				 import org.elasticsearch.index.analysis.TokenFilterFactory;
			
 
				 import org.elasticsearch.index.analysis.TokenizerFactory;
			
@@ -38,15 +39,18 @@ import static java.util.Collections.singletonMap;
 
				 public class AnalysisSmartChinesePlugin extends Plugin implements AnalysisPlugin {
			
 
				     @Override
			
 
				     public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
			
 
				-        // This is a noop token filter; it's here for backwards compat before we had "smartcn_tokenizer"
			
 
				-        return singletonMap("smartcn_word", SmartChineseNoOpTokenFilterFactory::new);
			
 
				+        Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters = new HashMap<>();
			
 
				+        tokenFilters.put("smartcn_stop", SmartChineseStopTokenFilterFactory::new);
			
 
				+        // TODO: deprecate and remove, this is a noop token filter; it's here for backwards compat before we had "smartcn_tokenizer"
			
 
				+        tokenFilters.put("smartcn_word", SmartChineseNoOpTokenFilterFactory::new);
			
 
				+        return tokenFilters;
			
 
				     }
			
 
				 
			
 
				     @Override
			
 
				     public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
			
 
				         Map<String, AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
			
 
				         extra.put("smartcn_tokenizer", SmartChineseTokenizerTokenizerFactory::new);
			
 
				-        // This is an alias to "smartcn_tokenizer"; it's here for backwards compat
			
 
				+        // TODO: deprecate and remove, this is an alias to "smartcn_tokenizer"; it's here for backwards compat
			
 
				         extra.put("smartcn_sentence", SmartChineseTokenizerTokenizerFactory::new);
			
 
				         return extra;
			
 
				     }