Browse Source

Lithuanian analysis

Robert Muir 10 years ago
parent
commit
0d3e3f81fc

+ 2 - 0
core/src/main/java/org/elasticsearch/index/analysis/Analysis.java

@@ -46,6 +46,7 @@ import org.apache.lucene.analysis.hu.HungarianAnalyzer;
 import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
 import org.apache.lucene.analysis.id.IndonesianAnalyzer;
 import org.apache.lucene.analysis.it.ItalianAnalyzer;
+import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
 import org.apache.lucene.analysis.lv.LatvianAnalyzer;
 import org.apache.lucene.analysis.nl.DutchAnalyzer;
 import org.apache.lucene.analysis.no.NorwegianAnalyzer;
@@ -145,6 +146,7 @@ public class Analysis {
             .put("_irish_", IrishAnalyzer.getDefaultStopSet())
             .put("_italian_", ItalianAnalyzer.getDefaultStopSet())
             .put("_latvian_", LatvianAnalyzer.getDefaultStopSet())
+            .put("_lithuanian_", LithuanianAnalyzer.getDefaultStopSet())
             .put("_norwegian_", NorwegianAnalyzer.getDefaultStopSet())
             .put("_persian_", PersianAnalyzer.getDefaultStopSet())
             .put("_portuguese_", PortugueseAnalyzer.getDefaultStopSet())

+ 1 - 0
core/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java

@@ -492,6 +492,7 @@ public class AnalysisModule extends AbstractModule {
             analyzersBindings.processAnalyzer("irish", IrishAnalyzerProvider.class);
             analyzersBindings.processAnalyzer("italian", ItalianAnalyzerProvider.class);
             analyzersBindings.processAnalyzer("latvian", LatvianAnalyzerProvider.class);
+            analyzersBindings.processAnalyzer("lithuanian", LithuanianAnalyzerProvider.class);
             analyzersBindings.processAnalyzer("norwegian", NorwegianAnalyzerProvider.class);
             analyzersBindings.processAnalyzer("persian", PersianAnalyzerProvider.class);
             analyzersBindings.processAnalyzer("portuguese", PortugueseAnalyzerProvider.class);

+ 50 - 0
core/src/main/java/org/elasticsearch/index/analysis/LithuanianAnalyzerProvider.java

@@ -0,0 +1,50 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * Provider for {@link LithuanianAnalyzer}.
+ * <p>
+ * A single analyzer instance is built in the constructor from the analyzer's
+ * own settings and that same instance is handed back by every call to
+ * {@link #get()}.
+ * <p>
+ * Stop words are resolved through {@code Analysis.parseStopWords} with
+ * {@link LithuanianAnalyzer#getDefaultStopSet()} supplied as the default, and
+ * the stem-exclusion set is resolved through {@code Analysis.parseStemExclusion}
+ * with an empty set supplied as the default.
+ */
+public class LithuanianAnalyzerProvider extends AbstractIndexAnalyzerProvider<LithuanianAnalyzer> {
+
+    private final LithuanianAnalyzer analyzer; // created once in the constructor, returned as-is by get()
+
+    @Inject
+    public LithuanianAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        analyzer = new LithuanianAnalyzer(Analysis.parseStopWords(env, settings, LithuanianAnalyzer.getDefaultStopSet()), // first argument: stop-word set
+                                      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); // second argument: terms excluded from stemming
+        analyzer.setVersion(version); // NOTE(review): `version` is not declared in this class; presumably inherited from the base provider -- confirm
+    }
+
+    @Override
+    public LithuanianAnalyzer get() {
+        return this.analyzer; // always the instance built in the constructor; nothing is created per call
+    }
+}

+ 3 - 0
core/src/main/java/org/elasticsearch/index/analysis/StemmerTokenFilterFactory.java

@@ -185,6 +185,9 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
         } else if ("latvian".equalsIgnoreCase(language)) {
             return new LatvianStemFilter(tokenStream);
 
+        } else if ("lithuanian".equalsIgnoreCase(language)) {
+            return new SnowballFilter(tokenStream, new LithuanianStemmer());
+
             // Norwegian (Bokmål) stemmers
         } else if ("norwegian".equalsIgnoreCase(language)) {
             return new SnowballFilter(tokenStream, new NorwegianStemmer());

+ 10 - 0
core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java

@@ -46,6 +46,7 @@ import org.apache.lucene.analysis.hu.HungarianAnalyzer;
 import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
 import org.apache.lucene.analysis.id.IndonesianAnalyzer;
 import org.apache.lucene.analysis.it.ItalianAnalyzer;
+import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
 import org.apache.lucene.analysis.lv.LatvianAnalyzer;
 import org.apache.lucene.analysis.nl.DutchAnalyzer;
 import org.apache.lucene.analysis.no.NorwegianAnalyzer;
@@ -378,6 +379,15 @@ public enum PreBuiltAnalyzers {
         }
     },
 
+    LITHUANIAN { // pre-built analyzer entry backed by Lucene's LithuanianAnalyzer
+        @Override
+        protected Analyzer create(Version version) {
+            Analyzer a = new LithuanianAnalyzer(); // no-arg constructor: no custom stop words or stem exclusions are passed here
+            a.setVersion(version.luceneVersion); // stamp the analyzer with the Lucene version carried by the given Version
+            return a;
+        }
+    },
+
     NORWEGIAN {
         @Override
         protected Analyzer create(Version version) {

+ 3 - 0
core/src/test/java/org/elasticsearch/bwcompat/BasicAnalysisBackwardCompatibilityIT.java

@@ -112,6 +112,9 @@ public class BasicAnalysisBackwardCompatibilityIT extends ESBackcompatTestCase {
             if (preBuiltAnalyzers == PreBuiltAnalyzers.SORANI && compatibilityVersion().before(Version.V_1_3_0)) {
                 continue; // SORANI was added in 1.3.0
             }
+            if (preBuiltAnalyzers == PreBuiltAnalyzers.LITHUANIAN && compatibilityVersion().before(Version.V_2_1_0)) {
+                continue; // LITHUANIAN was added in 2.1.0
+            }
             return preBuiltAnalyzers.name().toLowerCase(Locale.ROOT);
         }
 

+ 48 - 2
docs/reference/analysis/analyzers/lang-analyzer.asciidoc

@@ -25,6 +25,7 @@ following types are supported:
 <<irish-analyzer,`irish`>>,
 <<italian-analyzer,`italian`>>,
 <<latvian-analyzer,`latvian`>>,
+<<lithuanian-analyzer,`lithuanian`>>,
 <<norwegian-analyzer,`norwegian`>>,
 <<persian-analyzer,`persian`>>,
 <<portuguese-analyzer,`portuguese`>>,
@@ -56,8 +57,9 @@ with the `keywords` set to the value of the `stem_exclusion` parameter.
 The following analyzers support setting custom `stem_exclusion` list:
 `arabic`, `armenian`, `basque`, `catalan`, `bulgarian`, `catalan`,
 `czech`, `finnish`, `dutch`, `english`, `finnish`, `french`, `galician`,
-`german`, `irish`, `hindi`, `hungarian`, `indonesian`, `italian`, `latvian`, `norwegian`,
-`portuguese`, `romanian`, `russian`, `sorani`, `spanish`, `swedish`, `turkish`.
+`german`, `irish`, `hindi`, `hungarian`, `indonesian`, `italian`, `latvian`,
+`lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
+`spanish`, `swedish`, `turkish`.
 
 ==== Reimplementing language analyzers
 
@@ -1082,6 +1084,50 @@ The `latvian` analyzer could be reimplemented as a `custom` analyzer as follows:
 <2> This filter should be removed unless there are words which should
     be excluded from stemming.
 
+[[lithuanian-analyzer]]
+===== `lithuanian` analyzer
+
+The `lithuanian` analyzer could be reimplemented as a `custom` analyzer as follows:
+
+[source,js]
+----------------------------------------------------
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "lithuanian_stop": {
+          "type":       "stop",
+          "stopwords":  "_lithuanian_" <1>
+        },
+        "lithuanian_keywords": {
+          "type":       "keyword_marker",
+          "keywords":   [] <2>
+        },
+        "lithuanian_stemmer": {
+          "type":       "stemmer",
+          "language":   "lithuanian"
+        }
+      },
+      "analyzer": {
+        "lithuanian": {
+          "tokenizer":  "standard",
+          "filter": [
+            "lowercase",
+            "lithuanian_stop",
+            "lithuanian_keywords",
+            "lithuanian_stemmer"
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+<1> The default stopwords can be overridden with the `stopwords`
+    or `stopwords_path` parameters.
+<2> This filter should be removed unless there are words which should
+    be excluded from stemming.
+
 [[norwegian-analyzer]]
 ===== `norwegian` analyzer
 

+ 2 - 2
docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc

@@ -5,8 +5,8 @@ A filter that stems words using a Snowball-generated stemmer. The
 `language` parameter controls the stemmer with the following available
 values: `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`,
 `Finnish`, `French`, `German`, `German2`, `Hungarian`, `Italian`, `Kp`,
-`Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`,
-`Spanish`, `Swedish`, `Turkish`.
+`Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`,
+`Russian`, `Spanish`, `Swedish`, `Turkish`.
 
 For example:
 

+ 4 - 0
docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc

@@ -133,6 +133,10 @@ Latvian::
 
 http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/lv/LatvianStemmer.html[*`latvian`*]
 
+Lithuanian::
+
+http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_5_3/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/stem_ISO_8859_1.sbl?view=markup[*`lithuanian`*]
+
 Norwegian (Bokmål)::
 
 http://snowball.tartarus.org/algorithms/norwegian/stemmer.html[*`norwegian`*],