@@ -19,49 +19,141 @@
package org.elasticsearch.indices.analysis;
-import org.apache.lucene.analysis.hunspell.Dictionary;
-import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
-import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
+import org.elasticsearch.index.analysis.ApostropheFilterFactory;
+import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
+import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
+import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
+import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
+import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
+import org.elasticsearch.index.analysis.CJKBigramFilterFactory;
+import org.elasticsearch.index.analysis.CJKWidthFilterFactory;
+import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
+import org.elasticsearch.index.analysis.CjkAnalyzerProvider;
+import org.elasticsearch.index.analysis.ClassicFilterFactory;
+import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
+import org.elasticsearch.index.analysis.CommonGramsTokenFilterFactory;
+import org.elasticsearch.index.analysis.CzechAnalyzerProvider;
+import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.DanishAnalyzerProvider;
+import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
+import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
+import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
+import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
+import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
+import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
+import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
+import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
+import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory;
+import org.elasticsearch.index.analysis.FinnishAnalyzerProvider;
+import org.elasticsearch.index.analysis.FrenchAnalyzerProvider;
+import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.GalicianAnalyzerProvider;
+import org.elasticsearch.index.analysis.GermanAnalyzerProvider;
+import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.GreekAnalyzerProvider;
+import org.elasticsearch.index.analysis.HindiAnalyzerProvider;
+import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
+import org.elasticsearch.index.analysis.HungarianAnalyzerProvider;
+import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
+import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
+import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
+import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
+import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
+import org.elasticsearch.index.analysis.KeepWordFilterFactory;
+import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
+import org.elasticsearch.index.analysis.KeywordMarkerTokenFilterFactory;
+import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
+import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
+import org.elasticsearch.index.analysis.LengthTokenFilterFactory;
+import org.elasticsearch.index.analysis.LetterTokenizerFactory;
+import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
+import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
+import org.elasticsearch.index.analysis.LowerCaseTokenFilterFactory;
+import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
+import org.elasticsearch.index.analysis.MappingCharFilterFactory;
+import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
+import org.elasticsearch.index.analysis.NGramTokenizerFactory;
+import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
+import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
+import org.elasticsearch.index.analysis.PatternAnalyzerProvider;
+import org.elasticsearch.index.analysis.PatternCaptureGroupTokenFilterFactory;
+import org.elasticsearch.index.analysis.PatternReplaceCharFilterFactory;
+import org.elasticsearch.index.analysis.PatternReplaceTokenFilterFactory;
+import org.elasticsearch.index.analysis.PatternTokenizerFactory;
+import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
+import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
+import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
+import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
+import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
+import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
+import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
+import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
+import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
+import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
+import org.elasticsearch.index.analysis.SnowballTokenFilterFactory;
+import org.elasticsearch.index.analysis.SoraniAnalyzerProvider;
+import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.SpanishAnalyzerProvider;
+import org.elasticsearch.index.analysis.StandardAnalyzerProvider;
+import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider;
+import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
+import org.elasticsearch.index.analysis.StandardTokenizerFactory;
+import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
+import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
+import org.elasticsearch.index.analysis.StopAnalyzerProvider;
+import org.elasticsearch.index.analysis.StopTokenFilterFactory;
+import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
+import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
+import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.index.analysis.TrimTokenFilterFactory;
+import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
+import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
+import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
+import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
+import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
+import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
+import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
+import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
+import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
+import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
+import org.elasticsearch.plugins.AnalysisPlugin;
import java.io.IOException;
-import java.util.Collections;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
+import java.util.function.Function;
+
+import static java.util.Objects.requireNonNull;
/**
- * The AnalysisModule is the main extension point for node and index level analysis components. The lucene classes
- * {@link org.apache.lucene.analysis.Analyzer}, {@link org.apache.lucene.analysis.TokenFilter}, {@link org.apache.lucene.analysis.Tokenizer}
- * and {@link org.apache.lucene.analysis.CharFilter} can be extended in plugins and registered on node startup when the analysis module
- * gets loaded. Since elasticsearch needs to create multiple instances for different configurations dedicated factories need to be provided for
- * each of the components:
- * <ul>
- * <li> {@link org.apache.lucene.analysis.Analyzer} can be exposed via {@link AnalyzerProvider} and registered on {@link #registerAnalyzer(String, AnalysisProvider)}</li>
- * <li> {@link org.apache.lucene.analysis.TokenFilter} can be exposed via {@link TokenFilterFactory} and registered on {@link #registerTokenFilter(String, AnalysisProvider)}</li>
- * <li> {@link org.apache.lucene.analysis.Tokenizer} can be exposed via {@link TokenizerFactory} and registered on {@link #registerTokenizer(String, AnalysisProvider)}</li>
- * <li> {@link org.apache.lucene.analysis.CharFilter} can be exposed via {@link CharFilterFactory} and registered on {@link #registerCharFilter(String, AnalysisProvider)}</li>
- * </ul>
- *
- * The {@link org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider} is only a functional interface that allows to register factory constructors directly like the plugin example below:
- * <pre>
- * public class MyAnalysisPlugin extends Plugin {
- *     public void onModule(AnalysisModule module) {
- *         module.registerAnalyzer("my-analyzer-name", MyAnalyzer::new);
- *     }
- * }
- * </pre>
+ * Sets up {@link AnalysisRegistry}.
 */
-public final class AnalysisModule extends AbstractModule {
-
+public final class AnalysisModule {
    static {
        Settings build = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1)
@@ -71,108 +163,195 @@ public final class AnalysisModule extends AbstractModule {
        NA_INDEX_SETTINGS = new IndexSettings(metaData, Settings.EMPTY);
    }
    private static final IndexSettings NA_INDEX_SETTINGS;
-    private final Environment environment;
-    private final Map<String, AnalysisProvider<CharFilterFactory>> charFilters = new HashMap<>();
-    private final Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters = new HashMap<>();
-    private final Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new HashMap<>();
-    private final Map<String, AnalysisProvider<AnalyzerProvider>> analyzers = new HashMap<>();
-    private final Map<String, org.apache.lucene.analysis.hunspell.Dictionary> knownDictionaries = new HashMap<>();
-    /**
-     * Creates a new AnalysisModule
-     */
-    public AnalysisModule(Environment environment) {
-        this.environment = environment;
+    private final HunspellService hunspellService;
+    private final AnalysisRegistry analysisRegistry;
+
+    public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) throws IOException {
+        NamedRegistry<AnalysisProvider<CharFilterFactory>> charFilters = setupCharFilters(plugins);
+        NamedRegistry<org.apache.lucene.analysis.hunspell.Dictionary> hunspellDictionaries = setupHunspellDictionaries(plugins);
+        hunspellService = new HunspellService(environment.settings(), environment, hunspellDictionaries.registry);
+        NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = setupTokenFilters(plugins, hunspellService);
+        NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
+        NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
+        analysisRegistry = new AnalysisRegistry(environment, charFilters.registry, tokenFilters.registry,
+            tokenizers.registry, analyzers.registry);
    }
-    /**
-     * Registers a new {@link AnalysisProvider} to create
-     * {@link CharFilterFactory} instance per node as well as per index.
-     */
-    public void registerCharFilter(String name, AnalysisProvider<CharFilterFactory> charFilter) {
-        if (charFilter == null) {
-            throw new IllegalArgumentException("char_filter provider must not be null");
-        }
-        if (charFilters.putIfAbsent(name, charFilter) != null) {
-            throw new IllegalArgumentException("char_filter provider for name " + name + " already registered");
-        }
+    HunspellService getHunspellService() {
+        return hunspellService;
    }
-    /**
-     * Registers a new {@link AnalysisProvider} to create
-     * {@link TokenFilterFactory} instance per node as well as per index.
-     */
-    public void registerTokenFilter(String name, AnalysisProvider<TokenFilterFactory> tokenFilter) {
-        if (tokenFilter == null) {
-            throw new IllegalArgumentException("token_filter provider must not be null");
-        }
-        if (tokenFilters.putIfAbsent(name, tokenFilter) != null) {
-            throw new IllegalArgumentException("token_filter provider for name " + name + " already registered");
-        }
+    public AnalysisRegistry getAnalysisRegistry() {
+        return analysisRegistry;
    }
-    /**
-     * Registers a new {@link AnalysisProvider} to create
-     * {@link TokenizerFactory} instance per node as well as per index.
-     */
-    public void registerTokenizer(String name, AnalysisProvider<TokenizerFactory> tokenizer) {
-        if (tokenizer == null) {
-            throw new IllegalArgumentException("tokenizer provider must not be null");
-        }
-        if (tokenizers.putIfAbsent(name, tokenizer) != null) {
-            throw new IllegalArgumentException("tokenizer provider for name " + name + " already registered");
-        }
+    private NamedRegistry<AnalysisProvider<CharFilterFactory>> setupCharFilters(List<AnalysisPlugin> plugins) {
+        NamedRegistry<AnalysisProvider<CharFilterFactory>> charFilters = new NamedRegistry<>("char_filter");
+        charFilters.register("html_strip", HtmlStripCharFilterFactory::new);
+        charFilters.register("pattern_replace", requiresAnalysisSettings(PatternReplaceCharFilterFactory::new));
+        charFilters.register("mapping", requiresAnalysisSettings(MappingCharFilterFactory::new));
+        charFilters.registerPlugins(plugins, AnalysisPlugin::getCharFilters);
+        return charFilters;
    }
-    /**
-     * Registers a new {@link AnalysisProvider} to create
-     * {@link AnalyzerProvider} instance per node as well as per index.
-     */
-    public void registerAnalyzer(String name, AnalysisProvider<AnalyzerProvider> analyzer) {
-        if (analyzer == null) {
-            throw new IllegalArgumentException("analyzer provider must not be null");
-        }
-        if (analyzers.putIfAbsent(name, analyzer) != null) {
-            throw new IllegalArgumentException("analyzer provider for name " + name + " already registered");
-        }
+    public NamedRegistry<org.apache.lucene.analysis.hunspell.Dictionary> setupHunspellDictionaries(List<AnalysisPlugin> plugins) {
+        NamedRegistry<org.apache.lucene.analysis.hunspell.Dictionary> hunspellDictionaries = new NamedRegistry<>("dictionary");
+        hunspellDictionaries.registerPlugins(plugins, AnalysisPlugin::getHunspellDictionaries);
+        return hunspellDictionaries;
    }
-    /**
-     * Registers a new hunspell {@link Dictionary} that can be referenced by the given name in
-     * hunspell analysis configuration.
-     */
-    public void registerHunspellDictionary(String name, Dictionary dictionary) {
-        if (knownDictionaries.putIfAbsent(name, dictionary) != null) {
-            throw new IllegalArgumentException("dictionary for [" + name + "] is already registered");
-        }
+    private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(List<AnalysisPlugin> plugins,
+            HunspellService hunspellService) {
+        NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
+        tokenFilters.register("stop", StopTokenFilterFactory::new);
+        tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
+        tokenFilters.register("asciifolding", ASCIIFoldingTokenFilterFactory::new);
+        tokenFilters.register("length", LengthTokenFilterFactory::new);
+        tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new);
+        tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new);
+        tokenFilters.register("porter_stem", PorterStemTokenFilterFactory::new);
+        tokenFilters.register("kstem", KStemTokenFilterFactory::new);
+        tokenFilters.register("standard", StandardTokenFilterFactory::new);
+        tokenFilters.register("nGram", NGramTokenFilterFactory::new);
+        tokenFilters.register("ngram", NGramTokenFilterFactory::new);
+        tokenFilters.register("edgeNGram", EdgeNGramTokenFilterFactory::new);
+        tokenFilters.register("edge_ngram", EdgeNGramTokenFilterFactory::new);
+        tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
+        tokenFilters.register("unique", UniqueTokenFilterFactory::new);
+        tokenFilters.register("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new));
+        tokenFilters.register("trim", TrimTokenFilterFactory::new);
+        tokenFilters.register("limit", LimitTokenCountFilterFactory::new);
+        tokenFilters.register("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
+        tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
+        tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
+        tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
+        tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
+        tokenFilters.register("elision", ElisionTokenFilterFactory::new);
+        tokenFilters.register("keep", requiresAnalysisSettings(KeepWordFilterFactory::new));
+        tokenFilters.register("keep_types", requiresAnalysisSettings(KeepTypesFilterFactory::new));
+        tokenFilters.register("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
+        tokenFilters.register("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new));
+        tokenFilters.register("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
+        tokenFilters.register("hyphenation_decompounder", requiresAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
+        tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
+        tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
+        tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
+        tokenFilters.register("dutch_stem", DutchStemTokenFilterFactory::new);
+        tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
+        tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
+        tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
+        tokenFilters.register("keyword_marker", requiresAnalysisSettings(KeywordMarkerTokenFilterFactory::new));
+        tokenFilters.register("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
+        tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
+        tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);
+        tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new);
+        tokenFilters.register("indic_normalization", IndicNormalizationFilterFactory::new);
+        tokenFilters.register("sorani_normalization", SoraniNormalizationFilterFactory::new);
+        tokenFilters.register("persian_normalization", PersianNormalizationFilterFactory::new);
+        tokenFilters.register("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
+        tokenFilters.register("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
+        tokenFilters.register("serbian_normalization", SerbianNormalizationFilterFactory::new);
+
+        tokenFilters.register("hunspell", requiresAnalysisSettings(
+            (indexSettings, env, name, settings) -> new HunspellTokenFilterFactory(indexSettings, name, settings, hunspellService)));
+        tokenFilters.register("cjk_bigram", CJKBigramFilterFactory::new);
+        tokenFilters.register("cjk_width", CJKWidthFilterFactory::new);
+
+        tokenFilters.register("apostrophe", ApostropheFilterFactory::new);
+        tokenFilters.register("classic", ClassicFilterFactory::new);
+        tokenFilters.register("decimal_digit", DecimalDigitFilterFactory::new);
+        tokenFilters.register("fingerprint", FingerprintTokenFilterFactory::new);
+        tokenFilters.registerPlugins(plugins, AnalysisPlugin::getTokenFilters);
+        return tokenFilters;
    }
-    @Override
-    protected void configure() {
-        try {
-            AnalysisRegistry registry = buildRegistry();
-            bind(HunspellService.class).toInstance(registry.getHunspellService());
-            bind(AnalysisRegistry.class).toInstance(registry);
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to load hunspell service", e);
-        }
+    private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
+        NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
+        tokenizers.register("standard", StandardTokenizerFactory::new);
+        tokenizers.register("uax_url_email", UAX29URLEmailTokenizerFactory::new);
+        tokenizers.register("path_hierarchy", PathHierarchyTokenizerFactory::new);
+        tokenizers.register("PathHierarchy", PathHierarchyTokenizerFactory::new);
+        tokenizers.register("keyword", KeywordTokenizerFactory::new);
+        tokenizers.register("letter", LetterTokenizerFactory::new);
+        tokenizers.register("lowercase", LowerCaseTokenizerFactory::new);
+        tokenizers.register("whitespace", WhitespaceTokenizerFactory::new);
+        tokenizers.register("nGram", NGramTokenizerFactory::new);
+        tokenizers.register("ngram", NGramTokenizerFactory::new);
+        tokenizers.register("edgeNGram", EdgeNGramTokenizerFactory::new);
+        tokenizers.register("edge_ngram", EdgeNGramTokenizerFactory::new);
+        tokenizers.register("pattern", PatternTokenizerFactory::new);
+        tokenizers.register("classic", ClassicTokenizerFactory::new);
+        tokenizers.register("thai", ThaiTokenizerFactory::new);
+        tokenizers.registerPlugins(plugins, AnalysisPlugin::getTokenizers);
+        return tokenizers;
    }
-    /**
-     * Builds an {@link AnalysisRegistry} from the current configuration.
-     */
-    public AnalysisRegistry buildRegistry() throws IOException {
-        return new AnalysisRegistry(new HunspellService(environment.settings(), environment, knownDictionaries), environment, charFilters, tokenFilters, tokenizers, analyzers);
+    private NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> setupAnalyzers(List<AnalysisPlugin> plugins) {
+        NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = new NamedRegistry<>("analyzer");
+        analyzers.register("default", StandardAnalyzerProvider::new);
+        analyzers.register("standard", StandardAnalyzerProvider::new);
+        analyzers.register("standard_html_strip", StandardHtmlStripAnalyzerProvider::new);
+        analyzers.register("simple", SimpleAnalyzerProvider::new);
+        analyzers.register("stop", StopAnalyzerProvider::new);
+        analyzers.register("whitespace", WhitespaceAnalyzerProvider::new);
+        analyzers.register("keyword", KeywordAnalyzerProvider::new);
+        analyzers.register("pattern", PatternAnalyzerProvider::new);
+        analyzers.register("snowball", SnowballAnalyzerProvider::new);
+        analyzers.register("arabic", ArabicAnalyzerProvider::new);
+        analyzers.register("armenian", ArmenianAnalyzerProvider::new);
+        analyzers.register("basque", BasqueAnalyzerProvider::new);
+        analyzers.register("brazilian", BrazilianAnalyzerProvider::new);
+        analyzers.register("bulgarian", BulgarianAnalyzerProvider::new);
+        analyzers.register("catalan", CatalanAnalyzerProvider::new);
+        analyzers.register("chinese", ChineseAnalyzerProvider::new);
+        analyzers.register("cjk", CjkAnalyzerProvider::new);
+        analyzers.register("czech", CzechAnalyzerProvider::new);
+        analyzers.register("danish", DanishAnalyzerProvider::new);
+        analyzers.register("dutch", DutchAnalyzerProvider::new);
+        analyzers.register("english", EnglishAnalyzerProvider::new);
+        analyzers.register("finnish", FinnishAnalyzerProvider::new);
+        analyzers.register("french", FrenchAnalyzerProvider::new);
+        analyzers.register("galician", GalicianAnalyzerProvider::new);
+        analyzers.register("german", GermanAnalyzerProvider::new);
+        analyzers.register("greek", GreekAnalyzerProvider::new);
+        analyzers.register("hindi", HindiAnalyzerProvider::new);
+        analyzers.register("hungarian", HungarianAnalyzerProvider::new);
+        analyzers.register("indonesian", IndonesianAnalyzerProvider::new);
+        analyzers.register("irish", IrishAnalyzerProvider::new);
+        analyzers.register("italian", ItalianAnalyzerProvider::new);
+        analyzers.register("latvian", LatvianAnalyzerProvider::new);
+        analyzers.register("lithuanian", LithuanianAnalyzerProvider::new);
+        analyzers.register("norwegian", NorwegianAnalyzerProvider::new);
+        analyzers.register("persian", PersianAnalyzerProvider::new);
+        analyzers.register("portuguese", PortugueseAnalyzerProvider::new);
+        analyzers.register("romanian", RomanianAnalyzerProvider::new);
+        analyzers.register("russian", RussianAnalyzerProvider::new);
+        analyzers.register("sorani", SoraniAnalyzerProvider::new);
+        analyzers.register("spanish", SpanishAnalyzerProvider::new);
+        analyzers.register("swedish", SwedishAnalyzerProvider::new);
+        analyzers.register("turkish", TurkishAnalyzerProvider::new);
+        analyzers.register("thai", ThaiAnalyzerProvider::new);
+        analyzers.register("fingerprint", FingerprintAnalyzerProvider::new);
+        analyzers.registerPlugins(plugins, AnalysisPlugin::getAnalyzers);
+        return analyzers;
+    }
+
+    private static <T> AnalysisModule.AnalysisProvider<T> requiresAnalysisSettings(AnalysisModule.AnalysisProvider<T> provider) {
+        return new AnalysisModule.AnalysisProvider<T>() {
+            @Override
+            public T get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
+                return provider.get(indexSettings, environment, name, settings);
+            }
+            @Override
+            public boolean requiresAnalysisSettings() {
+                return true;
+            }
+        };
    }
    /**
-     * AnalysisProvider is the basic factory interface for registering analysis components like:
-     * <ul>
-     * <li>{@link TokenizerFactory} - see {@link AnalysisModule#registerTokenizer(String, AnalysisProvider)}</li>
-     * <li>{@link CharFilterFactory} - see {@link AnalysisModule#registerCharFilter(String, AnalysisProvider)}</li>
-     * <li>{@link AnalyzerProvider} - see {@link AnalysisModule#registerAnalyzer(String, AnalysisProvider)}</li>
-     * <li>{@link TokenFilterFactory}- see {@link AnalysisModule#registerTokenFilter(String, AnalysisProvider)} )}</li>
-     * </ul>
+     * The basic factory interface for analysis components.
     */
    public interface AnalysisProvider<T> {
@@ -195,7 +374,8 @@ public final class AnalysisModule extends AbstractModule {
         * @param name the name of the analysis component
         * @return a new provider instance
         * @throws IOException if an {@link IOException} occurs
-         * @throws IllegalArgumentException if the provider requires analysis settings ie. if {@link #requiresAnalysisSettings()} returns <code>true</code>
+         * @throws IllegalArgumentException if the provider requires analysis settings, i.e. if {@link #requiresAnalysisSettings()} returns
+         *         <code>true</code>
         */
        default T get(Environment environment, String name) throws IOException {
            if (requiresAnalysisSettings()) {
@@ -212,4 +392,29 @@ public final class AnalysisModule extends AbstractModule {
            return false;
        }
    }
+
+    private static class NamedRegistry<T> {
+        private final Map<String, T> registry = new HashMap<>();
+        private final String targetName;
+
+        public NamedRegistry(String targetName) {
+            this.targetName = targetName;
+        }
+
+        private void register(String name, T t) {
+            requireNonNull(name, "name is required");
+            requireNonNull(t, targetName + " is required");
+            if (registry.putIfAbsent(name, t) != null) {
+                throw new IllegalArgumentException(targetName + " for name " + name + " already registered");
+            }
+        }
+
+        private <P> void registerPlugins(List<P> plugins, Function<P, Map<String, T>> lookup) {
+            for (P plugin : plugins) {
+                for (Map.Entry<String, T> entry : lookup.apply(plugin).entrySet()) {
+                    register(entry.getKey(), entry.getValue());
+                }
+            }
+        }
+    }
}