@@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -29,14 +30,20 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
+import org.elasticsearch.test.VersionUtils;
 import org.hamcrest.MatcherAssert;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
@@ -118,7 +125,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
         }
     }
 
-    public void testSynonymsWithMultiplexer() throws IOException {
+    public void testSynonymsWrappedByMultiplexer() throws IOException {
         Settings settings = Settings.builder()
             .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
             .put("path.home", createTempDir().toString())
@@ -139,6 +146,180 @@ public class SynonymsAnalysisTests extends ESTestCase {
             new int[]{ 1, 1, 0, 0, 1, 1 });
     }
 
+    public void testAsciiFoldingFilterForSynonyms() throws IOException {
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("path.home", createTempDir().toString())
+            .put("index.analysis.filter.synonyms.type", "synonym")
+            .putList("index.analysis.filter.synonyms.synonyms", "hoj, height")
+            .put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "asciifolding", "synonyms")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+        indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
+
+        BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "høj",
+            new String[]{ "hoj", "height" },
+            new int[]{ 1, 0 });
+    }
+
+    public void testKeywordRepeatAndSynonyms() throws IOException {
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("path.home", createTempDir().toString())
+            .put("index.analysis.filter.synonyms.type", "synonym")
+            .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
+            .put("index.analysis.filter.my_english.type", "stemmer")
+            .put("index.analysis.filter.my_english.language", "porter2")
+            .put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+        indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
+
+        BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
+            new String[]{ "programmers", "programm", "develop" },
+            new int[]{ 1, 0, 0 });
+    }
+
+    public void testChainedSynonymFilters() throws IOException {
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("path.home", createTempDir().toString())
+            .put("index.analysis.filter.synonyms1.type", "synonym")
+            .putList("index.analysis.filter.synonyms1.synonyms", "term1, term2")
+            .put("index.analysis.filter.synonyms2.type", "synonym")
+            .putList("index.analysis.filter.synonyms2.synonyms", "term1, term3")
+            .put("index.analysis.analyzer.syn.tokenizer", "standard")
+            .putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+        indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
+
+        BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("syn"), "term1",
+            new String[]{ "term1", "term3", "term2" }, new int[]{ 1, 0, 0 });
+    }
+
+    public void testShingleFilters() {
+
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED,
+                VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
+            .put("path.home", createTempDir().toString())
+            .put("index.analysis.filter.synonyms.type", "synonym")
+            .putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
+            .put("index.analysis.filter.my_shingle.type", "shingle")
+            .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.my_analyzer.filter", "my_shingle", "synonyms")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+
+        expectThrows(IllegalArgumentException.class, () -> {
+            indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
+        });
+
+    }
+
+    public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
+
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("path.home", createTempDir().toString())
+            .putList("word_list", "a")
+            .put("hyphenation_patterns_path", "foo")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+
+        String[] bypassingFactories = new String[]{
+            "dictionary_decompounder"
+        };
+
+        CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
+        for (String factory : bypassingFactories) {
+            TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
+            TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
+            SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
+            Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
+
+            try (TokenStream ts = analyzer.tokenStream("field", "text")) {
+                assertThat(ts, instanceOf(KeywordTokenizer.class));
+            }
+        }
+
+    }
+
+    public void testDisallowedTokenFilters() throws IOException {
+
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED,
+                VersionUtils.randomVersionBetween(random(), Version.V_7_0_0, Version.CURRENT))
+            .put("path.home", createTempDir().toString())
+            .putList("common_words", "a", "b")
+            .put("output_unigrams", "true")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+        CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
+
+        String[] disallowedFactories = new String[]{
+            "multiplexer", "cjk_bigram", "common_grams", "ngram", "edge_ngram",
+            "word_delimiter", "word_delimiter_graph", "fingerprint"
+        };
+
+        for (String factory : disallowedFactories) {
+            TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
+            TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
+            SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
+
+            IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+                "Expected IllegalArgumentException for factory " + factory,
+                () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null));
+
+            assertEquals(factory, "Token filter [" + factory
+                + "] cannot be used to parse synonyms",
+                e.getMessage());
+        }
+
+        settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED,
+                VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
+            .put("path.home", createTempDir().toString())
+            .putList("common_words", "a", "b")
+            .put("output_unigrams", "true")
+            .build();
+        idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+
+        List<String> expectedWarnings = new ArrayList<>();
+        for (String factory : disallowedFactories) {
+            TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
+            TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
+            SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
+
+            stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
+            expectedWarnings.add("Token filter [" + factory
+                + "] will not be usable to parse synonyms after v7.0");
+        }
+
+        assertWarnings(expectedWarnings.toArray(new String[0]));
+
+        settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED,
+                VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, VersionUtils.getPreviousVersion(Version.V_7_0_0)))
+            .put("path.home", createTempDir().toString())
+            .put("preserve_original", "false")
+            .build();
+        idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+        TokenFilterFactory tff = plugin.getTokenFilters().get("multiplexer").get(idxSettings, null, "multiplexer", settings);
+        TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
+        SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
+
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null));
+
+        assertEquals("Token filter [multiplexer] cannot be used to parse synonyms unless [preserve_original] is [true]",
+            e.getMessage());
+
+    }
+
     private void match(String analyzerName, String source, String target) throws IOException {
         Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();