
Parse synonyms with the same analysis chain (#8049)

* [Analysis] Parse synonyms with the same analysis chain

The Synonym Token Filter and Synonym Graph Filter now tokenize synonyms with whatever tokenizer and token filters appear before them in the chain.

Close #7199
Jun Ohtani, 8 years ago
parent commit 62d1969595

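What "the same analysis chain" means in practice: the synonym rules themselves are now run through the tokenizer and token filters that precede the synonym filter. A minimal standalone sketch of the idea (the class name is illustrative, and only Lucene's public synonym-parser API is used, not the new Elasticsearch plumbing):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.synonym.SolrSynonymParser;
    import org.apache.lucene.analysis.synonym.SynonymMap;

    public class ParseWithChainSketch {
        public static void main(String[] args) throws Exception {
            // Stand-in for the analyzer that CustomAnalyzerProvider assembles from
            // the tokenizer and the token filters preceding the synonym filter.
            Analyzer chain = new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new WhitespaceTokenizer();
                    TokenStream stream = new LowerCaseFilter(tokenizer); // a preceding filter
                    return new TokenStreamComponents(tokenizer, stream);
                }
            };
            // Rule terms go through the same chain, so "KIMCHY" lands in the map as "kimchy".
            SolrSynonymParser parser = new SolrSynonymParser(true, true, chain);
            parser.parse(new StringReader("KIMCHY => shay"));
            SynonymMap map = parser.build();
            System.out.println("synonyms parsed: " + (map.fst != null)); // true
        }
    }

Previously the rules were tokenized by a separate whitespace tokenizer, so an uppercase term in a rule could never match the lowercased tokens produced at index time.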
+ 41 - 28
core/src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java

@@ -49,6 +49,7 @@ import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.CustomAnalyzer;
+import org.elasticsearch.index.analysis.CustomAnalyzerProvider;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -183,13 +184,14 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
             Tuple<String, TokenizerFactory> tokenizerFactory = parseTokenizerFactory(request, indexAnalyzers,
                         analysisRegistry, environment);
 
-            TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
-            tokenFilterFactories = getTokenFilterFactories(request, indexSettings, analysisRegistry, environment, tokenFilterFactories);
+            List<CharFilterFactory> charFilterFactoryList = parseCharFilterFactories(request, indexSettings, analysisRegistry, environment);
 
-            CharFilterFactory[] charFilterFactories = new CharFilterFactory[0];
-            charFilterFactories = getCharFilterFactories(request, indexSettings, analysisRegistry, environment, charFilterFactories);
+            List<TokenFilterFactory> tokenFilterFactoryList = parseTokenFilterFactories(request, indexSettings, analysisRegistry,
+                environment, tokenizerFactory, charFilterFactoryList);
 
-            analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(), charFilterFactories, tokenFilterFactories);
+            analyzer = new CustomAnalyzer(tokenizerFactory.v1(), tokenizerFactory.v2(),
+                charFilterFactoryList.toArray(new CharFilterFactory[charFilterFactoryList.size()]),
+                tokenFilterFactoryList.toArray(new TokenFilterFactory[tokenFilterFactoryList.size()]));
             closeAnalyzer = true;
         } else if (analyzer == null) {
             if (indexAnalyzers == null) {
@@ -462,12 +464,13 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
         return extendedAttributes;
     }
 
-    private static CharFilterFactory[] getCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
-                                                              Environment environment, CharFilterFactory[] charFilterFactories) throws IOException {
+    private static List<CharFilterFactory> parseCharFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
+                                                              Environment environment) throws IOException {
+        List<CharFilterFactory> charFilterFactoryList = new ArrayList<>();
         if (request.charFilters() != null && request.charFilters().size() > 0) {
-            charFilterFactories = new CharFilterFactory[request.charFilters().size()];
-            for (int i = 0; i < request.charFilters().size(); i++) {
-                final AnalyzeRequest.NameOrDefinition charFilter = request.charFilters().get(i);
+            List<AnalyzeRequest.NameOrDefinition> charFilters = request.charFilters();
+            for (AnalyzeRequest.NameOrDefinition charFilter : charFilters) {
+                CharFilterFactory charFilterFactory;
                 // parse anonymous settings
                 if (charFilter.definition != null) {
                     Settings settings = getAnonymousSettings(charFilter.definition);
@@ -481,7 +484,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
                         throw new IllegalArgumentException("failed to find global char filter under [" + charFilterTypeName + "]");
                     }
                     // Need to set anonymous "name" of char_filter
-                    charFilterFactories[i] = charFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_charfilter_[" + i + "]", settings);
+                    charFilterFactory = charFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_charfilter", settings);
                 } else {
                     AnalysisModule.AnalysisProvider<CharFilterFactory> charFilterFactoryFactory;
                     if (indexSettings == null) {
@@ -489,31 +492,34 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
                         if (charFilterFactoryFactory == null) {
                             throw new IllegalArgumentException("failed to find global char filter under [" + charFilter.name + "]");
                         }
-                        charFilterFactories[i] = charFilterFactoryFactory.get(environment, charFilter.name);
+                        charFilterFactory = charFilterFactoryFactory.get(environment, charFilter.name);
                     } else {
                         charFilterFactoryFactory = analysisRegistry.getCharFilterProvider(charFilter.name, indexSettings);
                         if (charFilterFactoryFactory == null) {
                             throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
                         }
-                        charFilterFactories[i] = charFilterFactoryFactory.get(indexSettings, environment, charFilter.name,
+                        charFilterFactory = charFilterFactoryFactory.get(indexSettings, environment, charFilter.name,
                             AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
                                 AnalysisRegistry.INDEX_ANALYSIS_CHAR_FILTER + "." + charFilter.name));
                     }
                 }
-                if (charFilterFactories[i] == null) {
+                if (charFilterFactory == null) {
                     throw new IllegalArgumentException("failed to find char filter under [" + charFilter.name + "]");
                 }
+                charFilterFactoryList.add(charFilterFactory);
             }
         }
-        return charFilterFactories;
+        return charFilterFactoryList;
     }
 
-    private static TokenFilterFactory[] getTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
-                                                                Environment environment, TokenFilterFactory[] tokenFilterFactories) throws IOException {
+    private static List<TokenFilterFactory> parseTokenFilterFactories(AnalyzeRequest request, IndexSettings indexSettings, AnalysisRegistry analysisRegistry,
+                                                                Environment environment, Tuple<String, TokenizerFactory> tokenizerFactory,
+                                                                List<CharFilterFactory> charFilterFactoryList) throws IOException {
+        List<TokenFilterFactory> tokenFilterFactoryList = new ArrayList<>();
         if (request.tokenFilters() != null && request.tokenFilters().size() > 0) {
-            tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().size()];
-            for (int i = 0; i < request.tokenFilters().size(); i++) {
-                final AnalyzeRequest.NameOrDefinition tokenFilter = request.tokenFilters().get(i);
+            List<AnalyzeRequest.NameOrDefinition> tokenFilters = request.tokenFilters();
+            for (AnalyzeRequest.NameOrDefinition tokenFilter : tokenFilters) {
+                TokenFilterFactory tokenFilterFactory;
                 // parse anonymous settings
                 if (tokenFilter.definition != null) {
                     Settings settings = getAnonymousSettings(tokenFilter.definition);
@@ -527,7 +533,11 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
                         throw new IllegalArgumentException("failed to find global token filter under [" + filterTypeName + "]");
                     }
                     // Need to set anonymous "name" of tokenfilter
-                    tokenFilterFactories[i] = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter_[" + i + "]", settings);
+                    tokenFilterFactory = tokenFilterFactoryFactory.get(getNaIndexSettings(settings), environment, "_anonymous_tokenfilter", settings);
+                    tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
+                        charFilterFactoryList, environment);
                 } else {
                     AnalysisModule.AnalysisProvider<TokenFilterFactory> tokenFilterFactoryFactory;
                     if (indexSettings == null) {
@@ -535,23 +545,26 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
                         if (tokenFilterFactoryFactory == null) {
                             throw new IllegalArgumentException("failed to find global token filter under [" + tokenFilter.name + "]");
                         }
-                        tokenFilterFactories[i] = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
+                        tokenFilterFactory = tokenFilterFactoryFactory.get(environment, tokenFilter.name);
                     } else {
                         tokenFilterFactoryFactory = analysisRegistry.getTokenFilterProvider(tokenFilter.name, indexSettings);
-                       if (tokenFilterFactoryFactory == null) {
+                        if (tokenFilterFactoryFactory == null) {
                             throw new IllegalArgumentException("failed to find token filter under [" + tokenFilter.name + "]");
                         }
-                        tokenFilterFactories[i] = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name,
-                            AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
-                                AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name));
+                        Settings settings = AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
+                            AnalysisRegistry.INDEX_ANALYSIS_FILTER + "." + tokenFilter.name);
+                        tokenFilterFactory = tokenFilterFactoryFactory.get(indexSettings, environment, tokenFilter.name, settings);
+                        tokenFilterFactory = CustomAnalyzerProvider.checkAndApplySynonymFilter(tokenFilterFactory, tokenizerFactory.v1(), tokenizerFactory.v2(), tokenFilterFactoryList,
+                            charFilterFactoryList, environment);
                     }
                 }
-                if (tokenFilterFactories[i] == null) {
+                if (tokenFilterFactory == null) {
                     throw new IllegalArgumentException("failed to find or create token filter under [" + tokenFilter.name + "]");
                 }
+                tokenFilterFactoryList.add(tokenFilterFactory);
             }
         }
-        return tokenFilterFactories;
+        return tokenFilterFactoryList;
     }
 
     private static Tuple<String, TokenizerFactory> parseTokenizerFactory(AnalyzeRequest request, IndexAnalyzers indexAnalzyers,

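A detail that is easy to miss in the hunks above: parseTokenFilterFactories specializes a synonym factory against the factories collected so far, so filters that appear after the synonym filter never influence its synonym map. A toy sketch of that ordering contract, with hypothetical stand-in types rather than the real Elasticsearch interfaces:

    import java.util.ArrayList;
    import java.util.List;

    public class ChainOrderSketch {
        // Hypothetical one-method stand-in for TokenFilterFactory.
        interface Factory { String describe(); }

        // Stand-in for checkAndApplySynonymFilter: the synonym factory captures
        // a defensive copy of the filters that precede it, as the PR does.
        static Factory specializeSynonym(List<Factory> preceding) {
            final List<Factory> copy = new ArrayList<>(preceding);
            return () -> "synonym parsed with " + copy.size() + " preceding filter(s)";
        }

        public static void main(String[] args) {
            List<Factory> chain = new ArrayList<>();
            chain.add(() -> "lowercase");            // ordinary filter: appended as-is
            chain.add(specializeSynonym(chain));     // synonym: sees [lowercase] only
            chain.add(() -> "stop");                 // later filters never affect the synonym map
            chain.forEach(f -> System.out.println(f.describe()));
        }
    }

This matches the synonymAnalyzerWithStopAfterSynonym test added below, where a stop filter placed after the synonym filter leaves the synonyms intact.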
+ 2 - 2
core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

@@ -318,12 +318,12 @@ public final class AnalysisRegistry implements Closeable {
                 T factory = null;
                 if (typeName == null) {
                     if (currentSettings.get("tokenizer") != null) {
-                        factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
+                        factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment);
                     } else {
                         throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
                     }
                 } else if (typeName.equals("custom")) {
-                    factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
+                    factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings, environment);
                 }
                 if (factory != null) {
                     factories.put(name, factory);

+ 40 - 6
core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java

@@ -20,6 +20,7 @@
 package org.elasticsearch.index.analysis;
 
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.TextFieldMapper;
 
@@ -34,13 +35,15 @@ import java.util.Map;
 public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {
 
     private final Settings analyzerSettings;
+    private final Environment environment;
 
     private CustomAnalyzer customAnalyzer;
 
     public CustomAnalyzerProvider(IndexSettings indexSettings,
-                                  String name, Settings settings) {
+                                  String name, Settings settings, Environment environment) {
         super(indexSettings, name, settings);
         this.analyzerSettings = settings;
+        this.environment = environment;
     }
 
     public void build(final Map<String, TokenizerFactory> tokenizers, final Map<String, CharFilterFactory> charFilters,
@@ -65,6 +68,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
             charFiltersList.add(charFilter);
         }
 
+        int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
+
+        positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);
+
+        int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
+
         String[] tokenFilterNames = analyzerSettings.getAsArray("filter");
         List<TokenFilterFactory> tokenFilterList = new ArrayList<>(tokenFilterNames.length);
         for (String tokenFilterName : tokenFilterNames) {
@@ -72,14 +81,12 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
             if (tokenFilter == null) {
                 throw new IllegalArgumentException("Custom Analyzer [" + name() + "] failed to find filter under name [" + tokenFilterName + "]");
             }
+            // offsetGap is not needed when tokenizing synonyms
+            tokenFilter = checkAndApplySynonymFilter(tokenFilter, tokenizerName, tokenizer, tokenFilterList, charFiltersList,
+                this.environment);
             tokenFilterList.add(tokenFilter);
         }
 
-        int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
-
-        positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);
-
-        int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);;
         this.customAnalyzer = new CustomAnalyzer(tokenizerName, tokenizer,
                 charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
                 tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()]),
@@ -88,6 +95,33 @@ public class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Custom
         );
     }
 
+    public static TokenFilterFactory checkAndApplySynonymFilter(TokenFilterFactory tokenFilter, String tokenizerName, TokenizerFactory tokenizer,
+                                                                List<TokenFilterFactory> tokenFilterList,
+                                                                List<CharFilterFactory> charFiltersList, Environment env) {
+        if (tokenFilter instanceof SynonymGraphTokenFilterFactory) {
+            List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
+
+            try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
+                    charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
+                    tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
+                    TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
+                    -1)) {
+                tokenFilter = ((SynonymGraphTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymGraphFactory(analyzer, env);
+            }
+
+        } else if (tokenFilter instanceof SynonymTokenFilterFactory) {
+            List<TokenFilterFactory> tokenFiltersListForSynonym = new ArrayList<>(tokenFilterList);
+            try (CustomAnalyzer analyzer = new CustomAnalyzer(tokenizerName, tokenizer,
+                    charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
+                    tokenFiltersListForSynonym.toArray(new TokenFilterFactory[tokenFiltersListForSynonym.size()]),
+                    TextFieldMapper.Defaults.POSITION_INCREMENT_GAP,
+                    -1)) {
+                tokenFilter = ((SynonymTokenFilterFactory) tokenFilter).createPerAnalyzerSynonymFactory(analyzer, env);
+            }
+        }
+        return tokenFilter;
+    }
+
     @Override
     public CustomAnalyzer get() {
         return this.customAnalyzer;

+ 46 - 2
core/src/main/java/org/elasticsearch/index/analysis/SynonymGraphTokenFilterFactory.java

@@ -19,13 +19,19 @@
 
 package org.elasticsearch.index.analysis;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
 import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
+import org.elasticsearch.common.io.FastStringReader;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
 import java.io.IOException;
+import java.io.Reader;
 
 public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
     public SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
@@ -35,7 +41,45 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        // fst is null means no synonyms
-        return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase);
+        throw new IllegalStateException("Call createPerAnalyzerSynonymGraphFactory to specialize this factory for an analysis chain first");
+    }
+
+    Factory createPerAnalyzerSynonymGraphFactory(Analyzer analyzerForParseSynonym, Environment env) {
+        return new Factory("synonymgraph", analyzerForParseSynonym, getRulesFromSettings(env));
+    }
+
+    public class Factory implements TokenFilterFactory {
+
+        private final String name;
+        private final SynonymMap synonymMap;
+
+        public Factory(String name, final Analyzer analyzerForParseSynonym, Reader rulesReader) {
+            this.name = name;
+
+            try {
+                SynonymMap.Builder parser;
+                if ("wordnet".equalsIgnoreCase(format)) {
+                    parser = new WordnetSynonymParser(true, expand, analyzerForParseSynonym);
+                    ((WordnetSynonymParser) parser).parse(rulesReader);
+                } else {
+                    parser = new SolrSynonymParser(true, expand, analyzerForParseSynonym);
+                    ((SolrSynonymParser) parser).parse(rulesReader);
+                }
+                synonymMap = parser.build();
+            } catch (Exception e) {
+                throw new IllegalArgumentException("failed to build synonyms", e);
+            }
+        }
+
+        @Override
+        public String name() {
+            return this.name;
+        }
+
+        @Override
+        public TokenStream create(TokenStream tokenStream) {
+            // a null fst means there are no synonyms
+            return synonymMap.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonymMap, ignoreCase);
+        }
     }
 }

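The Factory constructor above dispatches on the `format` setting, with anything other than "wordnet" falling through to the Solr parser. A standalone equivalent of that dispatch, under the assumption of Lucene 7-era APIs and with illustrative names:

    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.synonym.SolrSynonymParser;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.analysis.synonym.WordnetSynonymParser;

    public class FormatDispatchSketch {
        static SynonymMap build(String format, Reader rules, Analyzer analyzer) throws Exception {
            SynonymMap.Builder parser;
            if ("wordnet".equalsIgnoreCase(format)) {
                parser = new WordnetSynonymParser(true, true, analyzer);
                ((WordnetSynonymParser) parser).parse(rules);
            } else { // the empty-string default selects the Solr format
                parser = new SolrSynonymParser(true, true, analyzer);
                ((SolrSynonymParser) parser).parse(rules);
            }
            return parser.build();
        }

        public static void main(String[] args) throws Exception {
            SynonymMap map = build("", new StringReader("kimchy => shay"), new WhitespaceAnalyzer());
            System.out.println(map.fst != null); // true: one rule was parsed
        }
    }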
+ 108 - 40
core/src/main/java/org/elasticsearch/index/analysis/SynonymTokenFilterFactory.java

@@ -23,35 +23,80 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.synonym.SolrSynonymParser;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.io.FastStringReader;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.nio.file.Files;
 import java.util.List;
 
 public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
 
-    protected final SynonymMap synonymMap;
+    /**
+     * @deprecated this property only works together with the tokenizer property
+     */
+    @Deprecated
     protected final boolean ignoreCase;
+    protected final String format;
+    protected final boolean expand;
+    protected final Settings settings;
 
     public SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, AnalysisRegistry analysisRegistry,
                                       String name, Settings settings) throws IOException {
         super(indexSettings, name, settings);
+        this.settings = settings;
 
-        Reader rulesReader = null;
+        this.ignoreCase =
+            settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
+        if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_6_0_0_alpha3) && settings.get("ignore_case") != null) {
+            deprecationLogger.deprecated(
+                "This tokenize synonyms with whatever tokenizer and token filters appear before it in the chain. " +
+                "If you need ignore case with this filter, you should set lowercase filter before this");
+        }
+
+        this.expand =
+            settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger);
+
+        // for backward compatibility
+        if (indexSettings.getIndexVersionCreated().before(Version.V_6_0_0_alpha3)) {
+            String tokenizerName = settings.get("tokenizer", "whitespace");
+            AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
+                analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
+            if (tokenizerFactoryFactory == null) {
+                throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
+            }
+            final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
+                AnalysisRegistry.getSettingsFromIndexSettings(indexSettings,
+                    AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
+            this.tokenizerFactory = tokenizerFactory;
+        } else {
+            this.tokenizerFactory = null;
+        }
+
+        this.format = settings.get("format", "");
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        throw new IllegalStateException("Call createPerAnalyzerSynonymFactory to specialize this factory for an analysis chain first");
+    }
+
+    protected Reader getRulesFromSettings(Environment env) {
+        Reader rulesReader;
         if (settings.getAsArray("synonyms", null) != null) {
-            List<String> rules = Analysis.getWordList(env, settings, "synonyms");
+            List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
             StringBuilder sb = new StringBuilder();
-            for (String line : rules) {
+            for (String line : rulesList) {
                 sb.append(line).append(System.lineSeparator());
             }
             rulesReader = new FastStringReader(sb.toString());
@@ -60,49 +105,72 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
         } else {
             throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
         }
+        return rulesReader;
+    }
 
-        this.ignoreCase =
-            settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
-        boolean expand =
-            settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "expand", true, deprecationLogger);
+    Factory createPerAnalyzerSynonymFactory(Analyzer analyzerForParseSynonym, Environment env) {
+        return new Factory("synonym", analyzerForParseSynonym, getRulesFromSettings(env));
+    }
 
-        String tokenizerName = settings.get("tokenizer", "whitespace");
-        AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
-            analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
-        if (tokenizerFactoryFactory == null) {
-            throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
-        }
-        final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
-            AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
-        Analyzer analyzer = new Analyzer() {
-            @Override
-            protected TokenStreamComponents createComponents(String fieldName) {
-                Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
-                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
-                return new TokenStreamComponents(tokenizer, stream);
-            }
-        };
+    // kept for backward compatibility
+    /**
+     * @deprecated As of 6.0 synonyms are tokenized with whatever tokenizer and token filters
+     * appear before the synonym filter in the chain; this factory only applies to indices created before 6.0.
+     */
+    @Deprecated
+    protected final TokenizerFactory tokenizerFactory;
+
+    public class Factory implements TokenFilterFactory {
+
+        private final String name;
+        private final SynonymMap synonymMap;
+
+        public Factory(String name, Analyzer analyzerForParseSynonym, Reader rulesReader) {
 
-        try {
-            SynonymMap.Builder parser = null;
+            this.name = name;
 
-            if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
-                parser = new WordnetSynonymParser(true, expand, analyzer);
-                ((WordnetSynonymParser) parser).parse(rulesReader);
+            Analyzer analyzer;
+            if (tokenizerFactory != null) {
+                analyzer = new Analyzer() {
+                    @Override
+                    protected TokenStreamComponents createComponents(String fieldName) {
+                        Tokenizer tokenizer = tokenizerFactory.create();
+                        TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
+                        return new TokenStreamComponents(tokenizer, stream);
+                    }
+                };
             } else {
-                parser = new SolrSynonymParser(true, expand, analyzer);
-                ((SolrSynonymParser) parser).parse(rulesReader);
+                analyzer = analyzerForParseSynonym;
             }
 
-            synonymMap = parser.build();
-        } catch (Exception e) {
-            throw new IllegalArgumentException("failed to build synonyms", e);
+            try {
+                SynonymMap.Builder parser;
+                if ("wordnet".equalsIgnoreCase(format)) {
+                    parser = new WordnetSynonymParser(true, expand, analyzer);
+                    ((WordnetSynonymParser) parser).parse(rulesReader);
+                } else {
+                    parser = new SolrSynonymParser(true, expand, analyzer);
+                    ((SolrSynonymParser) parser).parse(rulesReader);
+                }
+                synonymMap = parser.build();
+            } catch (Exception e) {
+                throw new IllegalArgumentException("failed to build synonyms", e);
+            } finally {
+                if (tokenizerFactory != null) {
+                    analyzer.close();
+                }
+            }
         }
-    }
 
-    @Override
-    public TokenStream create(TokenStream tokenStream) {
-        // fst is null means no synonyms
-        return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
+        @Override
+        public String name() {
+            return this.name;
+        }
+
+        @Override
+        public TokenStream create(TokenStream tokenStream) {
+            // a null fst means there are no synonyms
+            return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
+        }
     }
 }

+ 52 - 0
core/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTests.java

@@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.queryparser.classic.ParseException;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.logging.Loggers;
@@ -41,6 +42,8 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.startsWith;
 
 public class SynonymsAnalysisTests extends ESTestCase {
     protected final Logger logger = Loggers.getLogger(getClass());
@@ -69,8 +72,57 @@ public class SynonymsAnalysisTests extends ESTestCase {
         match("synonymAnalyzerWordnet", "abstain", "abstain refrain desist");
         match("synonymAnalyzerWordnet_file", "abstain", "abstain refrain desist");
         match("synonymAnalyzerWithsettings", "kimchy", "sha hay");
+        match("synonymAnalyzerWithStopAfterSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! ,");
+        match("synonymAnalyzerWithStopBeforeSynonym", "kimchy is the dude abides , stop", "shay is the elasticsearch man! ,");
+        match("synonymAnalyzerWithStopSynonymAfterSynonym", "kimchy is the dude abides", "shay is the man!");
+        match("synonymAnalyzerExpand", "kimchy is the dude abides", "kimchy shay is the dude elasticsearch abides man!");
+        match("synonymAnalyzerExpandWithStopAfterSynonym", "kimchy is the dude abides", "shay is the dude abides man!");
+
+    }
+
+    public void testSynonymWordDeleteByAnalyzer() throws IOException {
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("path.home", createTempDir().toString())
+            .put("index.analysis.filter.synonym.type", "synonym")
+            .putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay", "dude => elasticsearch", "abides => man!")
+            .put("index.analysis.filter.stop_within_synonym.type", "stop")
+            .putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch")
+            .put("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.tokenizer", "whitespace")
+            .putArray("index.analysis.analyzer.synonymAnalyzerWithStopSynonymBeforeSynonym.filter", "stop_within_synonym","synonym")
+            .put().build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+        try {
+            indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
+            fail("fail! due to synonym word deleted by analyzer");
+        } catch (Exception e) {
+            assertThat(e, instanceOf(IllegalArgumentException.class));
+            assertThat(e.getMessage(), startsWith("failed to build synonyms"));
+        }
     }
 
+    public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("path.home", createTempDir().toString())
+            .put("index.analysis.filter.synonym_expand.type", "synonym")
+            .putArray("index.analysis.filter.synonym_expand.synonyms", "kimchy, shay", "dude, elasticsearch", "abides, man!")
+            .put("index.analysis.filter.stop_within_synonym.type", "stop")
+            .putArray("index.analysis.filter.stop_within_synonym.stopwords", "kimchy", "elasticsearch")
+            .put("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.tokenizer", "whitespace")
+            .putArray("index.analysis.analyzer.synonymAnalyzerExpandWithStopBeforeSynonym.filter", "stop_within_synonym","synonym_expand")
+            .put().build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
+        try {
+            indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
+            fail("fail! due to synonym word deleted by analyzer");
+        } catch (Exception e) {
+            assertThat(e, instanceOf(IllegalArgumentException.class));
+            assertThat(e.getMessage(), startsWith("failed to build synonyms"));
+        }
+    }
+
     private void match(String analyzerName, String source, String target) throws IOException {
         Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();
 

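Why the two tests above expect "failed to build synonyms": when a stop filter that precedes the synonym filter deletes one of a rule's terms, that term analyzes to zero tokens and Lucene's parser rejects the rule. A minimal sketch of the failure mode (illustrative class name, assuming Lucene 7-era package locations):

    import java.io.StringReader;
    import java.util.Arrays;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.synonym.SolrSynonymParser;

    public class StopBeforeSynonymSketch {
        public static void main(String[] args) {
            // A chain whose stop filter swallows "kimchy" before the synonym parser sees it.
            Analyzer chain = new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new WhitespaceTokenizer();
                    TokenStream stream = new StopFilter(tokenizer,
                        new CharArraySet(Arrays.asList("kimchy"), true));
                    return new TokenStreamComponents(tokenizer, stream);
                }
            };
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, true, chain);
                parser.parse(new StringReader("kimchy => shay")); // LHS analyzes to zero tokens
                parser.build();
            } catch (Exception e) {
                // Expect a parse failure here, which Elasticsearch wraps as "failed to build synonyms".
                System.out.println(e);
            }
        }
    }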
+ 1 - 1
core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java

@@ -383,7 +383,7 @@ public class AnalyzeActionIT extends ESIntegTestCase {
         assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[2].getPositionLength(), equalTo(1));
 
         // tokenfilter({"type": "stop", "stopwords": ["foo", "buzz"]})
-        assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter_[1]"));
+        assertThat(analyzeResponse.detail().tokenfilters()[1].getName(), equalTo("_anonymous_tokenfilter"));
         assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens().length, equalTo(1));
 
         assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getTerm(), equalTo("test"));

+ 39 - 6
core/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms.json

@@ -3,11 +3,11 @@
         "analysis":{
             "analyzer":{
                 "synonymAnalyzer":{
-                    "tokenizer":"standard",
+                    "tokenizer":"whitespace",
                     "filter":[ "synonym" ]
                 },
                 "synonymAnalyzer_file":{
-                    "tokenizer":"standard",
+                    "tokenizer":"whitespace",
                     "filter":[ "synonym_file" ]
                 },
                 "synonymAnalyzerWordnet":{
@@ -21,6 +21,26 @@
                 "synonymAnalyzerWithsettings":{
                     "tokenizer":"trigram",
                     "filter":["synonymWithTokenizerSettings"]
+                },
+                "synonymAnalyzerWithStopBeforeSynonym": {
+                    "tokenizer":"whitespace",
+                    "filter":["stop","synonym"]
+                },
+                "synonymAnalyzerWithStopAfterSynonym":{
+                    "tokenizer":"whitespace",
+                    "filter":["synonym","stop"]
+                },
+                "synonymAnalyzerWithStopSynonymAfterSynonym":{
+                    "tokenizer":"whitespace",
+                    "filter":["synonym","stop_within_synonym"]
+                },
+                "synonymAnalyzerExpand":{
+                    "tokenizer": "whitespace",
+                    "filter":["synonym_expand"]
+                },
+                "synonymAnalyzerExpandWithStopAfterSynonym":{
+                    "tokenizer": "whitespace",
+                    "filter":["synonym_expand", "stop_within_synonym"]
                 }
             },
             "tokenizer":{
@@ -61,10 +81,23 @@
                     "type":"synonym",
                     "synonyms":[
                         "kimchy => shay"
-                    ],
-                    "tokenizer" : "trigram",
-                    "min_gram" : 3,
-                    "max_gram" : 3
+                    ]
+                },
+                "stop":{
+                    "type": "stop",
+                    "stopwords":["stop","synonym"]
+                },
+                "stop_within_synonym":{
+                    "type": "stop",
+                    "stopwords":["kimchy", "elasticsearch"]
+                },
+                "synonym_expand":{
+                    "type":"synonym",
+                    "synonyms":[
+                        "kimchy , shay",
+                        "dude , elasticsearch",
+                        "abides , man!"
+                    ]
                 }
             }
         }

+ 6 - 3
docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc

@@ -50,11 +50,14 @@ PUT /test_index
 The above configures a `search_synonyms` filter, with a path of
 `analysis/synonym.txt` (relative to the `config` location). The
 `search_synonyms` analyzer is then configured with the filter.
-Additional settings are: `ignore_case` (defaults to `false`), and
-`expand` (defaults to `true`).
+Additional settings include `expand` (defaults to `true`).
+
+[float]
+==== `tokenizer` and `ignore_case` are deprecated
 
 The `tokenizer` parameter controls the tokenizers that will be used to
-tokenize the synonym, and defaults to the `whitespace` tokenizer.
+tokenize the synonyms; it is provided only for backwards compatibility
+with indices created before 6.0. The `ignore_case` parameter only works
+together with the `tokenizer` parameter.
 
 Two synonym formats are supported: Solr, WordNet.
 

+ 9 - 3
docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc

@@ -34,11 +34,17 @@ PUT /test_index
 The above configures a `synonym` filter, with a path of
 `analysis/synonym.txt` (relative to the `config` location). The
 `synonym` analyzer is then configured with the filter. Additional
-settings are: `ignore_case` (defaults to `false`), and `expand`
-(defaults to `true`).
+settings include `expand` (defaults to `true`).
+
+This filter tokenizes synonyms with whatever tokenizer and token filters
+appear before it in the chain.
+
+[float]
+==== `tokenizer` and `ignore_case` are deprecated
 
 The `tokenizer` parameter controls the tokenizers that will be used to
-tokenize the synonym, and defaults to the `whitespace` tokenizer.
+tokenize the synonyms; it is provided only for backwards compatibility
+with indices created before 6.0. The `ignore_case` parameter only works
+together with the `tokenizer` parameter.
 
 Two synonym formats are supported: Solr, WordNet.
 

+ 11 - 0
docs/reference/migration/migrate_6_0/mappings.asciidoc

@@ -29,3 +29,14 @@ now disallowed for these indices' mappings.
 Previously Elasticsearch would silently ignore any dynamic templates that
 included a `match_mapping_type` type that was unrecognized. An exception is now
 thrown on an unrecognized type.
+
+==== Synonym Token Filter
+
+In 6.0, the Synonym Token Filter tokenizes synonyms with whatever
+tokenizer and token filters appear before it in the chain.
+
+The `tokenizer` and `ignore_case` parameters are deprecated.
+They are kept only for backwards compatibility with indices
+created before 6.0, and Elasticsearch ignores them for new indices.
+

+ 35 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_synonyms.yml

@@ -0,0 +1,35 @@
+"Synonym filter with char_filter":
+  # Tests analyze with synonym and char_filter. This is in the analysis-common module
+  # because there are no char filters in core.
+    - skip:
+        version: " - 5.99.99"
+        reason: parsing synonyms with the same analysis chain was added in 6.0.0
+    - do:
+        indices.create:
+          index: test_synonym_with_charfilter
+          body:
+            settings:
+              index:
+                analysis:
+                  analyzer:
+                    synonymAnalyzerWithCharfilter:
+                      tokenizer: whitespace
+                      char_filter: ["html_strip"]
+                      filter: ["synonym"]
+                  filter:
+                    synonym:
+                      type: synonym
+                      synonyms: ["<p>kimchy</p> => shay", "dude => <html>elasticsearch</html>", "<font>abides</font> => man!"]
+
+    - do:
+        indices.analyze:
+          index: test_synonym_with_charfilter
+          body:
+            analyzer: "synonymAnalyzerWithCharfilter"
+            text: "kimchy is the dude <html>abides</html>"
+    - length: { tokens: 5 }
+    - match:  { tokens.0.token: shay }
+    - match:  { tokens.1.token: is }
+    - match:  { tokens.2.token: the }
+    - match:  { tokens.3.token: elasticsearch }
+    - match:  { tokens.4.token: man! }

+ 34 - 1
rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml

@@ -73,5 +73,38 @@
     - match:  { detail.tokenizer.tokens.0.token: foo }
     - match:  { detail.tokenizer.tokens.1.token: bar }
     - match:  { detail.tokenizer.tokens.2.token: buzz }
-    - match:  { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
+    - match:  { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
     - match:  { detail.tokenfilters.0.tokens.0.token: bar }
+
+---
+"Synonym filter with tokenizer":
+    - skip:
+        version: " - 5.99.99"
+        reason: parsing synonyms with the same analysis chain was added in 6.0.0
+    - do:
+        indices.create:
+          index: test_synonym
+          body:
+            settings:
+              index:
+                analysis:
+                  tokenizer:
+                    trigram:
+                      type: nGram
+                      min_gram: 3
+                      max_gram: 3
+                  filter:
+                    synonym:
+                      type: synonym
+                      synonyms: ["kimchy => shay"]
+
+    - do:
+        indices.analyze:
+          index: test_synonym
+          body:
+            tokenizer: trigram
+            filter: [synonym]
+            text: kimchy
+    - length: { tokens: 2 }
+    - match:  { tokens.0.token: sha }
+    - match:  { tokens.1.token: hay }