Преглед изворног кода

Plugins can register pre-configured char filters (#25000)

Fixes the plumbing so plugins can register char filters and moves
the `html_strip` char filter into analysis-common.

Relates to #23658
Nik Everett пре 8 година
родитељ
комит
73307a2144

+ 7 - 16
core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

@@ -35,8 +35,6 @@ import org.elasticsearch.index.mapper.TextFieldMapper;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
-import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
-import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
 
 import java.io.Closeable;
 import java.io.IOException;
@@ -74,6 +72,7 @@ public final class AnalysisRegistry implements Closeable {
                             Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
                             Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
                             Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
+                            Map<String, PreConfiguredCharFilter> preConfiguredCharFilters,
                             Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
                             Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
         this.environment = environment;
@@ -82,7 +81,7 @@ public final class AnalysisRegistry implements Closeable {
         this.tokenizers = unmodifiableMap(tokenizers);
         this.analyzers = unmodifiableMap(analyzers);
         this.normalizers = unmodifiableMap(normalizers);
-        prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters, preConfiguredTokenizers);
+        prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers);
     }
 
     /**
@@ -180,7 +179,7 @@ public final class AnalysisRegistry implements Closeable {
 
     public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
         final Map<String, Settings> charFiltersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_CHAR_FILTER);
-        return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories);
+        return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.preConfiguredCharFilterFactories);
     }
 
     public Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
@@ -397,13 +396,13 @@ public final class AnalysisRegistry implements Closeable {
         final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
         final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> preConfiguredTokenFilters;
         final Map<String, ? extends AnalysisProvider<TokenizerFactory>> preConfiguredTokenizers;
-        final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;
+        final Map<String, ? extends AnalysisProvider<CharFilterFactory>> preConfiguredCharFilterFactories;
 
         private PrebuiltAnalysis(
+                Map<String, PreConfiguredCharFilter> preConfiguredCharFilters,
                 Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
                 Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
             Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
-            Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();
 
             // Analyzers
             for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
@@ -411,22 +410,14 @@ public final class AnalysisRegistry implements Closeable {
                 analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
             }
 
-            // Char Filters
-            for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
-                String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
-                charFilterFactories.put(name, new PreBuiltCharFilterFactoryFactory(preBuiltCharFilter.getCharFilterFactory(Version.CURRENT)));
-            }
-            // Char filter aliases
-            charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT)));
-
             this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
-            this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
+            this.preConfiguredCharFilterFactories = preConfiguredCharFilters;
             this.preConfiguredTokenFilters = preConfiguredTokenFilters;
             this.preConfiguredTokenizers = preConfiguredTokenizers;
         }
 
         public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
-            return charFilterFactories.get(name);
+            return preConfiguredCharFilterFactories.get(name);
         }
 
         public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {

+ 0 - 51
core/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java

@@ -1,51 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.index.analysis;
-
-import org.elasticsearch.Version;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.env.Environment;
-import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.indices.analysis.AnalysisModule;
-import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
-
-import java.io.IOException;
-
-public class PreBuiltCharFilterFactoryFactory implements AnalysisModule.AnalysisProvider<CharFilterFactory> {
-
-    private final CharFilterFactory charFilterFactory;
-
-    public PreBuiltCharFilterFactoryFactory(CharFilterFactory charFilterFactory) {
-        this.charFilterFactory = charFilterFactory;
-    }
-
-    @Override
-    public CharFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
-        Version indexVersion = Version.indexCreated(settings);
-        if (!Version.CURRENT.equals(indexVersion)) {
-            PreBuiltCharFilters preBuiltCharFilters = PreBuiltCharFilters.getOrDefault(name, null);
-            if (preBuiltCharFilters != null) {
-                return preBuiltCharFilters.getCharFilterFactory(indexVersion);
-            }
-        }
-
-        return charFilterFactory;
-    }
-}

+ 112 - 0
core/src/main/java/org/elasticsearch/index/analysis/PreConfiguredCharFilter.java

@@ -0,0 +1,112 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.elasticsearch.Version;
+import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
+
+import java.io.Reader;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+
+/**
+ * Provides pre-configured, shared {@link CharFilter}s.
+ */
+public class PreConfiguredCharFilter extends PreConfiguredAnalysisComponent<CharFilterFactory> {
+    /**
+     * Create a pre-configured char filter that may not vary at all.
+     */
+    public static PreConfiguredCharFilter singleton(String name, boolean useFilterForMultitermQueries, Function<Reader, Reader> create) {
+        return new PreConfiguredCharFilter(name, CachingStrategy.ONE, useFilterForMultitermQueries,
+                (reader, version) -> create.apply(reader));
+    }
+
+    /**
+     * Create a pre-configured char filter that may vary based on the Lucene version.
+     */
+    public static PreConfiguredCharFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<Reader, org.apache.lucene.util.Version, Reader> create) {
+        return new PreConfiguredCharFilter(name, CachingStrategy.LUCENE, useFilterForMultitermQueries,
+                (reader, version) -> create.apply(reader, version.luceneVersion));
+    }
+
+    /**
+     * Create a pre-configured char filter that may vary based on the Elasticsearch version.
+     */
+    public static PreConfiguredCharFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
+            BiFunction<Reader, org.elasticsearch.Version, Reader> create) {
+        return new PreConfiguredCharFilter(name, CachingStrategy.ELASTICSEARCH, useFilterForMultitermQueries, create);
+    }
+
+    private final boolean useFilterForMultitermQueries;
+    private final BiFunction<Reader, Version, Reader> create;
+
+    protected PreConfiguredCharFilter(String name, CachingStrategy cache, boolean useFilterForMultitermQueries,
+            BiFunction<Reader, org.elasticsearch.Version, Reader> create) {
+        super(name, cache);
+        this.useFilterForMultitermQueries = useFilterForMultitermQueries;
+        this.create = create;
+    }
+
+    /**
+     * Can this {@link CharFilter} be used in multi-term queries?
+     */
+    public boolean shouldUseFilterForMultitermQueries() {
+        return useFilterForMultitermQueries;
+    }
+
+    private interface MultiTermAwareCharFilterFactory extends CharFilterFactory, MultiTermAwareComponent {}
+
+    @Override
+    protected CharFilterFactory create(Version version) {
+        if (useFilterForMultitermQueries) {
+            return new MultiTermAwareCharFilterFactory() {
+                @Override
+                public String name() {
+                    return getName();
+                }
+
+                @Override
+                public Reader create(Reader reader) {
+                    return create.apply(reader, version);
+                }
+
+                @Override
+                public Object getMultiTermComponent() {
+                    return this;
+                }
+            };
+        }
+        return new CharFilterFactory() {
+            @Override
+            public Reader create(Reader reader) {
+                return create.apply(reader, version);
+            }
+
+            @Override
+            public String name() {
+                return getName();
+            }
+        };
+    }
+
+}

+ 19 - 2
core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java

@@ -101,6 +101,7 @@ import org.elasticsearch.index.analysis.PatternTokenizerFactory;
 import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
 import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
+import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
@@ -173,11 +174,14 @@ public final class AnalysisModule {
         NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
         NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers(plugins);
 
+        Map<String, PreConfiguredCharFilter> preConfiguredCharFilters = setupPreConfiguredCharFilters(plugins);
         Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
         Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins);
 
-        analysisRegistry = new AnalysisRegistry(environment, charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers
-            .getRegistry(), analyzers.getRegistry(), normalizers.getRegistry(), preConfiguredTokenFilters, preConfiguredTokenizers);
+        analysisRegistry = new AnalysisRegistry(environment,
+                charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers.getRegistry(),
+                analyzers.getRegistry(), normalizers.getRegistry(),
+                preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers);
     }
 
     HunspellService getHunspellService() {
@@ -261,6 +265,19 @@ public final class AnalysisModule {
         return tokenFilters;
     }
 
+    static Map<String, PreConfiguredCharFilter> setupPreConfiguredCharFilters(List<AnalysisPlugin> plugins) {
+        NamedRegistry<PreConfiguredCharFilter> preConfiguredCharFilters = new NamedRegistry<>("pre-configured char_filter");
+
+        // No char filters are available in lucene-core, so none are built into Elasticsearch core
+
+        for (AnalysisPlugin plugin: plugins) {
+            for (PreConfiguredCharFilter filter : plugin.getPreConfiguredCharFilters()) {
+                preConfiguredCharFilters.register(filter.getName(), filter);
+            }
+        }
+        return unmodifiableMap(preConfiguredCharFilters.getRegistry());
+    }
+
     static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
         NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");
 

+ 0 - 80
core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java

@@ -1,80 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.elasticsearch.indices.analysis;
-
-import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
-import org.elasticsearch.Version;
-import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
-
-import java.io.Reader;
-import java.util.Locale;
-
-public enum PreBuiltCharFilters {
-
-    HTML_STRIP(CachingStrategy.ONE) {
-        @Override
-        public Reader create(Reader tokenStream, Version version) {
-            return new HTMLStripCharFilter(tokenStream);
-        }
-    };
-
-    public abstract  Reader create(Reader tokenStream, Version version);
-
-    protected final PreBuiltCacheFactory.PreBuiltCache<CharFilterFactory> cache;
-
-    PreBuiltCharFilters(CachingStrategy cachingStrategy) {
-        cache = PreBuiltCacheFactory.getCache(cachingStrategy);
-    }
-
-    public synchronized CharFilterFactory getCharFilterFactory(final Version version) {
-        CharFilterFactory charFilterFactory = cache.get(version);
-        if (charFilterFactory == null) {
-            final String finalName = name();
-
-            charFilterFactory = new CharFilterFactory() {
-                @Override
-                public String name() {
-                    return finalName.toLowerCase(Locale.ROOT);
-                }
-
-                @Override
-                public Reader create(Reader tokenStream) {
-                    return valueOf(finalName).create(tokenStream, version);
-                }
-            };
-            cache.put(version, charFilterFactory);
-        }
-
-        return charFilterFactory;
-    }
-
-    /**
-     * Get a pre built CharFilter by its name or fallback to the default one
-     * @param name CharFilter name
-     * @param defaultCharFilter default CharFilter if name not found
-     */
-    public static PreBuiltCharFilters getOrDefault(String name, PreBuiltCharFilters defaultCharFilter) {
-        try {
-            return valueOf(name.toUpperCase(Locale.ROOT));
-        } catch (IllegalArgumentException e) {
-            return defaultCharFilter;
-        }
-    }
-}

+ 9 - 1
core/src/main/java/org/elasticsearch/plugins/AnalysisPlugin.java

@@ -28,8 +28,9 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
+import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
+import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
@@ -91,6 +92,13 @@ public interface AnalysisPlugin {
         return emptyMap();
     }
 
+    /**
+     * Override to add additional pre-configured {@link CharFilter}s.
+     */
+    default List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
+        return emptyList();
+    }
+
     /**
      * Override to add additional pre-configured {@link TokenFilter}s.
      */

+ 55 - 14
core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java

@@ -29,18 +29,24 @@ import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractCharFilterFactory;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
+import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.mapper.AllFieldMapper;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.indices.analysis.AnalysisModuleTests.AppendCharFilter;
 import org.elasticsearch.plugins.AnalysisPlugin;
+import static org.elasticsearch.plugins.AnalysisPlugin.requriesAnalysisSettings;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.util.List;
 import java.util.Map;
 
@@ -81,10 +87,31 @@ public class TransportAnalyzeActionTests extends ESTestCase {
                 }
             }
 
+            class AppendCharFilterFactory extends AbstractCharFilterFactory {
+                AppendCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+                    super(indexSettings, name);
+                }
+
+                @Override
+                public Reader create(Reader reader) {
+                    return new AppendCharFilter(reader, "bar");
+                }
+            }
+
+            @Override
+            public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
+                return singletonMap("append", AppendCharFilterFactory::new);
+            }
+
             @Override
             public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
                 return singletonMap("mock", MockFactory::new);
             }
+
+            @Override
+            public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
+                return singletonList(PreConfiguredCharFilter.singleton("append_foo", false, reader -> new AppendCharFilter(reader, "foo")));
+            }
         };
         registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
         indexAnalyzers = registry.build(idxSettings);
@@ -96,17 +123,17 @@ public class TransportAnalyzeActionTests extends ESTestCase {
     public void testNoIndexAnalyzers() throws IOException {
         // Refer to an analyzer by its type so we get its default configuration
         AnalyzeRequest request = new AnalyzeRequest();
-        request.analyzer("standard");
         request.text("the quick brown fox");
+        request.analyzer("standard");
         AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, null, registry, environment);
         List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
         assertEquals(4, tokens.size());
 
         // Refer to a token filter by its type so we get its default configuration
-        request.analyzer(null);
-        request.tokenizer("whitespace");
-        request.addTokenFilter("mock");
+        request = new AnalyzeRequest();
         request.text("the qu1ck brown fox");
+        request.tokenizer("standard");
+        request.addTokenFilter("mock");
         analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? indexAnalyzers : null, registry, environment);
         tokens = analyze.getTokens();
         assertEquals(3, tokens.size());
@@ -114,18 +141,32 @@ public class TransportAnalyzeActionTests extends ESTestCase {
         assertEquals("brown", tokens.get(1).getTerm());
         assertEquals("fox", tokens.get(2).getTerm());
 
-        // Refer to a char filter by its type so we get its default configuration
-        request.analyzer(null);
-        request.tokenizer("whitespace");
-        request.addCharFilter("html_strip");
-        request.addTokenFilter("mock");
-        request.text("<p>the qu1ck brown fox</p>");
+        // We can refer to a pre-configured char filter by its name to get it
+        request = new AnalyzeRequest();
+        request.text("the qu1ck brown fox");
+        request.tokenizer("standard");
+        request.addCharFilter("append_foo");
         analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? indexAnalyzers : null, registry, environment);
         tokens = analyze.getTokens();
-        assertEquals(3, tokens.size());
-        assertEquals("qu1ck", tokens.get(0).getTerm());
-        assertEquals("brown", tokens.get(1).getTerm());
-        assertEquals("fox", tokens.get(2).getTerm());
+        assertEquals(4, tokens.size());
+        assertEquals("the", tokens.get(0).getTerm());
+        assertEquals("qu1ck", tokens.get(1).getTerm());
+        assertEquals("brown", tokens.get(2).getTerm());
+        assertEquals("foxfoo", tokens.get(3).getTerm());
+
+        // We can refer to a char filter by its type to get its default configuration
+        request = new AnalyzeRequest();
+        request.text("the qu1ck brown fox");
+        request.tokenizer("standard");
+        request.addCharFilter("append");
+        request.text("the qu1ck brown fox");
+        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, randomBoolean() ? indexAnalyzers : null, registry, environment);
+        tokens = analyze.getTokens();
+        assertEquals(4, tokens.size());
+        assertEquals("the", tokens.get(0).getTerm());
+        assertEquals("qu1ck", tokens.get(1).getTerm());
+        assertEquals("brown", tokens.get(2).getTerm());
+        assertEquals("foxbar", tokens.get(3).getTerm());
     }
 
     public void testFillsAttributes() throws IOException {

+ 1 - 1
core/src/test/java/org/elasticsearch/index/IndexModuleTests.java

@@ -120,7 +120,7 @@ public class IndexModuleTests extends ESTestCase {
         index = indexSettings.getIndex();
         environment = new Environment(settings);
         emptyAnalysisRegistry = new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(),
-                emptyMap(), emptyMap());
+                emptyMap(), emptyMap(), emptyMap());
         threadPool = new TestThreadPool("test");
         circuitBreakerService = new NoneCircuitBreakerService();
         bigArrays = new BigArrays(settings, circuitBreakerService);

+ 1 - 1
core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java

@@ -57,7 +57,7 @@ public class AnalysisRegistryTests extends ESTestCase {
 
     private static AnalysisRegistry emptyAnalysisRegistry(Settings settings) {
         return new AnalysisRegistry(new Environment(settings), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap(),
-                emptyMap());
+                emptyMap(), emptyMap());
     }
 
     private static IndexSettings indexSettingsOfCurrentVersion(Settings.Builder settings) {

+ 9 - 3
core/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java

@@ -32,6 +32,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.List;
 import java.util.Map;
+import java.util.function.Function;
 
 import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
@@ -101,12 +102,12 @@ public class CustomNormalizerTests extends ESTokenStreamTestCase {
 
     public void testIllegalCharFilters() throws IOException {
         Settings settings = Settings.builder()
-                .putArray("index.analysis.normalizer.my_normalizer.char_filter", "html_strip")
+                .putArray("index.analysis.normalizer.my_normalizer.char_filter", "mock_forbidden")
                 .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                 .build();
         IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
-                () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
-        assertEquals("Custom normalizer [my_normalizer] may not use char filter [html_strip]", e.getMessage());
+                () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN));
+        assertEquals("Custom normalizer [my_normalizer] may not use char filter [mock_forbidden]", e.getMessage());
     }
 
     private static class MockAnalysisPlugin implements AnalysisPlugin {
@@ -115,6 +116,11 @@ public class CustomNormalizerTests extends ESTokenStreamTestCase {
             return singletonList(PreConfiguredTokenFilter.singleton("mock_forbidden", false, MockLowerCaseFilter::new));
         }
 
+        @Override
+        public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
+            return singletonList(PreConfiguredCharFilter.singleton("mock_forbidden", false, Function.identity()));
+        }
+
         @Override
         public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
             return singletonMap("mock_char_filter", (indexSettings, env, name, settings) -> {

+ 85 - 0
core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java

@@ -20,6 +20,7 @@
 package org.elasticsearch.indices.analysis;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -40,6 +41,7 @@ import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.CustomAnalyzer;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
@@ -56,6 +58,7 @@ import org.hamcrest.MatcherAssert;
 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -250,6 +253,50 @@ public class AnalysisModuleTests extends ESTestCase {
         }
     }
 
+    /**
+     * Tests that plugins can register pre-configured char filters that vary in behavior based on Elasticsearch version, Lucene version,
+     * and that do not vary based on version at all.
+     */
+    public void testPluginPreConfiguredCharFilters() throws IOException {
+        boolean noVersionSupportsMultiTerm = randomBoolean();
+        boolean luceneVersionSupportsMultiTerm = randomBoolean();
+        boolean elasticsearchVersionSupportsMultiTerm = randomBoolean();
+        AnalysisRegistry registry = new AnalysisModule(new Environment(emptyNodeSettings), singletonList(new AnalysisPlugin() {
+            @Override
+            public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
+                return Arrays.asList(
+                        PreConfiguredCharFilter.singleton("no_version", noVersionSupportsMultiTerm,
+                                tokenStream -> new AppendCharFilter(tokenStream, "no_version")),
+                        PreConfiguredCharFilter.luceneVersion("lucene_version", luceneVersionSupportsMultiTerm,
+                                (tokenStream, luceneVersion) -> new AppendCharFilter(tokenStream, luceneVersion.toString())),
+                        PreConfiguredCharFilter.elasticsearchVersion("elasticsearch_version", elasticsearchVersionSupportsMultiTerm,
+                                (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString()))
+                        );
+            }
+        })).getAnalysisRegistry();
+
+        Version version = VersionUtils.randomVersion(random());
+        IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
+                .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.no_version.char_filter", "no_version")
+                .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.lucene_version.char_filter", "lucene_version")
+                .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
+                .put("index.analysis.analyzer.elasticsearch_version.char_filter", "elasticsearch_version")
+                .put(IndexMetaData.SETTING_VERSION_CREATED, version)
+                .build());
+        assertTokenStreamContents(analyzers.get("no_version").tokenStream("", "test"), new String[] {"testno_version"});
+        assertTokenStreamContents(analyzers.get("lucene_version").tokenStream("", "test"), new String[] {"test" + version.luceneVersion});
+        assertTokenStreamContents(analyzers.get("elasticsearch_version").tokenStream("", "test"), new String[] {"test" + version});
+
+        assertEquals("test" + (noVersionSupportsMultiTerm ? "no_version" : ""),
+                analyzers.get("no_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (luceneVersionSupportsMultiTerm ? version.luceneVersion.toString() : ""),
+                analyzers.get("lucene_version").normalize("", "test").utf8ToString());
+        assertEquals("test" + (elasticsearchVersionSupportsMultiTerm ? version.toString() : ""),
+                analyzers.get("elasticsearch_version").normalize("", "test").utf8ToString());
+    }
+
     /**
      * Tests that plugins can register pre-configured token filters that vary in behavior based on Elasticsearch version, Lucene version,
      * and that do not vary based on version at all.
@@ -391,6 +438,44 @@ public class AnalysisModuleTests extends ESTestCase {
         assertSame(dictionary, module.getHunspellService().getDictionary("foo"));
     }
 
+    // Simple char filter that appends text to the term. The tests above use the
+    // appended suffix to observe which pre-configured variant (no_version /
+    // lucene_version / elasticsearch_version) was selected.
+    public static class AppendCharFilter extends CharFilter {
+        private final char[] appendMe;
+        // Index of the next char of appendMe to emit; -1 while the wrapped
+        // reader still has data to deliver.
+        private int offsetInAppendMe = -1;
+
+        public AppendCharFilter(Reader input, String appendMe) {
+            super(input);
+            this.appendMe = appendMe.toCharArray();
+        }
+
+        @Override
+        protected int correct(int currentOff) {
+            // No offset shifting needed: chars are only appended at the end.
+            return currentOff;
+        }
+
+        @Override
+        public int read(char[] cbuf, int off, int len) throws IOException {
+            if (offsetInAppendMe < 0) {
+                int read = input.read(cbuf, off, len);
+                if (read == len) {
+                    return read;
+                }
+                // Wrapped reader is exhausted (or returned short; treated as
+                // exhausted here — fine for StringReader-backed tests): start
+                // appending. Reader#read returns -1 at EOF, which must not be
+                // mixed into the offset math as "negative one chars read".
+                if (read < 0) {
+                    read = 0;
+                }
+                off += read;
+                len -= read;
+                int allowedLen = Math.min(len, appendMe.length);
+                System.arraycopy(appendMe, 0, cbuf, off, allowedLen);
+                offsetInAppendMe = allowedLen;
+                return read + allowedLen == 0 ? -1 : read + allowedLen;
+            }
+            if (offsetInAppendMe >= appendMe.length) {
+                return -1;
+            }
+            // min, not max: max could read past the end of appendMe or write
+            // past the end of cbuf. Advance the offset so later reads progress.
+            int allowedLen = Math.min(len, appendMe.length - offsetInAppendMe);
+            System.arraycopy(appendMe, offsetInAppendMe, cbuf, off, allowedLen);
+            offsetInAppendMe += allowedLen;
+            return allowedLen;
+        }
+    }
+
     // Simple token filter that appends text to the term
     private static class AppendTokenFilter extends TokenFilter {
         public static TokenFilterFactory factoryForSuffix(String suffix) {

+ 4 - 9
core/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java

@@ -257,23 +257,18 @@ public class AnalyzeActionIT extends ESIntegTestCase {
         assertThat(analyzeResponse.detail().analyzer().getTokens().length, equalTo(4));
 
         //custom analyzer
-        analyzeResponse = client().admin().indices().prepareAnalyze("<text>THIS IS A TEST</text>")
-            .setExplain(true).addCharFilter("html_strip").setTokenizer("keyword").addTokenFilter("lowercase").get();
+        analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST")
+            .setExplain(true).setTokenizer("keyword").addTokenFilter("lowercase").get();
         assertThat(analyzeResponse.detail().analyzer(), IsNull.nullValue());
-        //charfilters
-        assertThat(analyzeResponse.detail().charfilters().length, equalTo(1));
-        assertThat(analyzeResponse.detail().charfilters()[0].getName(), equalTo("html_strip"));
-        assertThat(analyzeResponse.detail().charfilters()[0].getTexts().length, equalTo(1));
-        assertThat(analyzeResponse.detail().charfilters()[0].getTexts()[0], equalTo("\nTHIS IS A TEST\n"));
         //tokenizer
         assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("keyword"));
         assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(1));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("\nTHIS IS A TEST\n"));
+        assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("THIS IS A TEST"));
         //tokenfilters
         assertThat(analyzeResponse.detail().tokenfilters().length, equalTo(1));
         assertThat(analyzeResponse.detail().tokenfilters()[0].getName(), equalTo("lowercase"));
         assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens().length, equalTo(1));
-        assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getTerm(), equalTo("\nthis is a test\n"));
+        assertThat(analyzeResponse.detail().tokenfilters()[0].getTokens()[0].getTerm(), equalTo("this is a test"));
 
         //check other attributes
         analyzeResponse = client().admin().indices().prepareAnalyze("This is troubled")

+ 11 - 0
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
 import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
@@ -68,6 +69,7 @@ import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
 import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
+import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
@@ -106,6 +108,15 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
         return filters;
     }
 
+    @Override
+    public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
+        List<PreConfiguredCharFilter> filters = new ArrayList<>();
+        filters.add(PreConfiguredCharFilter.singleton("html_strip", false, HTMLStripCharFilter::new));
+        // TODO deprecate htmlStrip
+        filters.add(PreConfiguredCharFilter.singleton("htmlStrip", false, HTMLStripCharFilter::new));
+        return filters;
+    }
+
     @Override
     public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
         List<PreConfiguredTokenFilter> filters = new ArrayList<>();

+ 12 - 2
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java

@@ -19,6 +19,7 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
 import org.apache.lucene.analysis.en.PorterStemFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
@@ -71,6 +72,14 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         return filters;
     }
 
+    @Override
+    public Map<String, Class<?>> getPreConfiguredCharFilters() {
+        Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredCharFilters());
+        filters.put("html_strip", HTMLStripCharFilterFactory.class);
+        filters.put("htmlStrip", HTMLStripCharFilterFactory.class);
+        return filters;
+    }
+
     @Override
     protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
         Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenFilters());
@@ -92,6 +101,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         filters.put("elision", null);
         filters.put("french_stem", SnowballPorterFilterFactory.class);
         filters.put("german_stem", null);
+        filters.put("german_normalization", null);
         filters.put("hindi_normalization", null);
         filters.put("indic_normalization", null);
         filters.put("keyword_repeat", null);
@@ -123,8 +133,8 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
 
     @Override
     protected Map<String, Class<?>> getPreConfiguredTokenizers() {
-        Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenFilters());
-
+        Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenizers());
+        filters.put("lowercase", null);
         return filters;
     }
 

+ 36 - 25
test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java

@@ -63,6 +63,7 @@ import org.elasticsearch.index.analysis.PatternCaptureGroupTokenFilterFactory;
 import org.elasticsearch.index.analysis.PatternReplaceTokenFilterFactory;
 import org.elasticsearch.index.analysis.PatternTokenizerFactory;
 import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
+import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
@@ -100,7 +101,9 @@ import java.util.TreeSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import static java.util.Collections.emptyMap;
 import static java.util.Collections.singletonList;
+import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.typeCompatibleWith;
 
 /**
@@ -275,20 +278,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("persian",        Void.class)
         .immutableMap();
 
-    static final Map<PreBuiltCharFilters, Class<?>> PREBUILT_CHARFILTERS;
-    static {
-        PREBUILT_CHARFILTERS = new EnumMap<>(PreBuiltCharFilters.class);
-        for (PreBuiltCharFilters tokenizer : PreBuiltCharFilters.values()) {
-            Class<?> luceneFactoryClazz;
-            switch (tokenizer) {
-            default:
-                luceneFactoryClazz = org.apache.lucene.analysis.util.CharFilterFactory.lookupClass(
-                        toCamelCase(tokenizer.getCharFilterFactory(Version.CURRENT).name()));
-            }
-            PREBUILT_CHARFILTERS.put(tokenizer, luceneFactoryClazz);
-        }
-    }
-
     /**
      * The plugin being tested. Core uses an "empty" plugin so we don't have to throw null checks all over the place.
      */
@@ -352,9 +341,17 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
             }
             tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClazz);
         }
+        // TODO drop aliases once they are moved to module
+        tokenizers.put("nGram", tokenizers.get("ngram"));
+        tokenizers.put("edgeNGram", tokenizers.get("edge_ngram"));
+        tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy"));
         return tokenizers;
     }
 
+    public Map<String, Class<?>> getPreConfiguredCharFilters() {
+        return emptyMap();
+    }
+
     public void testTokenizers() {
         Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers());
         missing.removeAll(getTokenizers().keySet());
@@ -430,10 +427,12 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         Collection<Object> actual = new HashSet<>();
 
         Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters =
-                AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin));
+                new HashMap<>(AnalysisModule.setupPreConfiguredTokenFilters(singletonList(plugin)));
         for (Map.Entry<String, Class<?>> entry : getPreConfiguredTokenFilters().entrySet()) {
             String name = entry.getKey();
             Class<?> luceneFactory = entry.getValue();
+            PreConfiguredTokenFilter filter = preConfiguredTokenFilters.remove(name);
+            assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter);
             if (luceneFactory == Void.class) {
                 continue;
             }
@@ -441,8 +440,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
                 luceneFactory = TokenFilterFactory.lookupClass(toCamelCase(name));
             }
             assertThat(luceneFactory, typeCompatibleWith(TokenFilterFactory.class));
-            PreConfiguredTokenFilter filter = preConfiguredTokenFilters.get(name);
-            assertNotNull("test claims pre built token filter [" + name + "] should be available but it wasn't", filter);
             if (filter.shouldUseFilterForMultitermQueries()) {
                 actual.add("token filter [" + name + "]");
             }
@@ -450,10 +447,15 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
                 expected.add("token filter [" + name + "]");
             }
         }
-        Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = AnalysisModule.setupPreConfiguredTokenizers(singletonList(plugin));
+        assertThat("pre configured token filter not registered with test", preConfiguredTokenFilters.keySet(), empty());
+
+        Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = new HashMap<>(
+                AnalysisModule.setupPreConfiguredTokenizers(singletonList(plugin)));
         for (Map.Entry<String, Class<?>> entry : getPreConfiguredTokenizers().entrySet()) {
             String name = entry.getKey();
             Class<?> luceneFactory = entry.getValue();
+            PreConfiguredTokenizer tokenizer = preConfiguredTokenizers.remove(name);
+            assertNotNull("test claims pre built tokenizer [" + name + "] should be available but it wasn't", tokenizer);
             if (luceneFactory == Void.class) {
                 continue;
             }
@@ -461,7 +463,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
                 luceneFactory = TokenizerFactory.lookupClass(toCamelCase(name));
             }
             assertThat(luceneFactory, typeCompatibleWith(TokenizerFactory.class));
-            PreConfiguredTokenizer tokenizer = preConfiguredTokenizers.get(name);
             if (tokenizer.hasMultiTermComponent()) {
                 actual.add(tokenizer);
             }
@@ -469,20 +470,30 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
                 expected.add(tokenizer);
             }
         }
-        for (Map.Entry<PreBuiltCharFilters, Class<?>> entry : PREBUILT_CHARFILTERS.entrySet()) {
-            PreBuiltCharFilters charFilter = entry.getKey();
+        assertThat("pre configured tokenizer not registered with test", preConfiguredTokenizers.keySet(), empty());
+
+        Map<String, PreConfiguredCharFilter> preConfiguredCharFilters = new HashMap<>(
+                AnalysisModule.setupPreConfiguredCharFilters(singletonList(plugin)));
+        for (Map.Entry<String, Class<?>> entry : getPreConfiguredCharFilters().entrySet()) {
+            String name = entry.getKey();
             Class<?> luceneFactory = entry.getValue();
+            PreConfiguredCharFilter filter = preConfiguredCharFilters.remove(name);
+            assertNotNull("test claims pre built char filter [" + name + "] should be available but it wasn't", filter);
             if (luceneFactory == Void.class) {
                 continue;
             }
-            assertTrue(CharFilterFactory.class.isAssignableFrom(luceneFactory));
-            if (charFilter.getCharFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
-                actual.add(charFilter);
+            if (luceneFactory == null) {
+                luceneFactory = org.apache.lucene.analysis.util.CharFilterFactory.lookupClass(toCamelCase(name));
+            }
+            assertThat(luceneFactory, typeCompatibleWith(CharFilterFactory.class));
+            if (filter.shouldUseFilterForMultitermQueries()) {
+                actual.add("char filter [" + name + "]");
+            }
             if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
-                expected.add(charFilter);
+                expected.add("char filter [" + name + "]");
             }
         }
+        assertThat("pre configured char filter not registered with test", preConfiguredCharFilters.keySet(), empty());
 
         Set<Object> classesMissingMultiTermSupport = new HashSet<>(expected);
         classesMissingMultiTermSupport.removeAll(actual);