
Move more token filters to analysis-common module

The following token filters were moved: delimited_payload_filter, keep, keep_types, classic, apostrophe, decimal_digit, fingerprint, min_hash and scandinavian_folding.

Relates to #23658
Martijn van Groningen, 8 years ago
commit 0b776a1de0
22 changed files with 701 additions and 300 deletions
  1. core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java (+17 -3)
  2. core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java (+0 -18)
  3. core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java (+5 -5)
  4. core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java (+0 -169)
  5. core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java (+294 -0)
  6. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ApostropheFilterFactory.java (+3 -2)
  7. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicFilterFactory.java (+3 -2)
  8. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java (+37 -29)
  9. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DecimalDigitFilterFactory.java (+4 -2)
  10. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DelimitedPayloadTokenFilterFactory.java (+8 -7)
  11. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java (+8 -22)
  12. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java (+4 -3)
  13. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepWordFilterFactory.java (+6 -3)
  14. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MinHashTokenFilterFactory.java (+3 -2)
  15. modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianFoldingFilterFactory.java (+4 -2)
  16. modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java (+10 -2)
  17. modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepFilterFactoryTests.java (+13 -8)
  18. modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java (+4 -2)
  19. modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MinHashFilterFactoryTests.java (+5 -3)
  20. modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/keep_analysis.json (+0 -0)
  21. modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml (+265 -0)
  22. test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java (+8 -16)
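For context, the moved filters are no longer registered directly in AnalysisModule; the analysis-common module now supplies them through the AnalysisPlugin#getTokenFilters extension point, as the CommonAnalysisPlugin diff below shows. The following is a minimal sketch of that extension point; the plugin class, the "my_filter" name, and MyTokenFilterFactory are hypothetical, and only the registration signature is taken from this change:

import java.util.Map;
import java.util.TreeMap;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {

    @Override
    public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
        // Each entry maps a filter name to a factory constructor reference with the shape
        // (IndexSettings, Environment, String, Settings) -> TokenFilterFactory.
        filters.put("my_filter", MyTokenFilterFactory::new); // MyTokenFilterFactory is hypothetical
        return filters;
    }
}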

+ 17 - 3
core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java

@@ -33,17 +33,19 @@ import org.elasticsearch.index.IndexSettings;
  */
 public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
 
-    public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
+    public static ParseField SEPARATOR = new ParseField("separator");
+    public static ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size");
 
-    public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
+    public static int DEFAULT_MAX_OUTPUT_SIZE = 255;
     public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
+    public static final char DEFAULT_SEPARATOR  = ' ';
 
     private final FingerprintAnalyzer analyzer;
 
     public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
 
-        char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
+        char separator = parseSeparator(settings);
         int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE);
         CharArraySet stopWords = Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, DEFAULT_STOP_WORDS);
 
@@ -54,4 +56,16 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A
     public FingerprintAnalyzer get() {
         return analyzer;
     }
+
+    public static char parseSeparator(Settings settings) throws IllegalArgumentException {
+        String customSeparator = settings.get(SEPARATOR.getPreferredName());
+        if (customSeparator == null) {
+            return DEFAULT_SEPARATOR;
+        } else if (customSeparator.length() == 1) {
+            return customSeparator.charAt(0);
+        }
+
+        throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. ["
+                + customSeparator + "] was provided.");
+    }
 }
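Purely for illustration (not part of this commit): since parseSeparator now lives in FingerprintAnalyzerProvider, the fingerprint analyzer still reads the same separator and max_output_size keys, so an index configuration along the following lines remains valid. The analyzer name "fp" and the values are made up; the Settings builder usage mirrors the tests further down.

Settings indexSettings = Settings.builder()   // org.elasticsearch.common.settings.Settings
        .put("index.analysis.analyzer.fp.type", "fingerprint")
        .put("index.analysis.analyzer.fp.separator", "+")       // must be a single character; otherwise parseSeparator throws
        .put("index.analysis.analyzer.fp.max_output_size", 100) // defaults to 255
        .build();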

+ 0 - 18
core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java

@@ -29,7 +29,6 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
-import org.elasticsearch.index.analysis.ApostropheFilterFactory;
 import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
 import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
@@ -41,19 +40,15 @@ import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
 import org.elasticsearch.index.analysis.CjkAnalyzerProvider;
-import org.elasticsearch.index.analysis.ClassicFilterFactory;
 import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
 import org.elasticsearch.index.analysis.CzechAnalyzerProvider;
 import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.DanishAnalyzerProvider;
-import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
-import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
 import org.elasticsearch.index.analysis.DutchStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
 import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
 import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
-import org.elasticsearch.index.analysis.FingerprintTokenFilterFactory;
 import org.elasticsearch.index.analysis.FinnishAnalyzerProvider;
 import org.elasticsearch.index.analysis.FrenchAnalyzerProvider;
 import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory;
@@ -67,15 +62,12 @@ import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
 import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
 import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
 import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
-import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
-import org.elasticsearch.index.analysis.KeepWordFilterFactory;
 import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
 import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
 import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
 import org.elasticsearch.index.analysis.LetterTokenizerFactory;
 import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
-import org.elasticsearch.index.analysis.MinHashTokenFilterFactory;
 import org.elasticsearch.index.analysis.NGramTokenizerFactory;
 import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
 import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
@@ -89,7 +81,6 @@ import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
-import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
 import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
@@ -181,10 +172,6 @@ public final class AnalysisModule {
         tokenFilters.register("stop", StopTokenFilterFactory::new);
         tokenFilters.register("standard", StandardTokenFilterFactory::new);
         tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
-        tokenFilters.register("min_hash", MinHashTokenFilterFactory::new);
-        tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
-        tokenFilters.register("keep", requriesAnalysisSettings(KeepWordFilterFactory::new));
-        tokenFilters.register("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new));
         tokenFilters.register("arabic_stem", ArabicStemTokenFilterFactory::new);
         tokenFilters.register("brazilian_stem", BrazilianStemTokenFilterFactory::new);
         tokenFilters.register("czech_stem", CzechStemTokenFilterFactory::new);
@@ -192,15 +179,10 @@ public final class AnalysisModule {
         tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
         tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
         tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
-        tokenFilters.register("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
 
         tokenFilters.register("hunspell", requriesAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory
             (indexSettings, name, settings, hunspellService)));
 
-        tokenFilters.register("apostrophe", ApostropheFilterFactory::new);
-        tokenFilters.register("classic", ClassicFilterFactory::new);
-        tokenFilters.register("decimal_digit", DecimalDigitFilterFactory::new);
-        tokenFilters.register("fingerprint", FingerprintTokenFilterFactory::new);
         tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
         return tokenFilters;
     }

+ 5 - 5
core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java

@@ -319,14 +319,14 @@ public class TransportAnalyzeActionTests extends ESTestCase {
     public void testNonPreBuildTokenFilter() throws IOException {
         AnalyzeRequest request = new AnalyzeRequest();
         request.tokenizer("whitespace");
-        request.addTokenFilter("min_hash");
+        request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
         request.text("the quick brown fox");
         AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
         List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
-        int default_hash_count = 1;
-        int default_bucket_size = 512;
-        int default_hash_set_size = 1;
-        assertEquals(default_hash_count * default_bucket_size * default_hash_set_size, tokens.size());
+        assertEquals(3, tokens.size());
+        assertEquals("quick", tokens.get(0).getTerm());
+        assertEquals("brown", tokens.get(1).getTerm());
+        assertEquals("fox", tokens.get(2).getTerm());
     }
 
     public void testNormalizerWithIndex() throws IOException {

+ 0 - 169
core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java

@@ -19,9 +19,6 @@
 
 package org.elasticsearch.action.termvectors;
 
-import com.carrotsearch.hppc.ObjectIntHashMap;
-
-import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.Fields;
@@ -29,7 +26,6 @@ import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.ActionFuture;
 import org.elasticsearch.action.admin.cluster.shards.ClusterSearchShardsResponse;
 import org.elasticsearch.action.admin.indices.alias.Alias;
@@ -374,171 +370,6 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
         }
     }
 
-    public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException {
-        //create the test document
-        int encoding = randomIntBetween(0, 2);
-        String encodingString = "";
-        if (encoding == 0) {
-            encodingString = "float";
-        }
-        if (encoding == 1) {
-            encodingString = "int";
-        }
-        if (encoding == 2) {
-            encodingString = "identity";
-        }
-        String[] tokens = crateRandomTokens();
-        Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
-        String delimiter = createRandomDelimiter(tokens);
-        String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
-        //create the mapping
-        XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
-                .startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads")
-                .field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
-        assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings(
-                Settings.builder()
-                        .put(indexSettings())
-                        .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
-                        .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
-                        .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
-                        .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
-                        .put("index.analysis.filter.my_delimited_payload_filter.type", "delimited_payload_filter")));
-
-        client().prepareIndex("test", "type1", Integer.toString(1))
-                .setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
-        refresh();
-        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1)).setPayloads(true).setOffsets(true)
-                .setPositions(true).setSelectedFields();
-        TermVectorsResponse response = resp.execute().actionGet();
-        assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true));
-        Fields fields = response.getFields();
-        assertThat(fields.size(), equalTo(1));
-        Terms terms = fields.terms("field");
-        TermsEnum iterator = terms.iterator();
-        while (iterator.next() != null) {
-            String term = iterator.term().utf8ToString();
-            PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
-            assertThat(docsAndPositions.nextDoc(), equalTo(0));
-            List<BytesRef> curPayloads = payloads.get(term);
-            assertThat(term, curPayloads, notNullValue());
-            assertNotNull(docsAndPositions);
-            for (int k = 0; k < docsAndPositions.freq(); k++) {
-                docsAndPositions.nextPosition();
-                if (docsAndPositions.getPayload()!=null){
-                    String infoString = "\nterm: " + term + " has payload \n"+ docsAndPositions.getPayload().toString() + "\n but should have payload \n"+curPayloads.get(k).toString();
-                    assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
-                } else {
-                    String infoString = "\nterm: " + term + " has no payload but should have payload \n"+curPayloads.get(k).toString();
-                    assertThat(infoString, curPayloads.get(k).length, equalTo(0));
-                }
-            }
-        }
-        assertThat(iterator.next(), nullValue());
-    }
-
-    private String createRandomDelimiter(String[] tokens) {
-        String delimiter = "";
-        boolean isTokenOrWhitespace = true;
-        while(isTokenOrWhitespace) {
-            isTokenOrWhitespace = false;
-            delimiter = randomUnicodeOfLength(1);
-            for(String token:tokens) {
-                if(token.contains(delimiter)) {
-                    isTokenOrWhitespace = true;
-                }
-            }
-            if(Character.isWhitespace(delimiter.charAt(0))) {
-                isTokenOrWhitespace = true;
-            }
-        }
-        return delimiter;
-    }
-
-    private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
-        String resultString = "";
-        ObjectIntHashMap<String> payloadCounter = new ObjectIntHashMap<>();
-        for (String token : tokens) {
-            if (!payloadCounter.containsKey(token)) {
-                payloadCounter.putIfAbsent(token, 0);
-            } else {
-                payloadCounter.put(token, payloadCounter.get(token) + 1);
-            }
-            resultString = resultString + token;
-            BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
-            if (payload.length > 0) {
-                resultString = resultString + delimiter;
-                switch (encoding) {
-                case 0: {
-                    resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset));
-                    break;
-                }
-                case 1: {
-                    resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset));
-                    break;
-                }
-                case 2: {
-                    resultString = resultString + payload.utf8ToString();
-                    break;
-                }
-                default: {
-                    throw new ElasticsearchException("unsupported encoding type");
-                }
-                }
-            }
-            resultString = resultString + " ";
-        }
-        return resultString;
-    }
-
-    private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
-        Map<String, List<BytesRef>> payloads = new HashMap<>();
-        for (String token : tokens) {
-            if (payloads.get(token) == null) {
-                payloads.put(token, new ArrayList<BytesRef>());
-            }
-            boolean createPayload = randomBoolean();
-            if (createPayload) {
-                switch (encoding) {
-                case 0: {
-                    float theFloat = randomFloat();
-                    payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
-                    break;
-                }
-                case 1: {
-                    payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
-                    break;
-                }
-                case 2: {
-                    String payload = randomUnicodeOfLengthBetween(50, 100);
-                    for (int c = 0; c < payload.length(); c++) {
-                        if (Character.isWhitespace(payload.charAt(c))) {
-                            payload = payload.replace(payload.charAt(c), 'w');
-                        }
-                    }
-                    payloads.get(token).add(new BytesRef(payload));
-                    break;
-                }
-                default: {
-                    throw new ElasticsearchException("unsupported encoding type");
-                }
-                }
-            } else {
-                payloads.get(token).add(new BytesRef());
-            }
-        }
-        return payloads;
-    }
-
-    private String[] crateRandomTokens() {
-        String[] tokens = { "the", "quick", "brown", "fox" };
-        int numTokensWithDuplicates = randomIntBetween(3, 15);
-        String[] finalTokens = new String[numTokensWithDuplicates];
-        for (int i = 0; i < numTokensWithDuplicates; i++) {
-            finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)];
-        }
-        return finalTokens;
-    }
-
     // like testSimpleTermVectors but we create fields with no term vectors
     public void testSimpleTermVectorsWithGenerate() throws IOException {
         String[] fieldNames = new String[10];

+ 294 - 0
core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java

@@ -0,0 +1,294 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.action.termvectors;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.payloads.FloatEncoder;
+import org.apache.lucene.analysis.payloads.IdentityEncoder;
+import org.apache.lucene.analysis.payloads.IntegerEncoder;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.notNullValue;
+import static org.hamcrest.Matchers.nullValue;
+
+public class GetTermVectorsTests extends ESSingleNodeTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        return Collections.singleton(MockPayloadAnalyzerPlugin.class);
+    }
+
+    // The delimited payload token filter was moved to the analysis-common module.
+    // This test relies heavily on that token filter, even though the filter itself is not what is being tested.
+    // The solution for now is to replicate what the delimited payload token filter does inside this test.
+    // Unfortunately MockPayloadAnalyzer could not be used here because it lacks the required functionality.
+    public static class MockPayloadAnalyzerPlugin extends Plugin implements AnalysisPlugin {
+
+        @Override
+        public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+            return Collections.singletonMap("mock_payload_filter", (indexSettings, environment, name, settings) -> {
+                return new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return "mock_payload_filter";
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        String delimiter = settings.get("delimiter");
+                        PayloadEncoder encoder = null;
+                        if (settings.get("encoding").equals("float")) {
+                            encoder = new FloatEncoder();
+                        } else if (settings.get("encoding").equals("int")) {
+                            encoder = new IntegerEncoder();
+                        } else if (settings.get("encoding").equals("identity")) {
+                            encoder = new IdentityEncoder();
+                        }
+                        return new MockPayloadTokenFilter(tokenStream, delimiter.charAt(0), encoder);
+                    }
+                };
+            });
+        }
+
+        // Based on DelimitedPayloadTokenFilter:
+        final class MockPayloadTokenFilter extends TokenFilter {
+            private final char delimiter;
+            private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+            private final PayloadAttribute payAtt = addAttribute(PayloadAttribute.class);
+            private final PayloadEncoder encoder;
+
+
+            MockPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder) {
+                super(input);
+                this.delimiter = delimiter;
+                this.encoder = encoder;
+            }
+
+            @Override
+            public boolean incrementToken() throws IOException {
+                if (input.incrementToken()) {
+                    final char[] buffer = termAtt.buffer();
+                    final int length = termAtt.length();
+                    for (int i = 0; i < length; i++) {
+                        if (buffer[i] == delimiter) {
+                            payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
+                            termAtt.setLength(i); // simply set a new length
+                            return true;
+                        }
+                    }
+                    // we have not seen the delimiter
+                    payAtt.setPayload(null);
+                    return true;
+                } else {
+                    return false;
+                }
+            }
+        }
+    }
+
+    public void testRandomPayloadWithDelimitedPayloadTokenFilter() throws IOException {
+        //create the test document
+        int encoding = randomIntBetween(0, 2);
+        String encodingString = "";
+        if (encoding == 0) {
+            encodingString = "float";
+        }
+        if (encoding == 1) {
+            encodingString = "int";
+        }
+        if (encoding == 2) {
+            encodingString = "identity";
+        }
+        String[] tokens = crateRandomTokens();
+        Map<String, List<BytesRef>> payloads = createPayloads(tokens, encoding);
+        String delimiter = createRandomDelimiter(tokens);
+        String queryString = createString(tokens, payloads, encoding, delimiter.charAt(0));
+        //create the mapping
+        XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties")
+                .startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads")
+                .field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
+        Settings setting =  Settings.builder()
+            .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
+            .putArray("index.analysis.analyzer.payload_test.filter", "my_delimited_payload_filter")
+            .put("index.analysis.filter.my_delimited_payload_filter.delimiter", delimiter)
+            .put("index.analysis.filter.my_delimited_payload_filter.encoding", encodingString)
+            .put("index.analysis.filter.my_delimited_payload_filter.type", "mock_payload_filter").build();
+        createIndex("test", setting, "type1", mapping);
+
+        client().prepareIndex("test", "type1", Integer.toString(1))
+                .setSource(jsonBuilder().startObject().field("field", queryString).endObject()).execute().actionGet();
+        client().admin().indices().prepareRefresh().get();
+        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(1))
+                .setPayloads(true).setOffsets(true).setPositions(true).setSelectedFields();
+        TermVectorsResponse response = resp.execute().actionGet();
+        assertThat("doc id 1 doesn't exists but should", response.isExists(), equalTo(true));
+        Fields fields = response.getFields();
+        assertThat(fields.size(), equalTo(1));
+        Terms terms = fields.terms("field");
+        TermsEnum iterator = terms.iterator();
+        while (iterator.next() != null) {
+            String term = iterator.term().utf8ToString();
+            PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
+            assertThat(docsAndPositions.nextDoc(), equalTo(0));
+            List<BytesRef> curPayloads = payloads.get(term);
+            assertThat(term, curPayloads, notNullValue());
+            assertNotNull(docsAndPositions);
+            for (int k = 0; k < docsAndPositions.freq(); k++) {
+                docsAndPositions.nextPosition();
+                if (docsAndPositions.getPayload()!=null){
+                    String infoString = "\nterm: " + term + " has payload \n"+ docsAndPositions.getPayload().toString() +
+                            "\n but should have payload \n"+curPayloads.get(k).toString();
+                    assertThat(infoString, docsAndPositions.getPayload(), equalTo(curPayloads.get(k)));
+                } else {
+                    String infoString = "\nterm: " + term + " has no payload but should have payload \n"+curPayloads.get(k).toString();
+                    assertThat(infoString, curPayloads.get(k).length, equalTo(0));
+                }
+            }
+        }
+        assertThat(iterator.next(), nullValue());
+    }
+
+    private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
+        String resultString = "";
+        Map<String, Integer> payloadCounter = new HashMap<>();
+        for (String token : tokens) {
+            if (!payloadCounter.containsKey(token)) {
+                payloadCounter.putIfAbsent(token, 0);
+            } else {
+                payloadCounter.put(token, payloadCounter.get(token) + 1);
+            }
+            resultString = resultString + token;
+            BytesRef payload = payloads.get(token).get(payloadCounter.get(token));
+            if (payload.length > 0) {
+                resultString = resultString + delimiter;
+                switch (encoding) {
+                    case 0: {
+                        resultString = resultString + Float.toString(PayloadHelper.decodeFloat(payload.bytes, payload.offset));
+                        break;
+                    }
+                    case 1: {
+                        resultString = resultString + Integer.toString(PayloadHelper.decodeInt(payload.bytes, payload.offset));
+                        break;
+                    }
+                    case 2: {
+                        resultString = resultString + payload.utf8ToString();
+                        break;
+                    }
+                    default: {
+                        throw new ElasticsearchException("unsupported encoding type");
+                    }
+                }
+            }
+            resultString = resultString + " ";
+        }
+        return resultString;
+    }
+
+    private String[] crateRandomTokens() {
+        String[] tokens = { "the", "quick", "brown", "fox" };
+        int numTokensWithDuplicates = randomIntBetween(3, 15);
+        String[] finalTokens = new String[numTokensWithDuplicates];
+        for (int i = 0; i < numTokensWithDuplicates; i++) {
+            finalTokens[i] = tokens[randomIntBetween(0, tokens.length - 1)];
+        }
+        return finalTokens;
+    }
+
+    private String createRandomDelimiter(String[] tokens) {
+        String delimiter = "";
+        boolean isTokenOrWhitespace = true;
+        while(isTokenOrWhitespace) {
+            isTokenOrWhitespace = false;
+            delimiter = randomUnicodeOfLength(1);
+            for(String token:tokens) {
+                if(token.contains(delimiter)) {
+                    isTokenOrWhitespace = true;
+                }
+            }
+            if(Character.isWhitespace(delimiter.charAt(0))) {
+                isTokenOrWhitespace = true;
+            }
+        }
+        return delimiter;
+    }
+
+    private Map<String, List<BytesRef>> createPayloads(String[] tokens, int encoding) {
+        Map<String, List<BytesRef>> payloads = new HashMap<>();
+        for (String token : tokens) {
+            payloads.computeIfAbsent(token, k -> new ArrayList<>());
+            boolean createPayload = randomBoolean();
+            if (createPayload) {
+                switch (encoding) {
+                    case 0: {
+                        float theFloat = randomFloat();
+                        payloads.get(token).add(new BytesRef(PayloadHelper.encodeFloat(theFloat)));
+                        break;
+                    }
+                    case 1: {
+                        payloads.get(token).add(new BytesRef(PayloadHelper.encodeInt(randomInt())));
+                        break;
+                    }
+                    case 2: {
+                        String payload = randomUnicodeOfLengthBetween(50, 100);
+                        for (int c = 0; c < payload.length(); c++) {
+                            if (Character.isWhitespace(payload.charAt(c))) {
+                                payload = payload.replace(payload.charAt(c), 'w');
+                            }
+                        }
+                        payloads.get(token).add(new BytesRef(payload));
+                        break;
+                    }
+                    default: {
+                        throw new ElasticsearchException("unsupported encoding type");
+                    }
+                }
+            } else {
+                payloads.get(token).add(new BytesRef());
+            }
+        }
+        return payloads;
+    }
+}

+ 3 - 2
core/src/main/java/org/elasticsearch/index/analysis/ApostropheFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ApostropheFilterFactory.java

@@ -16,20 +16,21 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tr.ApostropheFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 
 /**
  * Factory for {@link ApostropheFilter}
  */
 public class ApostropheFilterFactory extends AbstractTokenFilterFactory {
 
-    public ApostropheFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ApostropheFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

+ 3 - 2
core/src/main/java/org/elasticsearch/index/analysis/ClassicFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicFilterFactory.java

@@ -16,20 +16,21 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.ClassicFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 
 /**
  * Factory for {@link ClassicFilter}
  */
 public class ClassicFilterFactory extends AbstractTokenFilterFactory {
 
-    public ClassicFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ClassicFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

+ 37 - 29
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

@@ -67,7 +67,6 @@ import org.apache.lucene.analysis.standard.ClassicFilter;
 import org.apache.lucene.analysis.tr.ApostropheFilter;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.index.analysis.CharFilterFactory;
-import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -92,44 +91,53 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
     @Override
     public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
+        filters.put("apostrophe", ApostropheFilterFactory::new);
+        filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
         filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
-        filters.put("keyword_marker", requriesAnalysisSettings(KeywordMarkerTokenFilterFactory::new));
-        filters.put("porter_stem", PorterStemTokenFilterFactory::new);
-        filters.put("snowball", SnowballTokenFilterFactory::new);
-        filters.put("trim", TrimTokenFilterFactory::new);
-        filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
-        filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
-        filters.put("unique", UniqueTokenFilterFactory::new);
-        filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new);
-        filters.put("length", LengthTokenFilterFactory::new);
-        filters.put("lowercase", LowerCaseTokenFilterFactory::new);
-        filters.put("uppercase", UpperCaseTokenFilterFactory::new);
-        filters.put("nGram", NGramTokenFilterFactory::new);
-        filters.put("ngram", NGramTokenFilterFactory::new);
-        filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new);
-        filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
-        filters.put("stemmer", StemmerTokenFilterFactory::new);
-        filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
-        filters.put("kstem", KStemTokenFilterFactory::new);
+        filters.put("cjk_bigram", CJKBigramFilterFactory::new);
+        filters.put("cjk_width", CJKWidthFilterFactory::new);
+        filters.put("classic", ClassicFilterFactory::new);
+        filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
+        filters.put("decimal_digit", DecimalDigitFilterFactory::new);
+        filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
         filters.put("dictionary_decompounder", requriesAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new));
-        filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
-        filters.put("reverse", ReverseTokenFilterFactory::new);
+        filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new);
+        filters.put("edgeNGram", EdgeNGramTokenFilterFactory::new);
         filters.put("elision", ElisionTokenFilterFactory::new);
-        filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
-        filters.put("limit", LimitTokenCountFilterFactory::new);
-        filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
-        filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
-        filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
-        filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
+        filters.put("fingerprint", FingerprintTokenFilterFactory::new);
+        filters.put("flatten_graph", FlattenGraphTokenFilterFactory::new);
         filters.put("german_normalization", GermanNormalizationFilterFactory::new);
         filters.put("hindi_normalization", HindiNormalizationFilterFactory::new);
+        filters.put("hyphenation_decompounder", requriesAnalysisSettings(HyphenationCompoundWordTokenFilterFactory::new));
         filters.put("indic_normalization", IndicNormalizationFilterFactory::new);
+        filters.put("keep", requriesAnalysisSettings(KeepWordFilterFactory::new));
+        filters.put("keep_types", requriesAnalysisSettings(KeepTypesFilterFactory::new));
+        filters.put("keyword_marker", requriesAnalysisSettings(KeywordMarkerTokenFilterFactory::new));
+        filters.put("kstem", KStemTokenFilterFactory::new);
+        filters.put("length", LengthTokenFilterFactory::new);
+        filters.put("limit", LimitTokenCountFilterFactory::new);
+        filters.put("lowercase", LowerCaseTokenFilterFactory::new);
+        filters.put("min_hash", MinHashTokenFilterFactory::new);
+        filters.put("ngram", NGramTokenFilterFactory::new);
+        filters.put("nGram", NGramTokenFilterFactory::new);
+        filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
+        filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
         filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
+        filters.put("porter_stem", PorterStemTokenFilterFactory::new);
+        filters.put("reverse", ReverseTokenFilterFactory::new);
+        filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
         filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
         filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new);
+        filters.put("snowball", SnowballTokenFilterFactory::new);
         filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
-        filters.put("cjk_width", CJKWidthFilterFactory::new);
-        filters.put("cjk_bigram", CJKBigramFilterFactory::new);
+        filters.put("stemmer_override", requriesAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
+        filters.put("stemmer", StemmerTokenFilterFactory::new);
+        filters.put("trim", TrimTokenFilterFactory::new);
+        filters.put("truncate", requriesAnalysisSettings(TruncateTokenFilterFactory::new));
+        filters.put("unique", UniqueTokenFilterFactory::new);
+        filters.put("uppercase", UpperCaseTokenFilterFactory::new);
+        filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
+        filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
         return filters;
     }
 

+ 4 - 2
core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DecimalDigitFilterFactory.java

@@ -17,20 +17,22 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 
 /**
  * Factory for {@link DecimalDigitFilter}
  */
 public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
 
-    public DecimalDigitFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    DecimalDigitFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

+ 8 - 7
core/src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/DelimitedPayloadTokenFilterFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.payloads.PayloadEncoder;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 
 public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFactory {
 
@@ -37,11 +38,10 @@ public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFacto
     static final String ENCODING = "encoding";
     static final String DELIMITER = "delimiter";
 
-    char delimiter;
-    PayloadEncoder encoder;
+    private final char delimiter;
+    private final PayloadEncoder encoder;
 
-    public DelimitedPayloadTokenFilterFactory(IndexSettings indexSettings, Environment env, String name,
-            Settings settings) {
+    DelimitedPayloadTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
         String delimiterConf = settings.get(DELIMITER);
         if (delimiterConf != null) {
@@ -57,6 +57,8 @@ public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFacto
                 encoder = new IntegerEncoder();
             } else if (settings.get(ENCODING).equals("identity")) {
                 encoder = new IdentityEncoder();
+            } else {
+                encoder = DEFAULT_ENCODER;
             }
         } else {
             encoder = DEFAULT_ENCODER;
@@ -65,8 +67,7 @@ public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFacto
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder);
-        return filter;
+        return new DelimitedPayloadTokenFilter(tokenStream, delimiter, encoder);
     }
 
 }

+ 8 - 22
core/src/main/java/org/elasticsearch/index/analysis/FingerprintTokenFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
@@ -25,24 +25,21 @@ import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
 
+import static org.elasticsearch.index.analysis.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
+import static org.elasticsearch.index.analysis.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;
 
 public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
 
     private final char separator;
     private final int maxOutputSize;
 
-    public static ParseField SEPARATOR = new ParseField("separator");
-    public static ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size");
-
-    public static final char DEFAULT_SEPARATOR  = ' ';
-    public static final int DEFAULT_MAX_OUTPUT_SIZE = 255;
-
-    public FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
-        this.separator = parseSeparator(settings);
-        this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),
-            FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE);
+        this.separator = FingerprintAnalyzerProvider.parseSeparator(settings);
+        this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE);
     }
 
     @Override
@@ -52,15 +49,4 @@ public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
         return result;
     }
 
-    public static char parseSeparator(Settings settings) throws IllegalArgumentException {
-        String customSeparator = settings.get(SEPARATOR.getPreferredName());
-        if (customSeparator == null) {
-            return FingerprintTokenFilterFactory.DEFAULT_SEPARATOR;
-        } else if (customSeparator.length() == 1) {
-            return customSeparator.charAt(0);
-        }
-
-        throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. ["
-            + customSeparator + "] was provided.");
-    }
 }

+ 4 - 3
core/src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java

@@ -17,13 +17,15 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.TypeTokenFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 import java.util.Arrays;
 import java.util.HashSet;
@@ -43,8 +45,7 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
     private final Set<String> keepTypes;
     private static final String KEEP_TYPES_KEY = "types";
 
-    public KeepTypesFilterFactory(IndexSettings indexSettings,
-                                 Environment env, String name, Settings settings) {
+    KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
 
         final String[] arrayKeepTypes = settings.getAsArray(KEEP_TYPES_KEY, null);

+ 6 - 3
core/src/main/java/org/elasticsearch/index/analysis/KeepWordFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepWordFilterFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
@@ -26,6 +26,10 @@ import org.apache.lucene.util.Version;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
+import org.elasticsearch.index.analysis.StopTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 /**
  * A {@link TokenFilterFactory} for {@link KeepWordFilter}. This filter only
@@ -54,8 +58,7 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
     // unsupported ancient option
     private static final String ENABLE_POS_INC_KEY = "enable_position_increments";
 
-    public KeepWordFilterFactory(IndexSettings indexSettings,
-                                 Environment env, String name, Settings settings) {
+    KeepWordFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
 
         final String[] arrayKeepWords = settings.getAsArray(KEEP_WORDS_KEY, null);

+ 3 - 2
core/src/main/java/org/elasticsearch/index/analysis/MinHashTokenFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MinHashTokenFilterFactory.java

@@ -17,13 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.minhash.MinHashFilterFactory;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 
 import java.util.HashMap;
 import java.util.Map;
@@ -36,7 +37,7 @@ public class MinHashTokenFilterFactory extends AbstractTokenFilterFactory {
 
     private final MinHashFilterFactory minHashFilterFactory;
 
-    public MinHashTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    MinHashTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         minHashFilterFactory = new MinHashFilterFactory(convertSettings(settings));
     }

+ 4 - 2
core/src/main/java/org/elasticsearch/index/analysis/ScandinavianFoldingFilterFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianFoldingFilterFactory.java

@@ -16,20 +16,22 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 
 /**
  * Factory for {@link ScandinavianFoldingFilter}
  */
 public class ScandinavianFoldingFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
 
-    public ScandinavianFoldingFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ScandinavianFoldingFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

+ 10 - 2
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java

@@ -22,7 +22,6 @@ package org.elasticsearch.analysis.common;
 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
 import org.apache.lucene.analysis.en.PorterStemFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
-import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
 import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
@@ -117,6 +116,15 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         filters.put("soraninormalization", SoraniNormalizationFilterFactory.class);
         filters.put("cjkwidth", CJKWidthFilterFactory.class);
         filters.put("cjkbigram", CJKBigramFilterFactory.class);
+        filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class);
+        filters.put("keepword", KeepWordFilterFactory.class);
+        filters.put("type", KeepTypesFilterFactory.class);
+        filters.put("classic", ClassicFilterFactory.class);
+        filters.put("apostrophe", ApostropheFilterFactory.class);
+        filters.put("decimaldigit", DecimalDigitFilterFactory.class);
+        filters.put("fingerprint", FingerprintTokenFilterFactory.class);
+        filters.put("minhash", MinHashTokenFilterFactory.class);
+        filters.put("scandinavianfolding", ScandinavianFoldingFilterFactory.class);
         return filters;
     }
 
@@ -155,7 +163,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         filters.put("common_grams", null);
         filters.put("czech_stem", null);
         filters.put("decimal_digit", null);
-        filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class);
+        filters.put("delimited_payload_filter", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class);
         filters.put("dutch_stem", SnowballPorterFilterFactory.class);
         filters.put("edge_ngram", null);
         filters.put("edgeNGram", null);

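These test mappings line up with CommonAnalysisPlugin, whose hunk is part of this commit but not reproduced here: the moved factories are exposed through AnalysisPlugin.getTokenFilters(), where a package-private constructor reference doubles as an AnalysisProvider. A rough sketch of that registration pattern, with an assumed class name and only a subset of the entries, is:

    package org.elasticsearch.analysis.common;

    import java.util.Map;
    import java.util.TreeMap;

    import org.elasticsearch.index.analysis.TokenFilterFactory;
    import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    // Illustrative sketch of the registration pattern; the real entries live in CommonAnalysisPlugin.
    public class TokenFilterRegistrationSketch extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
            // Constructor references satisfy AnalysisProvider.get(IndexSettings, Environment, String, Settings).
            filters.put("keep", KeepWordFilterFactory::new);
            filters.put("keep_types", KeepTypesFilterFactory::new);
            filters.put("min_hash", MinHashTokenFilterFactory::new);
            filters.put("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
            // ... plus the other filters moved in this commit (classic, apostrophe,
            // decimal_digit, fingerprint, delimited_payload_filter).
            return filters;
        }
    }
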
+ 13 - 8
core/src/test/java/org/elasticsearch/index/analysis/KeepFilterFactoryTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepFilterFactoryTests.java

@@ -17,12 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
 import org.junit.Assert;
@@ -33,10 +35,11 @@ import java.io.StringReader;
 import static org.hamcrest.Matchers.instanceOf;
 
 public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
-    private static final String RESOURCE = "/org/elasticsearch/index/analysis/keep_analysis.json";
+    private static final String RESOURCE = "/org/elasticsearch/analysis/common/keep_analysis.json";
 
     public void testLoadWithoutSettings() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(
+                createTempDir(), RESOURCE, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep");
         Assert.assertNull(tokenFilter);
     }
@@ -49,7 +52,7 @@ public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
                 .put("index.analysis.filter.broken_keep_filter.keep_words", "[\"Hello\", \"worlD\"]")
                 .build();
         try {
-            AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+            AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
             Assert.fail("path and array are configured");
         } catch (IllegalArgumentException e) {
         } catch (IOException e) {
@@ -65,7 +68,7 @@ public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
                 .build();
         try {
             // test our none existing setup is picked up
-            AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+            AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
             fail("expected an exception due to non existent keep_words_path");
         } catch (IllegalArgumentException e) {
         } catch (IOException e) {
@@ -77,7 +80,7 @@ public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
                 .build();
         try {
             // test our none existing setup is picked up
-            AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+            AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
             fail("expected an exception indicating that you can't use [keep_words_path] with [keep_words] ");
         } catch (IllegalArgumentException e) {
         } catch (IOException e) {
@@ -87,7 +90,8 @@ public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
     }
 
     public void testCaseInsensitiveMapping() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(
+                createTempDir(), RESOURCE, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter");
         assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
         String source = "hello small world";
@@ -98,7 +102,8 @@ public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
     }
 
     public void testCaseSensitiveMapping() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(
+                createTempDir(), RESOURCE, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter");
         assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
         String source = "Hello small world";

+ 4 - 2
core/src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java

@@ -17,12 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
 
@@ -38,7 +40,7 @@ public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
                 .put("index.analysis.filter.keep_numbers.type", "keep_types")
                 .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
                 .build();
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
         assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
         String source = "Hello 123 world";

+ 5 - 3
core/src/test/java/org/elasticsearch/index/analysis/MinHashFilterFactoryTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/MinHashFilterFactoryTests.java

@@ -17,12 +17,14 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
 
@@ -37,7 +39,7 @@ public class MinHashFilterFactoryTests extends ESTokenStreamTestCase {
         Settings settings = Settings.builder()
             .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
             .build();
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
         String source = "the quick brown fox";
         Tokenizer tokenizer = new WhitespaceTokenizer();
@@ -58,7 +60,7 @@ public class MinHashFilterFactoryTests extends ESTokenStreamTestCase {
             .put("index.analysis.filter.test_min_hash.with_rotation", false)
             .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
             .build();
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
         String source = "sushi";
         Tokenizer tokenizer = new WhitespaceTokenizer();

+ 0 - 0
core/src/test/resources/org/elasticsearch/index/analysis/keep_analysis.json → modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/keep_analysis.json


+ 265 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml

@@ -993,3 +993,268 @@
     - match:  { tokens.8.token: に落 }
     - match:  { tokens.9.token: 落ち }
     - match:  { tokens.10.token: ちた }
+
+---
+"delimited_payload_filter":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_delimited_payload_filter:
+                    type: delimited_payload_filter
+                    delimiter: ^
+                    encoding: identity
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text:      foo^bar
+            tokenizer: keyword
+            filter:    [my_delimited_payload_filter]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: foo }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text:      foo|5
+            tokenizer: keyword
+            filter:    [delimited_payload_filter]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: foo }
+
+---
+"keep_filter":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_keep:
+                    type: keep
+                    keep_words: [foo,bar]
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text:      foo bar baz
+            tokenizer: whitespace
+            filter:    [my_keep]
+    - length: { tokens: 2 }
+    - match:  { tokens.0.token: foo }
+    - match:  { tokens.1.token: bar }
+
+---
+"keep_types_filter":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_keep_types:
+                    type: keep_types
+                    types: [<NUM>]
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text:      foo 123 bar
+            tokenizer: standard
+            filter:    [my_keep_types]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: "123" }
+
+---
+"classic":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_classic:
+                    type: classic
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text:      foo's bar
+            tokenizer: classic
+            filter:    [my_classic]
+    - length: { tokens: 2 }
+    - match:  { tokens.0.token: foo }
+    - match:  { tokens.1.token: bar }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text:      foo's bar
+            tokenizer: classic
+            filter:    [classic]
+    - length: { tokens: 2 }
+    - match:  { tokens.0.token: foo }
+    - match:  { tokens.1.token: bar }
+
+---
+"apostrophe":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_apostrophe:
+                    type: apostrophe
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text:      foo's bar
+            tokenizer: keyword
+            filter:    [my_apostrophe]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: foo }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text:      foo's bar
+            tokenizer: keyword
+            filter:    [apostrophe]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: foo }
+
+---
+"decimal_digit":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_decimal_digit:
+                    type: decimal_digit
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text:      ١٢٣٤
+            tokenizer: keyword
+            filter:    [my_decimal_digit]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: "1234" }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text:      ١٢٣٤
+            tokenizer: keyword
+            filter:    [decimal_digit]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: "1234" }
+
+---
+"fingerprint":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_fingerprint:
+                    type: fingerprint
+                    separator: _
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: A1 B2 A1 D4 C3
+            tokenizer: whitespace
+            filter:    [my_fingerprint]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: A1_B2_C3_D4 }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text: A1 B2 A1 D4 C3
+            tokenizer: whitespace
+            filter:    [fingerprint]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: A1 B2 C3 D4 }
+
+---
+"min_hash":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_min_hash:
+                    type: min_hash
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: the quick brown fox
+            tokenizer: whitespace
+            filter:    [my_min_hash]
+    - length: { tokens: 512 }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text: the quick brown fox
+            tokenizer: whitespace
+            filter:    [min_hash]
+    - length: { tokens: 512 }
+
+---
+"scandinavian_folding":
+    - do:
+        indices.create:
+          index: test
+          body:
+            settings:
+              analysis:
+                filter:
+                  my_scandinavian_folding:
+                    type: scandinavian_folding
+    - do:
+        indices.analyze:
+          index: test
+          body:
+            text: räksmörgås
+            tokenizer: keyword
+            filter:    [my_scandinavian_folding]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: raksmorgas }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+          body:
+            text: räksmörgås
+            tokenizer: keyword
+            filter:    [scandinavian_folding]
+    - length: { tokens: 1 }
+    - match:  { tokens.0.token: raksmorgas }

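The fingerprint expectations in the YAML above (A1_B2_C3_D4, and A1 B2 C3 D4 for the pre-configured filter) follow from Lucene's FingerprintFilter: it buffers the stream, drops duplicates, sorts the remaining terms, and emits them as a single token joined by the separator. A standalone sketch of that behaviour, assuming Lucene's (input, maxOutputTokenSize, separator) constructor and passing 255 only to mirror the provider default mentioned earlier, is below; the demo class is illustrative, not part of the commit.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // Illustrative demo: dedupe + sort + join with '_', printing "A1_B2_C3_D4".
    public final class FingerprintFilterDemo {
        public static void main(String[] args) throws Exception {
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("A1 B2 A1 D4 C3"));
            try (TokenStream stream = new FingerprintFilter(tokenizer, 255, '_')) {
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    System.out.println(term.toString());
                }
                stream.end();
            }
        }
    }
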
+ 8 - 16
test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java

@@ -23,23 +23,16 @@ import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.elasticsearch.common.collect.MapBuilder;
-import org.elasticsearch.index.analysis.ApostropheFilterFactory;
 import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
-import org.elasticsearch.index.analysis.ClassicFilterFactory;
 import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
 import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
-import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
-import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
 import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
-import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
-import org.elasticsearch.index.analysis.KeepWordFilterFactory;
 import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
 import org.elasticsearch.index.analysis.LetterTokenizerFactory;
 import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
-import org.elasticsearch.index.analysis.MinHashTokenFilterFactory;
 import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.NGramTokenizerFactory;
 import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
@@ -47,7 +40,6 @@ import org.elasticsearch.index.analysis.PatternTokenizerFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
-import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
@@ -120,7 +112,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
 
     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
-        .put("apostrophe",                ApostropheFilterFactory.class)
+        .put("apostrophe",                MovedToAnalysisCommon.class)
         .put("arabicnormalization",       MovedToAnalysisCommon.class)
         .put("arabicstem",                ArabicStemTokenFilterFactory.class)
         .put("asciifolding",              MovedToAnalysisCommon.class)
@@ -128,12 +120,12 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("bulgarianstem",             MovedToAnalysisCommon.class)
         .put("cjkbigram",                 MovedToAnalysisCommon.class)
         .put("cjkwidth",                  MovedToAnalysisCommon.class)
-        .put("classic",                   ClassicFilterFactory.class)
+        .put("classic",                   MovedToAnalysisCommon.class)
         .put("commongrams",               MovedToAnalysisCommon.class)
         .put("commongramsquery",          MovedToAnalysisCommon.class)
         .put("czechstem",                 CzechStemTokenFilterFactory.class)
-        .put("decimaldigit",              DecimalDigitFilterFactory.class)
-        .put("delimitedpayload",          DelimitedPayloadTokenFilterFactory.class)
+        .put("decimaldigit",              MovedToAnalysisCommon.class)
+        .put("delimitedpayload",          MovedToAnalysisCommon.class)
         .put("dictionarycompoundword",    MovedToAnalysisCommon.class)
         .put("edgengram",                 MovedToAnalysisCommon.class)
         .put("elision",                   MovedToAnalysisCommon.class)
@@ -159,7 +151,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("irishlowercase",            MovedToAnalysisCommon.class)
         .put("indonesianstem",            MovedToAnalysisCommon.class)
         .put("italianlightstem",          MovedToAnalysisCommon.class)
-        .put("keepword",                  KeepWordFilterFactory.class)
+        .put("keepword",                  MovedToAnalysisCommon.class)
         .put("keywordmarker",             MovedToAnalysisCommon.class)
         .put("kstem",                     MovedToAnalysisCommon.class)
         .put("latvianstem",               MovedToAnalysisCommon.class)
@@ -178,11 +170,11 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("portugueseminimalstem",     MovedToAnalysisCommon.class)
         .put("reversestring",             MovedToAnalysisCommon.class)
         .put("russianlightstem",          MovedToAnalysisCommon.class)
-        .put("scandinavianfolding",       ScandinavianFoldingFilterFactory.class)
+        .put("scandinavianfolding",       MovedToAnalysisCommon.class)
         .put("scandinaviannormalization", MovedToAnalysisCommon.class)
         .put("serbiannormalization",      MovedToAnalysisCommon.class)
         .put("shingle",                   ShingleTokenFilterFactory.class)
-        .put("minhash",                   MinHashTokenFilterFactory.class)
+        .put("minhash",                   MovedToAnalysisCommon.class)
         .put("snowballporter",            MovedToAnalysisCommon.class)
         .put("soraninormalization",       MovedToAnalysisCommon.class)
         .put("soranistem",                MovedToAnalysisCommon.class)
@@ -196,7 +188,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("trim",                      MovedToAnalysisCommon.class)
         .put("truncate",                  MovedToAnalysisCommon.class)
         .put("turkishlowercase",          MovedToAnalysisCommon.class)
-        .put("type",                      KeepTypesFilterFactory.class)
+        .put("type",                      MovedToAnalysisCommon.class)
         .put("uppercase",                 MovedToAnalysisCommon.class)
         .put("worddelimiter",             MovedToAnalysisCommon.class)
         .put("worddelimitergraph",        MovedToAnalysisCommon.class)