
Move remaining pre-configured token filters into analysis-common (#24716)

Moves the remaining pre-configured token filters into the analysis-common module. There were a couple of tests in core that depended on the pre-configured token filters, so I had to touch them:

* `GetTermVectorsCheckDocFreqIT` depended on `type_as_payload` but didn't do anything important with it. I dropped the dependency. Then I moved the test to a single-node test case because we're trying to cut down on the number of `ESIntegTestCase` subclasses.
* `AbstractTermVectorsTestCase` and its subclasses depended on `type_as_payload`. I dropped their usage of the token filter and added an integration test to the `analysis-common` module that exercises the termvectors API with `type_as_payload`.
* `AnalysisModuleTests` expected a few pre-configured token filters to be registered by default. They aren't anymore, so I dropped this assertion. We now assert that `CommonAnalysisPlugin` registers these pre-built token filters in `CommonAnalysisFactoryTests`.
* `SearchQueryIT` and `SuggestSearchIT` had tests that depended on the specific behavior of these token filters, so I moved those tests to integration tests in `analysis-common`.
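
For context, here is the registration pattern the move relies on — a minimal sketch assuming only what the diffs below show (`AnalysisPlugin#getPreConfiguredTokenFilters` and `PreConfiguredTokenFilter.singleton`); the plugin class name itself is illustrative:

```java
import java.util.List;

import static java.util.Collections.singletonList;

import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Illustrative plugin: registers one pre-configured filter the same way
// CommonAnalysisPlugin does in the diff below. A pre-configured filter is
// just a name, a multi-term-aware flag, and a factory over the TokenStream.
public class ExampleAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        // singleton(...) builds one instance and caches it for all index
        // versions, matching the old shim's CachingStrategy.ONE behavior.
        return singletonList(
                PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    }
}
```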
Nik Everett, 8 years ago
commit 7ef390068a
15 files changed, 402 insertions, 700 deletions
1. +0 -16 core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
2. +1 -251 core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
3. +2 -7 core/src/test/java/org/elasticsearch/action/termvectors/AbstractTermVectorsTestCase.java
4. +0 -259 core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsCheckDocFreqIT.java
5. +5 -6 core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java
6. +45 -0 core/src/test/java/org/elasticsearch/index/termvectors/TermVectorsServiceTests.java
7. +0 -9 core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java
8. +0 -24 core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java
9. +0 -101 core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java
10. +60 -1 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
11. +26 -0 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
12. +65 -0 modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/10_match.yml
13. +158 -0 modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yaml
14. +40 -0 modules/analysis-common/src/test/resources/rest-api-spec/test/termvectors/10_payloads.yml
15. +0 -26 test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java

+ 0 - 16
core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java

@@ -278,22 +278,6 @@ public final class AnalysisModule {
          * version uses a set of English stop words that are in
          * lucene-analyzers-common so "stop" is defined in the analysis-common
          * module. */
-        
-        // Add token filters declared in PreBuiltTokenFilters until they have all been migrated
-        for (PreBuiltTokenFilters preBuilt : PreBuiltTokenFilters.values()) {
-            switch (preBuilt) {
-            case LOWERCASE:
-                // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
-                continue;
-            default:
-                if (CachingStrategy.ONE != preBuilt.getCachingStrategy()) {
-                    throw new UnsupportedOperationException("shim not available for " + preBuilt.getCachingStrategy());
-                }
-                String name = preBuilt.name().toLowerCase(Locale.ROOT);
-                preConfiguredTokenFilters.register(name, PreConfiguredTokenFilter.singleton(name, preBuilt.isMultiTermAware(),
-                        tokenStream -> preBuilt.create(tokenStream, Version.CURRENT)));
-            }
-        }
 
         for (AnalysisPlugin plugin: plugins) {
             for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {

+ 1 - 251
core/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java

@@ -20,38 +20,10 @@ package org.elasticsearch.indices.analysis;
 
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
-import org.apache.lucene.analysis.ar.ArabicStemFilter;
-import org.apache.lucene.analysis.br.BrazilianStemFilter;
-import org.apache.lucene.analysis.cjk.CJKBigramFilter;
-import org.apache.lucene.analysis.cjk.CJKWidthFilter;
-import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
-import org.apache.lucene.analysis.core.DecimalDigitFilter;
-import org.apache.lucene.analysis.cz.CzechStemFilter;
-import org.apache.lucene.analysis.de.GermanNormalizationFilter;
-import org.apache.lucene.analysis.de.GermanStemFilter;
-import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
-import org.apache.lucene.analysis.fr.FrenchAnalyzer;
-import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
-import org.apache.lucene.analysis.in.IndicNormalizationFilter;
-import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
-import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
-import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
-import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
-import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
-import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
-import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.snowball.SnowballFilter;
-import org.apache.lucene.analysis.tr.ApostropheFilter;
-import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.Version;
-import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
-import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
 import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
-import org.tartarus.snowball.ext.DutchStemmer;
-import org.tartarus.snowball.ext.FrenchStemmer;
 
 import java.util.Locale;
 
@@ -66,229 +38,7 @@ public enum PreBuiltTokenFilters {
         protected boolean isMultiTermAware() {
             return true;
         }
-    },
-
-    // Extended Token Filters
-    ELISION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    ARABIC_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ArabicStemFilter(tokenStream);
-        }
-    },
-
-    BRAZILIAN_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new BrazilianStemFilter(tokenStream);
-        }
-    },
-
-    CZECH_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new CzechStemFilter(tokenStream);
-        }
-    },
-
-    DUTCH_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, new DutchStemmer());
-        }
-    },
-
-    FRENCH_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, new FrenchStemmer());
-        }
-    },
-
-    GERMAN_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new GermanStemFilter(tokenStream);
-        }
-    },
-
-    RUSSIAN_STEM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SnowballFilter(tokenStream, "Russian");
-        }
-    },
-
-    KEYWORD_REPEAT(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new KeywordRepeatFilter(tokenStream);
-        }
-    },
-
-    ARABIC_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ArabicNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    PERSIAN_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new PersianNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    TYPE_AS_PAYLOAD(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new TypeAsPayloadTokenFilter(tokenStream);
-        }
-    },
-
-    SHINGLE(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ShingleFilter(tokenStream);
-        }
-    },
-
-    GERMAN_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new GermanNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    HINDI_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new HindiNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    INDIC_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new IndicNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    SORANI_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new SoraniNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ScandinavianNormalizationFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ScandinavianFoldingFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    APOSTROPHE(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new ApostropheFilter(tokenStream);
-        }
-    },
-
-    CJK_WIDTH(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new CJKWidthFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    DECIMAL_DIGIT(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new DecimalDigitFilter(tokenStream);
-        }
-        @Override
-        protected boolean isMultiTermAware() {
-            return true;
-        }
-    },
-
-    CJK_BIGRAM(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new CJKBigramFilter(tokenStream);
-        }
-    },
-
-    DELIMITED_PAYLOAD_FILTER(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new DelimitedPayloadTokenFilter(tokenStream, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER);
-        }
-    },
-
-    LIMIT(CachingStrategy.ONE) {
-        @Override
-        public TokenStream create(TokenStream tokenStream, Version version) {
-            return new LimitTokenCountFilter(tokenStream, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS);
-        }
-    },
-
-    ;
+    };
 
     protected boolean isMultiTermAware() {
         return false;
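
After this hunk the enum keeps only `LOWERCASE`, which must stay until `PreBuiltTokenizers` is removed. A sketch of its post-commit shape, assembled from the hunks above (the real class has additional caching machinery not shown here):

```java
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

// Only LOWERCASE survives the migration; everything else now comes from
// CommonAnalysisPlugin.getPreConfiguredTokenFilters() in analysis-common.
public enum PreBuiltTokenFilters {
    LOWERCASE(CachingStrategy.ONE) {
        @Override
        public TokenStream create(TokenStream tokenStream, Version version) {
            return new LowerCaseFilter(tokenStream);
        }

        @Override
        protected boolean isMultiTermAware() {
            return true;
        }
    };

    private final CachingStrategy cachingStrategy;

    PreBuiltTokenFilters(CachingStrategy cachingStrategy) {
        this.cachingStrategy = cachingStrategy;
    }

    public CachingStrategy getCachingStrategy() {
        return cachingStrategy;
    }

    public abstract TokenStream create(TokenStream tokenStream, Version version);

    protected boolean isMultiTermAware() {
        return false;
    }
}
```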

+ 2 - 7
core/src/test/java/org/elasticsearch/action/termvectors/AbstractTermVectorsTestCase.java

@@ -66,7 +66,6 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
 import static org.hamcrest.Matchers.equalTo;
 
 public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
-
     protected static class TestFieldSetting {
         public final String name;
         public final boolean storedOffset;
@@ -211,7 +210,7 @@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
         Settings.Builder settings = Settings.builder()
                 .put(indexSettings())
                 .put("index.analysis.analyzer.tv_test.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase");
+                .putArray("index.analysis.analyzer.tv_test.filter", "lowercase");
         assertAcked(prepareCreate(index).addMapping("type1", mappingBuilder).setSettings(settings).addAlias(new Alias(alias)));
     }
 
@@ -395,11 +394,7 @@ public abstract class AbstractTermVectorsTestCase extends ESIntegTestCase {
                         assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
                         assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
                     }
-                    if (field.storedPayloads && testConfig.requestPayloads) {
-                        assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
-                    } else {
-                        assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
-                    }
+                    assertNull("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload());
                 }
             }
             assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());

+ 0 - 259
core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsCheckDocFreqIT.java

@@ -1,259 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.action.termvectors;
-
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.common.xcontent.ToXContent;
-import org.elasticsearch.common.xcontent.XContentBuilder;
-import org.elasticsearch.common.xcontent.XContentFactory;
-import org.elasticsearch.test.ESIntegTestCase;
-import org.hamcrest.Matchers;
-
-import java.io.IOException;
-
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
-import static org.hamcrest.Matchers.equalTo;
-
-public class GetTermVectorsCheckDocFreqIT extends ESIntegTestCase {
-
-    @Override
-    protected int numberOfShards() {
-        return 1;
-    }
-
-    @Override
-    protected int numberOfReplicas() {
-        return 0;
-    }
-
-    @Override
-    public Settings indexSettings() {
-        return Settings.builder()
-                .put(super.indexSettings())
-                .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
-                .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")
-                .build();
-    }
-
-    public void testSimpleTermVectors() throws IOException {
-        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
-                .startObject("properties")
-                        .startObject("field")
-                            .field("type", "text")
-                            .field("term_vector", "with_positions_offsets_payloads")
-                            .field("analyzer", "tv_test")
-                        .endObject()
-                .endObject()
-                .endObject().endObject();
-        assertAcked(prepareCreate("test").addMapping("type1", mapping));
-        ensureGreen();
-        int numDocs = 15;
-        for (int i = 0; i < numDocs; i++) {
-            client().prepareIndex("test", "type1", Integer.toString(i))
-                    .setSource(XContentFactory.jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
-                    // 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
-                    // 31the34 35lazy39 40dog43
-                            .endObject()).execute().actionGet();
-            refresh();
-        }
-        String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
-        int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
-        int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
-        int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
-        int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
-        for (int i = 0; i < numDocs; i++) {
-            checkAllInfo(numDocs, values, freq, pos, startOffset, endOffset, i);
-            checkWithoutTermStatistics(numDocs, values, freq, pos, startOffset, endOffset, i);
-            checkWithoutFieldStatistics(numDocs, values, freq, pos, startOffset, endOffset, i);
-        }
-    }
-
-    private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset,
-            int i) throws IOException {
-        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
-                .setPositions(true).setTermStatistics(true).setFieldStatistics(false).setSelectedFields();
-        TermVectorsResponse response = resp.execute().actionGet();
-        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
-        Fields fields = response.getFields();
-        assertThat(fields.size(), equalTo(1));
-        Terms terms = fields.terms("field");
-        assertThat(terms.size(), equalTo(8L));
-        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
-        assertThat(terms.getDocCount(), Matchers.equalTo(-1));
-        assertThat(terms.getSumDocFreq(), equalTo((long) -1));
-        TermsEnum iterator = terms.iterator();
-        for (int j = 0; j < values.length; j++) {
-            String string = values[j];
-            BytesRef next = iterator.next();
-            assertThat(next, Matchers.notNullValue());
-            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
-            assertThat(next, Matchers.notNullValue());
-            if (string.equals("the")) {
-                assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
-            } else {
-                assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
-            }
-
-            PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
-            assertThat(docsAndPositions.nextDoc(), equalTo(0));
-            assertThat(freq[j], equalTo(docsAndPositions.freq()));
-            assertThat(iterator.docFreq(), equalTo(numDocs));
-            int[] termPos = pos[j];
-            int[] termStartOffset = startOffset[j];
-            int[] termEndOffset = endOffset[j];
-            assertThat(termPos.length, equalTo(freq[j]));
-            assertThat(termStartOffset.length, equalTo(freq[j]));
-            assertThat(termEndOffset.length, equalTo(freq[j]));
-            for (int k = 0; k < freq[j]; k++) {
-                int nextPosition = docsAndPositions.nextPosition();
-                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
-                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
-                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
-                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
-            }
-        }
-        assertThat(iterator.next(), Matchers.nullValue());
-
-        XContentBuilder xBuilder = XContentFactory.jsonBuilder();
-        response.toXContent(xBuilder, null);
-        String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
-        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
-                + i
-                + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
-        assertThat(utf8, equalTo(expectedString));
-
-    }
-
-    private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset,
-            int i) throws IOException {
-        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
-                .setPositions(true).setTermStatistics(false).setFieldStatistics(true).setSelectedFields();
-        assertThat(resp.request().termStatistics(), equalTo(false));
-        TermVectorsResponse response = resp.execute().actionGet();
-        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
-        Fields fields = response.getFields();
-        assertThat(fields.size(), equalTo(1));
-        Terms terms = fields.terms("field");
-        assertThat(terms.size(), equalTo(8L));
-        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
-        assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
-        assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
-        TermsEnum iterator = terms.iterator();
-        for (int j = 0; j < values.length; j++) {
-            String string = values[j];
-            BytesRef next = iterator.next();
-            assertThat(next, Matchers.notNullValue());
-            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
-            assertThat(next, Matchers.notNullValue());
-
-            assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
-
-            PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
-            assertThat(docsAndPositions.nextDoc(), equalTo(0));
-            assertThat(freq[j], equalTo(docsAndPositions.freq()));
-            assertThat(iterator.docFreq(), equalTo(-1));
-            int[] termPos = pos[j];
-            int[] termStartOffset = startOffset[j];
-            int[] termEndOffset = endOffset[j];
-            assertThat(termPos.length, equalTo(freq[j]));
-            assertThat(termStartOffset.length, equalTo(freq[j]));
-            assertThat(termEndOffset.length, equalTo(freq[j]));
-            for (int k = 0; k < freq[j]; k++) {
-                int nextPosition = docsAndPositions.nextPosition();
-                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
-                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
-                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
-                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
-            }
-        }
-        assertThat(iterator.next(), Matchers.nullValue());
-
-        XContentBuilder xBuilder = XContentFactory.jsonBuilder();
-        response.toXContent(xBuilder, null);
-        String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
-        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
-                + i
-                + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
-        assertThat(utf8, equalTo(expectedString));
-
-    }
-
-    private void checkAllInfo(int numDocs, String[] values, int[] freq, int[][] pos, int[][] startOffset, int[][] endOffset, int i)
-            throws IOException {
-        TermVectorsRequestBuilder resp = client().prepareTermVectors("test", "type1", Integer.toString(i)).setPayloads(true).setOffsets(true)
-                .setPositions(true).setFieldStatistics(true).setTermStatistics(true).setSelectedFields();
-        assertThat(resp.request().fieldStatistics(), equalTo(true));
-        TermVectorsResponse response = resp.execute().actionGet();
-        assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
-        Fields fields = response.getFields();
-        assertThat(fields.size(), equalTo(1));
-        Terms terms = fields.terms("field");
-        assertThat(terms.size(), equalTo(8L));
-        assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
-        assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
-        assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
-        TermsEnum iterator = terms.iterator();
-        for (int j = 0; j < values.length; j++) {
-            String string = values[j];
-            BytesRef next = iterator.next();
-            assertThat(next, Matchers.notNullValue());
-            assertThat("expected " + string, string, equalTo(next.utf8ToString()));
-            assertThat(next, Matchers.notNullValue());
-            if (string.equals("the")) {
-                assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
-            } else {
-                assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
-            }
-
-            PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
-            assertThat(docsAndPositions.nextDoc(), equalTo(0));
-            assertThat(freq[j], equalTo(docsAndPositions.freq()));
-            assertThat(iterator.docFreq(), equalTo(numDocs));
-            int[] termPos = pos[j];
-            int[] termStartOffset = startOffset[j];
-            int[] termEndOffset = endOffset[j];
-            assertThat(termPos.length, equalTo(freq[j]));
-            assertThat(termStartOffset.length, equalTo(freq[j]));
-            assertThat(termEndOffset.length, equalTo(freq[j]));
-            for (int k = 0; k < freq[j]; k++) {
-                int nextPosition = docsAndPositions.nextPosition();
-                assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
-                assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
-                assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
-                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
-            }
-        }
-        assertThat(iterator.next(), Matchers.nullValue());
-
-        XContentBuilder xBuilder = XContentFactory.jsonBuilder();
-        response.toXContent(xBuilder, ToXContent.EMPTY_PARAMS);
-        String utf8 = xBuilder.bytes().utf8ToString().replaceFirst("\"took\":\\d+,", "");;
-        String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\""
-                + i
-                + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
-        assertThat(utf8, equalTo(expectedString));
-    }
-
-}

+ 5 - 6
core/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java

@@ -193,7 +193,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
                 .setSettings(Settings.builder()
                         .put(indexSettings())
                         .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
-                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
+                        .putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
         for (int i = 0; i < 10; i++) {
             client().prepareIndex("test", "type1", Integer.toString(i))
                     .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
@@ -278,7 +278,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
         assertAcked(prepareCreate("test").addMapping("type1", mapping)
                 .setSettings(Settings.builder()
                         .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
-                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
+                        .putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
         for (int i = 0; i < 10; i++) {
             client().prepareIndex("test", "type1", Integer.toString(i))
                     .setSource(jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
@@ -585,7 +585,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
                 .setSettings(Settings.builder()
                         .put(indexSettings())
                         .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
-                        .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
+                        .putArray("index.analysis.analyzer.tv_test.filter", "lowercase")));
 
         ensureGreen();
 
@@ -645,9 +645,8 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
                 assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
                 assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
                 assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
-                if (withPayloads) {
-                    assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
-                }
+                // We never configure an analyzer with payloads for this test so this is never returned
+                assertNull("term: " + string, docsAndPositions.getPayload());
             }
         }
         assertThat(iterator.next(), nullValue());

+ 45 - 0
core/src/test/java/org/elasticsearch/index/termvectors/TermVectorsServiceTests.java

@@ -19,6 +19,9 @@
 
 package org.elasticsearch.index.termvectors;
 
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
 import org.elasticsearch.action.termvectors.TermVectorsRequest;
 import org.elasticsearch.action.termvectors.TermVectorsResponse;
 import org.elasticsearch.common.settings.Settings;
@@ -28,6 +31,7 @@ import org.elasticsearch.index.shard.IndexShard;
 import org.elasticsearch.indices.IndicesService;
 import org.elasticsearch.test.ESSingleNodeTestCase;
 
+import java.io.IOException;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Stream;
@@ -71,4 +75,45 @@ public class TermVectorsServiceTests extends ESSingleNodeTestCase {
         assertThat(response, notNullValue());
         assertThat(response.getTookInMillis(), equalTo(TimeUnit.NANOSECONDS.toMillis(longs.get(1) - longs.get(0))));
     }
+
+    public void testDocFreqs() throws IOException {
+        XContentBuilder mapping = jsonBuilder()
+            .startObject()
+                .startObject("doc")
+                    .startObject("properties")
+                        .startObject("text")
+                            .field("type", "text")
+                            .field("term_vector", "with_positions_offsets_payloads")
+                        .endObject()
+                    .endObject()
+                .endObject()
+            .endObject();
+        Settings settings = Settings.builder()
+                .put("number_of_shards", 1)
+                .build();
+        createIndex("test", settings, "doc", mapping);
+        ensureGreen();
+
+        int max = between(3, 10);
+        BulkRequestBuilder bulk = client().prepareBulk();
+        for (int i = 0; i < max; i++) {
+            bulk.add(client().prepareIndex("test", "doc", Integer.toString(i))
+                    .setSource("text", "the quick brown fox jumped over the lazy dog"));
+        }
+        bulk.get();
+
+        TermVectorsRequest request = new TermVectorsRequest("test", "doc", "0").termStatistics(true);
+
+        IndicesService indicesService = getInstanceFromNode(IndicesService.class);
+        IndexService test = indicesService.indexService(resolveIndex("test"));
+        IndexShard shard = test.getShardOrNull(0);
+        assertThat(shard, notNullValue());
+        TermVectorsResponse response = TermVectorsService.getTermVectors(shard, request);
+
+        Terms terms = response.getFields().terms("text");
+        TermsEnum iterator = terms.iterator();
+        while (iterator.next() != null) {
+            assertEquals(max, iterator.docFreq());
+        }
+    }
 }

+ 0 - 9
core/src/test/java/org/elasticsearch/indices/analysis/AnalysisModuleTests.java

@@ -23,11 +23,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.hunspell.Dictionary;
-import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.store.Directory;
@@ -127,12 +124,6 @@ public class AnalysisModuleTests extends ESTestCase {
         testSimpleConfiguration(settings);
     }
 
-    public void testDefaultFactoryTokenFilters() throws IOException {
-        assertTokenFilter("keyword_repeat", KeywordRepeatFilter.class);
-        assertTokenFilter("persian_normalization", PersianNormalizationFilter.class);
-        assertTokenFilter("arabic_normalization", ArabicNormalizationFilter.class);
-    }
-
     public void testAnalyzerAliasNotAllowedPost5x() throws IOException {
         Settings settings = Settings.builder()
             .put("index.analysis.analyzer.foobar.type", "standard")

+ 0 - 24
core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java

@@ -1550,30 +1550,6 @@ public class SearchQueryIT extends ESIntegTestCase {
         assertHitCount(searchResponse, 2);
     }
 
-    public void testMatchQueryWithStackedStems() throws IOException {
-        CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
-                .put(indexSettings())
-                .put("index.analysis.analyzer.index.type", "custom")
-                .put("index.analysis.analyzer.index.tokenizer", "standard")
-                .put("index.analysis.analyzer.index.filter", "lowercase")
-                .put("index.analysis.analyzer.search.type", "custom")
-                .put("index.analysis.analyzer.search.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.search.filter", "lowercase", "keyword_repeat", "porter_stem", "unique_stem")
-                .put("index.analysis.filter.unique_stem.type", "unique")
-                .put("index.analysis.filter.unique_stem.only_on_same_position", true));
-        assertAcked(builder.addMapping("test", "text", "type=text,analyzer=index,search_analyzer=search"));
-
-        client().prepareIndex("test", "test", "1").setSource("text", "the fox runs across the street").get();
-        refresh();
-        SearchResponse searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox runs").operator(Operator.AND)).get();
-        assertHitCount(searchResponse, 1);
-
-        client().prepareIndex("test", "test", "2").setSource("text", "run fox run").get();
-        refresh();
-        searchResponse = client().prepareSearch("test").setQuery(matchQuery("text", "fox runs").operator(Operator.AND)).get();
-        assertHitCount(searchResponse, 2);
-    }
-
     public void testQueryStringWithSynonyms() throws IOException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
                 .put(indexSettings())

+ 0 - 101
core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java

@@ -694,107 +694,6 @@ public class SuggestSearchIT extends ESIntegTestCase {
         assertSuggestion(searchSuggest, 0, "simple_phrase", "xorr the god jewel");
     }
 
-    public void testPhraseBoundaryCases() throws IOException, URISyntaxException {
-        CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
-                .put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
-                .put("index.analysis.analyzer.body.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.body.filter", "lowercase")
-                .put("index.analysis.analyzer.bigram.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase")
-                .put("index.analysis.analyzer.ngram.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase")
-                .put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard")
-                .putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase")
-                .put("index.analysis.filter.my_shingle.type", "shingle")
-                .put("index.analysis.filter.my_shingle.output_unigrams", false)
-                .put("index.analysis.filter.my_shingle.min_shingle_size", 2)
-                .put("index.analysis.filter.my_shingle.max_shingle_size", 2)
-                .put("index.analysis.filter.my_shingle2.type", "shingle")
-                .put("index.analysis.filter.my_shingle2.output_unigrams", true)
-                .put("index.analysis.filter.my_shingle2.min_shingle_size", 2)
-                .put("index.analysis.filter.my_shingle2.max_shingle_size", 2));
-
-        XContentBuilder mapping = XContentFactory.jsonBuilder()
-                    .startObject().startObject("type1")
-                .startObject("properties")
-                .startObject("body").field("type", "text").field("analyzer", "body").endObject()
-                .startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject()
-                .startObject("ngram").field("type", "text").field("analyzer", "ngram").endObject()
-                .endObject()
-                .endObject().endObject();
-        assertAcked(builder.addMapping("type1", mapping));
-        ensureGreen();
-
-        String[] strings = new String[]{
-            "Xorr the God-Jewel",
-            "Grog the God-Crusher",
-            "Xorn",
-            "Walter Newell",
-            "Wanda Maximoff",
-            "Captain America",
-            "American Ace",
-            "Wundarr the Aquarian",
-            "Will o' the Wisp",
-            "Xemnu the Titan"
-        };
-        for (String line : strings) {
-            index("test", "type1", line, "body", line, "bigram", line, "ngram", line);
-        }
-        refresh();
-
-        NumShards numShards = getNumShards("test");
-
-        // Lets make sure some things throw exceptions
-        PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram")
-                .analyzer("body")
-                .addCandidateGenerator(candidateGenerator("does_not_exist").minWordLength(1).suggestMode("always"))
-                .realWordErrorLikelihood(0.95f)
-                .maxErrors(0.5f)
-                .size(1);
-        phraseSuggestion.clearCandidateGenerators().analyzer(null);
-        try {
-            searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
-            fail("analyzer does only produce ngrams");
-        } catch (SearchPhaseExecutionException e) {
-        }
-
-        phraseSuggestion.analyzer("bigram");
-        try {
-            searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
-            fail("analyzer does only produce ngrams");
-        } catch (SearchPhaseExecutionException e) {
-        }
-
-        // Now we'll make sure some things don't
-        phraseSuggestion.forceUnigrams(false);
-        searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
-
-        // Field doesn't produce unigrams but the analyzer does
-        phraseSuggestion.forceUnigrams(true).analyzer("ngram");
-        searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
-
-        phraseSuggestion = phraseSuggestion("ngram")
-                .analyzer("myDefAnalyzer")
-                .forceUnigrams(true)
-                .realWordErrorLikelihood(0.95f)
-                .maxErrors(0.5f)
-                .size(1)
-                .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
-        Suggest suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
-
-        // "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
-        // earlier term (xorn):
-        assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
-
-        phraseSuggestion.analyzer(null);
-        suggest = searchSuggest( "xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
-
-        // In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
-        // probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
-        // others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
-        assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
-    }
-
     public void testDifferentShardSize() throws Exception {
         createIndex("test");
         ensureGreen();

+ 60 - 1
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

@@ -21,13 +21,31 @@ package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+import org.apache.lucene.analysis.ar.ArabicStemFilter;
+import org.apache.lucene.analysis.br.BrazilianStemFilter;
+import org.apache.lucene.analysis.cjk.CJKBigramFilter;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.UpperCaseFilter;
+import org.apache.lucene.analysis.cz.CzechStemFilter;
+import org.apache.lucene.analysis.de.GermanNormalizationFilter;
+import org.apache.lucene.analysis.de.GermanStemFilter;
 import org.apache.lucene.analysis.en.KStemFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
+import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
+import org.apache.lucene.analysis.in.IndicNormalizationFilter;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
 import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
+import org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilter;
+import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
 import org.apache.lucene.analysis.miscellaneous.TrimFilter;
 import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
@@ -35,16 +53,25 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
+import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.ClassicFilter;
+import org.apache.lucene.analysis.tr.ApostropheFilter;
+import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
+import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
+import org.tartarus.snowball.ext.DutchStemmer;
+import org.tartarus.snowball.ext.FrenchStemmer;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -74,29 +101,61 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
     @Override
     public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
         List<PreConfiguredTokenFilter> filters = new ArrayList<>();
-        filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, input -> new ASCIIFoldingFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("common_grams", false,
                 input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
+        filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
+                new DelimitedPayloadTokenFilter(input,
+                        DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
+                        DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
+        filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
         filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input ->
                 new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
         // TODO deprecate edgeNGram
         filters.add(PreConfiguredTokenFilter.singleton("edgeNGram", false, input ->
                 new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE)));
+        filters.add(PreConfiguredTokenFilter.singleton("elision", true,
+                input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
+        filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
+        filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
                 new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
+        filters.add(PreConfiguredTokenFilter.singleton("limit", false, input ->
+                new LimitTokenCountFilter(input,
+                        LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
+                        LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
         filters.add(PreConfiguredTokenFilter.singleton("ngram", false, NGramTokenFilter::new));
         // TODO deprecate nGram
         filters.add(PreConfiguredTokenFilter.singleton("nGram", false, NGramTokenFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("reverse", false, input -> new ReverseStringFilter(input)));
+        filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
+        filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("shingle", false, ShingleFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
+        filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
         // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
         filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
         filters.add(PreConfiguredTokenFilter.singleton("trim", false, TrimFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
+        filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("unique", false, input -> new UniqueTokenFilter(input)));
         filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->

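The boolean second argument to `PreConfiguredTokenFilter.singleton` (named `useFilterForMultiTermQueries` in the Elasticsearch source) controls whether the filter is also applied when analyzing multi-term queries such as prefix and wildcard. A minimal sketch, reusing two registrations from the hunk above, of how that choice tends to split (character-level normalizers opt in; filters that stem tokens or change token structure opt out):

```java
// Safe for multi-term queries: cjk_width only folds full- and half-width
// characters, so it cannot change token boundaries in a wildcard pattern.
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));

// Not safe: shingle joins adjacent tokens, which is meaningless for the
// single term of a prefix or wildcard query.
filters.add(PreConfiguredTokenFilter.singleton("shingle", false, ShingleFilter::new));
```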
+ 27 - 0
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java

@@ -20,6 +20,8 @@
 package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.en.PorterStemFilterFactory;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
+import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
 import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
@@ -68,22 +70,47 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
     @Override
     protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
         Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenFilters());
+        filters.put("apostrophe", null);
+        filters.put("arabic_normalization", null);
+        filters.put("arabic_stem", null);
         filters.put("asciifolding", null);
+        filters.put("brazilian_stem", null);
+        filters.put("cjk_bigram", null);
+        filters.put("cjk_width", null);
         filters.put("classic", null);
         filters.put("common_grams", null);
+        filters.put("czech_stem", null);
+        filters.put("decimal_digit", null);
+        filters.put("delimited_payload_filter", DelimitedPayloadTokenFilterFactory.class);
+        filters.put("dutch_stem", SnowballPorterFilterFactory.class);
         filters.put("edge_ngram", null);
         filters.put("edgeNGram", null);
+        filters.put("elision", null);
+        filters.put("french_stem", SnowballPorterFilterFactory.class);
+        filters.put("german_stem", null);
+        filters.put("hindi_normalization", null);
+        filters.put("indic_normalization", null);
+        filters.put("keyword_repeat", null);
         filters.put("kstem", null);
         filters.put("length", null);
+        filters.put("limit", LimitTokenCountFilterFactory.class);
         filters.put("ngram", null);
         filters.put("nGram", null);
+        filters.put("persian_normalization", null);
         filters.put("porter_stem", null);
         filters.put("reverse", ReverseStringFilterFactory.class);
+        filters.put("russian_stem", SnowballPorterFilterFactory.class);
+        filters.put("scandinavian_normalization", null);
+        filters.put("scandinavian_folding", null);
+        filters.put("shingle", null);
         filters.put("snowball", SnowballPorterFilterFactory.class);
+        filters.put("sorani_normalization", null);
         filters.put("stemmer", PorterStemFilterFactory.class);
         filters.put("stop", null);
         filters.put("trim", null);
         filters.put("truncate", null);
+        filters.put("type_as_payload", null);
         filters.put("unique", Void.class);
         filters.put("uppercase", null);
         filters.put("word_delimiter", null);

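The right-hand side of this map follows the `getPreConfiguredTokenFilters()` contract from `AnalysisFactoryTestCase`; restating the convention (a reading of that contract, not new behavior):

```java
// null: backed by a Lucene factory of the same name, which the test
// framework can look up on its own.
filters.put("stop", null);

// A factory class: backed by a Lucene factory registered under a different
// name, so it must be named explicitly.
filters.put("snowball", SnowballPorterFilterFactory.class);

// Void.class: no Lucene factory exists for this filter at all.
filters.put("unique", Void.class);
```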
+ 65 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/10_match.yml

@@ -0,0 +1,65 @@
+# integration tests for queries with specific analysis chains
+
+"match query with stacked stems":
+  # Tests that the match query's stemmed tokens are "stacked" on top of the
+  # unstemmed versions in the same position.
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 1
+            analysis:
+              analyzer:
+                index:
+                  tokenizer: standard
+                  filter: [lowercase]
+                search:
+                  tokenizer: standard
+                  filter: [lowercase, keyword_repeat, porter_stem, unique_stem]
+              filter:
+                unique_stem:
+                  type: unique
+                  only_on_same_position: true
+          mappings:
+            doc:
+              properties:
+                text:
+                  type: text
+                  analyzer: index
+                  search_analyzer: search
+
+  - do:
+      index:
+        index: test
+        type:  doc
+        id:    1
+        body:  { "text": "the fox runs across the street" }
+        refresh: true
+
+  - do:
+      search:
+        body:
+          query:
+            match:
+              text: fox runs
+              operator: AND
+  - match: {hits.total: 1}
+
+  - do:
+      index:
+        index: test
+        type:  doc
+        id:    2
+        body:  { "text": "run fox run" }
+        refresh: true
+
+  - do:
+      search:
+        body:
+          query:
+            match:
+              text: fox runs
+              operator: AND
+  - match: {hits.total: 2}

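Why the second search matches both documents: `keyword_repeat` emits every token twice (the first copy flagged as a keyword so stemmers leave it alone), `porter_stem` stems the second copy, and `unique` with `only_on_same_position: true` drops the duplicates where stemming changed nothing. A self-contained sketch against plain Lucene, using `RemoveDuplicatesTokenFilter` as a stand-in for the position-aware `unique` filter and a whitespace tokenizer for brevity:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class StackedStemsExample {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("fox runs"));
        // keyword_repeat -> porter_stem -> unique(only_on_same_position)
        TokenStream ts = new RemoveDuplicatesTokenFilter(
                new PorterStemFilter(new KeywordRepeatFilter(tokenizer)));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        int position = -1;
        while (ts.incrementToken()) {
            position += posInc.getPositionIncrement();
            System.out.println(position + ": " + term);
        }
        ts.end();
        ts.close();
        // Prints the stemmed token stacked at the same position:
        // 0: fox
        // 1: runs
        // 1: run
    }
}
```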
+ 158 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/search.suggest/20_phrase.yaml

@@ -0,0 +1,158 @@
+# Integration tests for the phrase suggester with a few analyzers
+
+setup:
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 1
+            analysis:
+              analyzer:
+                body:
+                  tokenizer: standard
+                  filter: [lowercase]
+                bigram:
+                  tokenizer: standard
+                  filter: [lowercase, bigram]
+                ngram:
+                  tokenizer: standard
+                  filter: [lowercase, ngram]
+              filter:
+                bigram:
+                  type: shingle
+                  output_unigrams: false
+                  min_shingle_size: 2
+                  max_shingle_size: 2
+                ngram:
+                  type: shingle
+                  output_unigrams: true
+                  min_shingle_size: 2
+                  max_shingle_size: 2
+          mappings:
+            doc:
+              properties:
+                body:
+                  type: text
+                  analyzer: body
+                  fields:
+                    bigram:
+                      type: text
+                      analyzer: bigram
+                    ngram:
+                      type: text
+                      analyzer: ngram
+
+  - do:
+      bulk:
+        index: test
+        type: doc
+        refresh: true
+        body: |
+          { "index": {} }
+          { "body": "Xorr the God-Jewel" }
+          { "index": {} }
+          { "body": "Xorn" }
+
+---
+"sorts by score":
+  - do:
+      search:
+        size: 0
+        index: test
+        body:
+          suggest:
+            text: xor the got-jewel
+            test:
+              phrase:
+                field: body.ngram
+                force_unigrams: true
+                max_errors: 0.5
+                direct_generator:
+                  - field: body.ngram
+                    min_word_length: 1
+                    suggest_mode: always
+
+  - match: {suggest.test.0.options.0.text: xorr the god jewel}
+  - match: {suggest.test.0.options.1.text: xorn the god jewel}
+
+---
+"breaks ties by sorting terms":
+  # This runs the suggester without bigrams so we can be sure of the sort order
+  - do:
+      search:
+        size: 0
+        index: test
+        body:
+          suggest:
+            text: xor the got-jewel
+            test:
+              phrase:
+                field: body
+                analyzer: body
+                force_unigrams: true
+                max_errors: 0.5
+                direct_generator:
+                  - field: body
+                    min_word_length: 1
+                    suggest_mode: always
+
+  # The scores are identical but xorn comes first because it sorts first
+  - match: {suggest.test.0.options.0.text: xorn the god jewel}
+  - match: {suggest.test.0.options.1.text: xorr the god jewel}
+  - match: {suggest.test.0.options.0.score: $body.suggest.test.0.options.1.score}
+
+---
+"fails when asked to run on a field without unigrams":
+  - do:
+      catch: /since it doesn't emit unigrams/
+      search:
+        size: 0
+        index: test
+        body:
+          suggest:
+            text: xor the got-jewel
+            test:
+              phrase:
+                field: body.bigram
+
+  - do:
+      catch: /since it doesn't emit unigrams/
+      search:
+        size: 0
+        index: test
+        body:
+          suggest:
+            text: xor the got-jewel
+            test:
+              phrase:
+                field: body.bigram
+                analyzer: bigram
+
+---
+"doesn't fail when asked to run on a field without unigrams when force_unigrams=false":
+  - do:
+      search:
+        size: 0
+        index: test
+        body:
+          suggest:
+            text: xor the got-jewel
+            test:
+              phrase:
+                field: body.bigram
+                force_unigrams: false
+
+  - do:
+      search:
+        size: 0
+        index: test
+        body:
+          suggest:
+            text: xor the got-jewel
+            test:
+              phrase:
+                field: body.bigram
+                analyzer: bigram
+                force_unigrams: false

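The failure cases above hinge on `output_unigrams: false`: the `body.bigram` field contains only two-word shingles, so the phrase suggester has no unigram statistics to score candidates with unless `force_unigrams` is turned off. A small sketch against plain Lucene's `ShingleFilter` showing what that field actually indexes (the whitespace tokenizer and sample text are illustrative only):

```java
import java.io.StringReader;

import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class BigramOnlyExample {
    public static void main(String[] args) throws Exception {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("xorr the god jewel"));
        // Mirrors the "bigram" filter above: shingles of exactly two tokens,
        // with the single-token originals suppressed.
        ShingleFilter bigrams = new ShingleFilter(tokenizer, 2, 2);
        bigrams.setOutputUnigrams(false);
        CharTermAttribute term = bigrams.addAttribute(CharTermAttribute.class);
        bigrams.reset();
        while (bigrams.incrementToken()) {
            System.out.println(term); // "xorr the", "the god", "god jewel"
        }
        bigrams.end();
        bigrams.close();
    }
}
```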
+ 40 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/termvectors/10_payloads.yml

@@ -0,0 +1,40 @@
+"term vectors with payloads tests":
+  # Tests term vectors with payloads. This is in the analysis-common module
+  # because there are no token filters that support payloads in core.
+  - do:
+      indices.create:
+          index: test
+          body:
+            settings:
+              number_of_shards: 1
+              number_of_replicas: 1
+              analysis:
+                analyzer:
+                  has_payloads:
+                    tokenizer: standard
+                    filter: [type_as_payload]
+            mappings:
+              doc:
+                properties:
+                  text:
+                    type: text
+                    term_vector: with_positions_offsets_payloads
+                    analyzer: has_payloads
+
+  - do:
+      index:
+        index: test
+        type:  doc
+        id:    1
+        refresh: true
+        body:
+          text: The quick brown fox is brown.
+
+  - do:
+      termvectors:
+        index: test
+        type:  doc
+        id:    1
+        payloads: true
+  - match: {term_vectors.text.field_statistics.sum_doc_freq: 5}
+  - match: {term_vectors.text.terms.brown.tokens.0.payload: PEFMUEhBTlVNPg==}

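The payload assertion is easier to read knowing where the value comes from: `type_as_payload` stores each token's type as its payload, the standard tokenizer types plain words as `<ALPHANUM>`, and the term vectors API base64-encodes payload bytes in the response. A quick sketch reproducing the expected string:

```java
import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class ExpectedPayload {
    public static void main(String[] args) {
        // base64(UTF-8 bytes of the standard tokenizer's word type)
        String payload = Base64.getEncoder()
                .encodeToString("<ALPHANUM>".getBytes(StandardCharsets.UTF_8));
        System.out.println(payload); // PEFMUEhBTlVNPg==
    }
}
```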
+ 0 - 26
test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java

@@ -19,8 +19,6 @@
 
 package org.elasticsearch.indices.analysis;
 
-import org.apache.lucene.analysis.en.PorterStemFilterFactory;
-import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
 import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
@@ -97,7 +95,6 @@ import java.util.Collection;
 import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
@@ -343,29 +340,6 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         Map<String, Class<?>> filters = new HashMap<>();
         filters.put("standard", null);
         filters.put("lowercase", null);
-        // TODO remove the loop below once all the tokenizers are migrated out of PreBuiltTokenFilters
-        for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) {
-            Class<?> luceneFactoryClass;
-            switch (tokenizer) {
-            case LOWERCASE:
-                // This has been migrated but has to stick around until PreBuiltTokenizers is removed.
-                continue;
-            case DUTCH_STEM:
-            case FRENCH_STEM:
-            case RUSSIAN_STEM:
-                luceneFactoryClass = SnowballPorterFilterFactory.class;
-                break;
-            case DELIMITED_PAYLOAD_FILTER:
-                luceneFactoryClass = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
-                break;
-            case LIMIT:
-                luceneFactoryClass = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class;
-                break;
-            default:
-                luceneFactoryClass = null;
-            }
-            filters.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClass);
-        }
         return filters;
     }