
Moved tokenizers to the analysis-common module (#30538)

The following tokenizers were moved: classic, edge_ngram,
letter, lowercase, ngram, path_hierarchy, pattern, thai, uax_url_email and
whitespace.

Left the keyword tokenizer factory in the server module, because
normalizers directly depend on it. This should be addressed in a
follow-up change.

Relates to #23658
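
For context, the mechanism this change relies on: a plugin overrides AnalysisPlugin#getTokenizers and maps settings-visible tokenizer names to factory constructor references, which is exactly what the CommonAnalysisPlugin hunk below does for the moved tokenizers. A minimal sketch (the "my_keyword_clone" name is illustrative, not part of this commit; KeywordTokenizerFactory is used here only because its constructor is public):

import java.util.Map;
import java.util.TreeMap;

import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
        // Each entry maps a tokenizer name usable in index settings to a
        // factory constructor, just as the diff below does for thai, ngram,
        // classic, and the rest. Name here is hypothetical.
        tokenizers.put("my_keyword_clone", KeywordTokenizerFactory::new);
        return tokenizers;
    }
}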
Martijn van Groningen 7 years ago
parent
commit
7b95470897
41 changed files with 679 additions and 336 deletions
  1. +1 -1
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharMatcher.java
  2. +3 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicTokenizerFactory.java
  3. +40 -0
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
  4. +4 -6
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java
  5. +3 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LetterTokenizerFactory.java
  6. +4 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LowerCaseTokenizerFactory.java
  7. +3 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java
  8. +3 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PathHierarchyTokenizerFactory.java
  9. +3 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PatternTokenizerFactory.java
  10. +3 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ThaiTokenizerFactory.java
  11. +4 -3
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UAX29URLEmailTokenizerFactory.java
  12. +3 -2
      modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WhitespaceTokenizerFactory.java
  13. +1 -1
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharMatcherTests.java
  14. +30 -4
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
  15. +1 -1
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactoryTests.java
  16. +32 -17
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/DisableGraphQueryTests.java
  17. +0 -2
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java
  18. +1 -1
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PathHierarchyTokenizerFactoryTests.java
  19. +5 -8
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java
  20. +1 -1
      modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WhitespaceTokenizerFactoryTests.java
  21. +0 -0
      modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/synonyms.json
  22. +0 -0
      modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/synonyms.txt
  23. +0 -0
      modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/synonyms_wordnet.txt
  24. +371 -0
      modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
  25. +30 -0
      modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_analyze.yml
  26. +94 -0
      modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/20_ngram_search.yml
  27. +0 -30
      rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml
  28. +1 -0
      server/src/main/java/org/elasticsearch/index/analysis/KeywordTokenizerFactory.java
  29. +1 -28
      server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
  30. +0 -73
      server/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java
  31. +3 -3
      server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
  32. +3 -3
      server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java
  33. +9 -1
      server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java
  34. +2 -39
      server/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java
  35. +1 -1
      server/src/test/java/org/elasticsearch/indices/template/SimpleIndexTemplateIT.java
  36. +2 -2
      server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
  37. +3 -3
      server/src/test/java/org/elasticsearch/search/functionscore/QueryRescorerIT.java
  38. +1 -1
      server/src/test/java/org/elasticsearch/search/query/MultiMatchQueryIT.java
  39. +1 -53
      server/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java
  40. +1 -1
      server/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java
  41. +11 -37
      test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java

+ 1 - 1
server/src/main/java/org/elasticsearch/index/analysis/CharMatcher.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharMatcher.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import java.util.HashSet;
 import java.util.Set;

+ 3 - 2
server/src/main/java/org/elasticsearch/index/analysis/ClassicTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ClassicTokenizerFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 /**
  * Factory for {@link ClassicTokenizer}
@@ -33,7 +34,7 @@ public class ClassicTokenizerFactory extends AbstractTokenizerFactory {
 
     private final int maxTokenLength;
 
-    public ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }

+ 40 - 0
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

@@ -34,9 +34,11 @@ import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.UpperCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
 import org.apache.lucene.analysis.de.GermanNormalizationFilter;
 import org.apache.lucene.analysis.de.GermanStemFilter;
@@ -58,17 +60,25 @@ import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
 import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
 import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.ClassicFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.apache.lucene.analysis.tr.ApostropheFilter;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -169,6 +179,19 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
         Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
         tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
         tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
+        tokenizers.put("thai", ThaiTokenizerFactory::new);
+        tokenizers.put("nGram", NGramTokenizerFactory::new);
+        tokenizers.put("ngram", NGramTokenizerFactory::new);
+        tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("classic", ClassicTokenizerFactory::new);
+        tokenizers.put("letter", LetterTokenizerFactory::new);
+        tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
+        tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
+        tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
+        tokenizers.put("pattern", PatternTokenizerFactory::new);
+        tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
+        tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
         return tokenizers;
     }
 
@@ -283,6 +306,16 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
     public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
         List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
         tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
+            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
         tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
             @Override
             public String name() {
@@ -294,6 +327,13 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
                 return new LowerCaseFilter(tokenStream);
             }
         }));
+
+        // Temporary shim for aliases. TODO deprecate after they are moved
+        tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new, null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
+            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
+        tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));
+
         return tokenizers;
     }
 }
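
Moving the factories does not change how users select them: index settings keep referencing the same tokenizer names, which now resolve through these registrations instead of through AnalysisModule. A minimal sketch of such settings (the "my_edge" and "my_analyzer" names are illustrative, not from this commit):

import org.elasticsearch.common.settings.Settings;

// Hypothetical index settings; "edge_ngram" now resolves to the
// factory registered above in analysis-common rather than in server.
Settings indexSettings = Settings.builder()
    .put("index.analysis.tokenizer.my_edge.type", "edge_ngram")
    .put("index.analysis.tokenizer.my_edge.min_gram", "1")
    .put("index.analysis.tokenizer.my_edge.max_gram", "3")
    .put("index.analysis.analyzer.my_analyzer.tokenizer", "my_edge")
    .build();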

+ 4 - 6
server/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenizerFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
@@ -25,19 +25,17 @@ import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
-import static org.elasticsearch.index.analysis.NGramTokenizerFactory.parseTokenChars;
+import static org.elasticsearch.analysis.common.NGramTokenizerFactory.parseTokenChars;
 
 public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
 
     private final int minGram;
-
     private final int maxGram;
-
     private final CharMatcher matcher;
 
-
-    public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
         this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

+ 3 - 2
server/src/main/java/org/elasticsearch/index/analysis/LetterTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LetterTokenizerFactory.java

@@ -17,17 +17,18 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class LetterTokenizerFactory extends AbstractTokenizerFactory {
 
-    public LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

+ 4 - 2
server/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LowerCaseTokenizerFactory.java

@@ -17,17 +17,19 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 
 public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {
 
-    public LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

+ 3 - 2
server/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenizerFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
@@ -25,6 +25,7 @@ import org.elasticsearch.Version;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
@@ -83,7 +84,7 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
         return builder.build();
     }
 
-    public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
         this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);

+ 3 - 2
server/src/main/java/org/elasticsearch/index/analysis/PathHierarchyTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PathHierarchyTokenizerFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
 
@@ -35,7 +36,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
     private final int skip;
     private final boolean reverse;
 
-    public PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         bufferSize = settings.getAsInt("buffer_size", 1024);
         String delimiter = settings.get("delimiter");

+ 3 - 2
server/src/main/java/org/elasticsearch/index/analysis/PatternTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PatternTokenizerFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.pattern.PatternTokenizer;
@@ -25,6 +25,7 @@ import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 import java.util.regex.Pattern;
 
@@ -33,7 +34,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
     private final Pattern pattern;
     private final int group;
 
-    public PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
 
         String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);

+ 3 - 2
server/src/main/java/org/elasticsearch/index/analysis/ThaiTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ThaiTokenizerFactory.java

@@ -17,20 +17,21 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 /**
  * Factory for {@link ThaiTokenizer}
  */
 public class ThaiTokenizerFactory extends AbstractTokenizerFactory {
 
-    public ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
 

+ 4 - 3
server/src/main/java/org/elasticsearch/index/analysis/UAX29URLEmailTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/UAX29URLEmailTokenizerFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -25,12 +25,13 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
 
     private final int maxTokenLength;
 
-    public UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }
@@ -41,4 +42,4 @@ public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
         tokenizer.setMaxTokenLength(maxTokenLength);
         return tokenizer;
     }
-}
+}

+ 3 - 2
server/src/main/java/org/elasticsearch/index/analysis/WhitespaceTokenizerFactory.java → modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WhitespaceTokenizerFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -26,13 +26,14 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class WhitespaceTokenizerFactory extends AbstractTokenizerFactory {
 
     static final String MAX_TOKEN_LENGTH = "max_token_length";
     private Integer maxTokenLength;
 
-    public WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
     }

+ 1 - 1
server/src/test/java/org/elasticsearch/index/analysis/CharMatcherTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharMatcherTests.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.elasticsearch.test.ESTestCase;
 

+ 30 - 4
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.en.PorterStemFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
 import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
+import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
@@ -45,6 +46,16 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
         tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class);
         tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class);
+        tokenizers.put("thai", ThaiTokenizerFactory.class);
+        tokenizers.put("ngram", NGramTokenizerFactory.class);
+        tokenizers.put("edgengram", EdgeNGramTokenizerFactory.class);
+        tokenizers.put("classic", ClassicTokenizerFactory.class);
+        tokenizers.put("letter", LetterTokenizerFactory.class);
+        tokenizers.put("lowercase", LowerCaseTokenizerFactory.class);
+        tokenizers.put("pathhierarchy", PathHierarchyTokenizerFactory.class);
+        tokenizers.put("pattern", PatternTokenizerFactory.class);
+        tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
+        tokenizers.put("whitespace", WhitespaceTokenizerFactory.class);
         return tokenizers;
     }
 
@@ -211,10 +222,25 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
 
     @Override
     protected Map<String, Class<?>> getPreConfiguredTokenizers() {
-        Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenizers());
-        filters.put("keyword", null);
-        filters.put("lowercase", null);
-        return filters;
+        Map<String, Class<?>> tokenizers = new TreeMap<>(super.getPreConfiguredTokenizers());
+        tokenizers.put("keyword", null);
+        tokenizers.put("lowercase", null);
+        tokenizers.put("classic", null);
+        tokenizers.put("uax_url_email", org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class);
+        tokenizers.put("path_hierarchy", null);
+        tokenizers.put("letter", null);
+        tokenizers.put("whitespace", null);
+        tokenizers.put("ngram", null);
+        tokenizers.put("edge_ngram", null);
+        tokenizers.put("pattern", null);
+        tokenizers.put("thai", null);
+
+        // TODO drop aliases once they are moved to module
+        tokenizers.put("nGram", tokenizers.get("ngram"));
+        tokenizers.put("edgeNGram", tokenizers.get("edge_ngram"));
+        tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy"));
+
+        return tokenizers;
     }
 
     /**

+ 1 - 1
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactoryTests.java

@@ -45,7 +45,7 @@ public class CommonGramsTokenFilterFactoryTests extends ESTokenStreamTestCase {
                                 .build();
 
         try {
-            AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+            AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
             Assert.fail("[common_words] or [common_words_path] is set");
         } catch (IllegalArgumentException e) {
         } catch (IOException e) {

+ 32 - 17
server/src/test/java/org/elasticsearch/index/query/DisableGraphQueryTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/DisableGraphQueryTests.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.query;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Query;
@@ -29,12 +29,22 @@ import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.IndexService;
+import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
+import org.elasticsearch.index.query.MatchQueryBuilder;
+import org.elasticsearch.index.query.MultiMatchQueryBuilder;
+import org.elasticsearch.index.query.QueryShardContext;
+import org.elasticsearch.index.query.QueryStringQueryBuilder;
+import org.elasticsearch.index.query.SimpleQueryStringBuilder;
+import org.elasticsearch.index.query.SimpleQueryStringFlag;
 import org.elasticsearch.index.search.MatchQuery;
+import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESSingleNodeTestCase;
 import org.junit.After;
 import org.junit.Before;
 
 import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
 
 import static org.hamcrest.Matchers.equalTo;
 
@@ -49,6 +59,11 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
     private static Query expectedQueryWithUnigram;
     private static Query expectedPhraseQueryWithUnigram;
 
+    @Override
+    protected Collection<Class<? extends Plugin>> getPlugins() {
+        return Collections.singleton(CommonAnalysisPlugin.class);
+    }
+
     @Before
     public void setup() {
         Settings settings = Settings.builder()
@@ -150,42 +165,42 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
     public void testMatchPhraseQuery() throws IOException {
         MatchPhraseQueryBuilder builder =
             new MatchPhraseQueryBuilder("text_shingle_unigram", "foo bar baz");
-        Query query = builder.doToQuery(shardContext);
+        Query query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
 
         builder =
             new MatchPhraseQueryBuilder("text_shingle", "foo bar baz biz");
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQuery, equalTo(query));
     }
 
     public void testMatchQuery() throws IOException {
         MatchQueryBuilder builder =
             new MatchQueryBuilder("text_shingle_unigram", "foo bar baz");
-        Query query = builder.doToQuery(shardContext);
+        Query query = builder.toQuery(shardContext);
         assertThat(expectedQueryWithUnigram, equalTo(query));
 
         builder = new MatchQueryBuilder("text_shingle", "foo bar baz biz");
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedQuery, equalTo(query));
     }
 
     public void testMultiMatchQuery() throws IOException {
         MultiMatchQueryBuilder builder = new MultiMatchQueryBuilder("foo bar baz",
             "text_shingle_unigram");
-        Query query = builder.doToQuery(shardContext);
+        Query query = builder.toQuery(shardContext);
         assertThat(expectedQueryWithUnigram, equalTo(query));
 
         builder.type(MatchQuery.Type.PHRASE);
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
 
         builder = new MultiMatchQueryBuilder("foo bar baz biz", "text_shingle");
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedQuery, equalTo(query));
 
         builder.type(MatchQuery.Type.PHRASE);
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQuery, equalTo(query));
     }
 
@@ -193,47 +208,47 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
         SimpleQueryStringBuilder builder = new SimpleQueryStringBuilder("foo bar baz");
         builder.field("text_shingle_unigram");
         builder.flags(SimpleQueryStringFlag.NONE);
-        Query query = builder.doToQuery(shardContext);
+        Query query = builder.toQuery(shardContext);
         assertThat(expectedQueryWithUnigram, equalTo(query));
 
         builder = new SimpleQueryStringBuilder("\"foo bar baz\"");
         builder.field("text_shingle_unigram");
         builder.flags(SimpleQueryStringFlag.PHRASE);
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
 
         builder = new SimpleQueryStringBuilder("foo bar baz biz");
         builder.field("text_shingle");
         builder.flags(SimpleQueryStringFlag.NONE);
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedQuery, equalTo(query));
 
         builder = new SimpleQueryStringBuilder("\"foo bar baz biz\"");
         builder.field("text_shingle");
         builder.flags(SimpleQueryStringFlag.PHRASE);
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQuery, equalTo(query));
     }
 
     public void testQueryString() throws IOException {
         QueryStringQueryBuilder builder = new QueryStringQueryBuilder("foo bar baz");
         builder.field("text_shingle_unigram");
-        Query query = builder.doToQuery(shardContext);
+        Query query = builder.toQuery(shardContext);
         assertThat(expectedQueryWithUnigram, equalTo(query));
 
         builder = new QueryStringQueryBuilder("\"foo bar baz\"");
         builder.field("text_shingle_unigram");
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
 
         builder = new QueryStringQueryBuilder("foo bar baz biz");
         builder.field("text_shingle");
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedQuery, equalTo(query));
 
         builder = new QueryStringQueryBuilder("\"foo bar baz biz\"");
         builder.field("text_shingle");
-        query = builder.doToQuery(shardContext);
+        query = builder.toQuery(shardContext);
         assertThat(expectedPhraseQuery, equalTo(query));
     }
 }
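
The doToQuery-to-toQuery switch above is a visibility consequence of the move: now that the test lives outside the org.elasticsearch.index.query package, it can no longer reach the protected doToQuery and must go through the public entry point. A minimal sketch of the pattern, assuming a QueryShardContext named shardContext as in the test:

// toQuery is the public, final entry point on AbstractQueryBuilder;
// doToQuery is protected and not visible from other packages.
MatchQueryBuilder builder = new MatchQueryBuilder("text_shingle", "foo bar baz biz");
Query query = builder.toQuery(shardContext);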

+ 0 - 2
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java

@@ -30,8 +30,6 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.settings.Settings.Builder;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.NGramTokenizerFactory;
 import org.elasticsearch.test.ESTokenStreamTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 

+ 1 - 1
server/src/test/java/org/elasticsearch/index/analysis/PathHierarchyTokenizerFactoryTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PathHierarchyTokenizerFactoryTests.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 

+ 5 - 8
server/src/test/java/org/elasticsearch/index/analysis/synonyms/SynonymsAnalysisTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java

@@ -17,15 +17,13 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis.synonyms;
+package org.elasticsearch.analysis.common;
 
-import org.apache.logging.log4j.Logger;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
-import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
@@ -44,7 +42,6 @@ import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.startsWith;
 
 public class SynonymsAnalysisTests extends ESTestCase {
-    protected final Logger logger = Loggers.getLogger(getClass());
     private IndexAnalyzers indexAnalyzers;
 
     public void testSynonymsAnalysis() throws IOException {
@@ -56,14 +53,14 @@ public class SynonymsAnalysisTests extends ESTestCase {
         Files.copy(synonyms, config.resolve("synonyms.txt"));
         Files.copy(synonymsWordnet, config.resolve("synonyms_wordnet.txt"));
 
-        String json = "/org/elasticsearch/index/analysis/synonyms/synonyms.json";
+        String json = "/org/elasticsearch/analysis/common/synonyms.json";
         Settings settings = Settings.builder().
             loadFromStream(json, getClass().getResourceAsStream(json), false)
                 .put(Environment.PATH_HOME_SETTING.getKey(), home)
                 .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
 
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
-        indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
+        indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
 
         match("synonymAnalyzer", "kimchy is the dude abides", "shay is the elasticsearch man!");
         match("synonymAnalyzer_file", "kimchy is the dude abides", "shay is the elasticsearch man!");
@@ -91,7 +88,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
             .build();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
         try {
-            indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
+            indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));
@@ -112,7 +109,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
             .build();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
         try {
-            indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
+            indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
             fail("fail! due to synonym word deleted by analyzer");
         } catch (Exception e) {
             assertThat(e, instanceOf(IllegalArgumentException.class));

+ 1 - 1
server/src/test/java/org/elasticsearch/index/analysis/WhitespaceTokenizerFactoryTests.java → modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WhitespaceTokenizerFactoryTests.java

@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import com.carrotsearch.randomizedtesting.generators.RandomStrings;
 

+ 0 - 0
server/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms.json → modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/synonyms.json


+ 0 - 0
server/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms.txt → modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/synonyms.txt


+ 0 - 0
server/src/test/resources/org/elasticsearch/index/analysis/synonyms/synonyms_wordnet.txt → modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/synonyms_wordnet.txt


+ 371 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml

@@ -70,3 +70,374 @@
     - match:  { detail.tokenizer.name: _anonymous_tokenizer }
     - match:  { detail.tokenizer.tokens.0.token: foo }
     - match:  { detail.tokenizer.tokens.1.token: bar }
+
+---
+"thai_tokenizer":
+    - do:
+        indices.analyze:
+          body:
+            text: "ภาษาไทย"
+            explain: true
+            tokenizer:
+              type: thai
+    - length: { detail.tokenizer.tokens: 2 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: ภาษา }
+    - match:  { detail.tokenizer.tokens.1.token: ไทย }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "ภาษาไทย"
+            explain: true
+            tokenizer: thai
+    - length: { detail.tokenizer.tokens: 2 }
+    - match:  { detail.tokenizer.name: thai }
+    - match:  { detail.tokenizer.tokens.0.token: ภาษา }
+    - match:  { detail.tokenizer.tokens.1.token: ไทย }
+
+---
+"ngram":
+    - do:
+        indices.analyze:
+          body:
+            text: "foobar"
+            explain: true
+            tokenizer:
+              type: ngram
+              min_gram: 3
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 4 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: foo }
+    - match:  { detail.tokenizer.tokens.1.token: oob }
+    - match:  { detail.tokenizer.tokens.2.token: oba }
+    - match:  { detail.tokenizer.tokens.3.token: bar }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foobar"
+            explain: true
+            tokenizer:
+              type: nGram
+              min_gram: 3
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 4 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: foo }
+    - match:  { detail.tokenizer.tokens.1.token: oob }
+    - match:  { detail.tokenizer.tokens.2.token: oba }
+    - match:  { detail.tokenizer.tokens.3.token: bar }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: ngram
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: ngram }
+    - match:  { detail.tokenizer.tokens.0.token: f }
+    - match:  { detail.tokenizer.tokens.1.token: fo }
+    - match:  { detail.tokenizer.tokens.2.token: o }
+    - match:  { detail.tokenizer.tokens.3.token: oo }
+    - match:  { detail.tokenizer.tokens.4.token: o }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: nGram
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: nGram }
+    - match:  { detail.tokenizer.tokens.0.token: f }
+    - match:  { detail.tokenizer.tokens.1.token: fo }
+    - match:  { detail.tokenizer.tokens.2.token: o }
+    - match:  { detail.tokenizer.tokens.3.token: oo }
+    - match:  { detail.tokenizer.tokens.4.token: o }
+
+---
+"edge_ngram":
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer:
+              type: edge_ngram
+              min_gram: 1
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: f }
+    - match:  { detail.tokenizer.tokens.1.token: fo }
+    - match:  { detail.tokenizer.tokens.2.token: foo }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer:
+              type: edgeNGram
+              min_gram: 1
+              max_gram: 3
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: f }
+    - match:  { detail.tokenizer.tokens.1.token: fo }
+    - match:  { detail.tokenizer.tokens.2.token: foo }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: edge_ngram
+    - length: { detail.tokenizer.tokens: 2 }
+    - match:  { detail.tokenizer.name: edge_ngram }
+    - match:  { detail.tokenizer.tokens.0.token: f }
+    - match:  { detail.tokenizer.tokens.1.token: fo }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "foo"
+            explain: true
+            tokenizer: edgeNGram
+    - length: { detail.tokenizer.tokens: 2 }
+    - match:  { detail.tokenizer.name: edgeNGram }
+    - match:  { detail.tokenizer.tokens.0.token: f }
+    - match:  { detail.tokenizer.tokens.1.token: fo }
+
+---
+"classic":
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer:
+              type: classic
+    - length: { detail.tokenizer.tokens: 4 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: Brown }
+    - match:  { detail.tokenizer.tokens.1.token: Foxes }
+    - match:  { detail.tokenizer.tokens.2.token: don't }
+    - match:  { detail.tokenizer.tokens.3.token: jump }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer: classic
+    - length: { detail.tokenizer.tokens: 4 }
+    - match:  { detail.tokenizer.name: classic }
+    - match:  { detail.tokenizer.tokens.0.token: Brown }
+    - match:  { detail.tokenizer.tokens.1.token: Foxes }
+    - match:  { detail.tokenizer.tokens.2.token: don't }
+    - match:  { detail.tokenizer.tokens.3.token: jump }
+
+---
+"letter":
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer:
+              type: letter
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: Brown }
+    - match:  { detail.tokenizer.tokens.1.token: Foxes }
+    - match:  { detail.tokenizer.tokens.2.token: don }
+    - match:  { detail.tokenizer.tokens.3.token: t }
+    - match:  { detail.tokenizer.tokens.4.token: jump }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer: letter
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: letter }
+    - match:  { detail.tokenizer.tokens.0.token: Brown }
+    - match:  { detail.tokenizer.tokens.1.token: Foxes }
+    - match:  { detail.tokenizer.tokens.2.token: don }
+    - match:  { detail.tokenizer.tokens.3.token: t }
+    - match:  { detail.tokenizer.tokens.4.token: jump }
+
+---
+"lowercase":
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer:
+              type: lowercase
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: brown }
+    - match:  { detail.tokenizer.tokens.1.token: foxes }
+    - match:  { detail.tokenizer.tokens.2.token: don }
+    - match:  { detail.tokenizer.tokens.3.token: t }
+    - match:  { detail.tokenizer.tokens.4.token: jump }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Brown-Foxes don't jump."
+            explain: true
+            tokenizer: lowercase
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: lowercase }
+    - match:  { detail.tokenizer.tokens.0.token: brown }
+    - match:  { detail.tokenizer.tokens.1.token: foxes }
+    - match:  { detail.tokenizer.tokens.2.token: don }
+    - match:  { detail.tokenizer.tokens.3.token: t }
+    - match:  { detail.tokenizer.tokens.4.token: jump }
+
+---
+"path_hierarchy":
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer:
+              type: path_hierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: a }
+    - match:  { detail.tokenizer.tokens.1.token: a/b }
+    - match:  { detail.tokenizer.tokens.2.token: a/b/c }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer:
+              type: PathHierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: a }
+    - match:  { detail.tokenizer.tokens.1.token: a/b }
+    - match:  { detail.tokenizer.tokens.2.token: a/b/c }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer: path_hierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: path_hierarchy }
+    - match:  { detail.tokenizer.tokens.0.token: a }
+    - match:  { detail.tokenizer.tokens.1.token: a/b }
+    - match:  { detail.tokenizer.tokens.2.token: a/b/c }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "a/b/c"
+            explain: true
+            tokenizer: PathHierarchy
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: PathHierarchy }
+    - match:  { detail.tokenizer.tokens.0.token: a }
+    - match:  { detail.tokenizer.tokens.1.token: a/b }
+    - match:  { detail.tokenizer.tokens.2.token: a/b/c }
+
+---
+"pattern":
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace by default"
+            explain: true
+            tokenizer:
+              type: pattern
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: split }
+    - match:  { detail.tokenizer.tokens.1.token: by }
+    - match:  { detail.tokenizer.tokens.2.token: whitespace }
+    - match:  { detail.tokenizer.tokens.3.token: by }
+    - match:  { detail.tokenizer.tokens.4.token: default }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace by default"
+            explain: true
+            tokenizer: pattern
+    - length: { detail.tokenizer.tokens: 5 }
+    - match:  { detail.tokenizer.name: pattern }
+    - match:  { detail.tokenizer.tokens.0.token: split }
+    - match:  { detail.tokenizer.tokens.1.token: by }
+    - match:  { detail.tokenizer.tokens.2.token: whitespace }
+    - match:  { detail.tokenizer.tokens.3.token: by }
+    - match:  { detail.tokenizer.tokens.4.token: default }
+
+---
+"uax_url_email":
+    - do:
+        indices.analyze:
+          body:
+            text: "Email me at john.smith@global-international.com"
+            explain: true
+            tokenizer:
+              type: uax_url_email
+    - length: { detail.tokenizer.tokens: 4 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: Email }
+    - match:  { detail.tokenizer.tokens.1.token: me }
+    - match:  { detail.tokenizer.tokens.2.token: at }
+    - match:  { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "Email me at john.smith@global-international.com"
+            explain: true
+            tokenizer: uax_url_email
+    - length: { detail.tokenizer.tokens: 4 }
+    - match:  { detail.tokenizer.name: uax_url_email }
+    - match:  { detail.tokenizer.tokens.0.token: Email }
+    - match:  { detail.tokenizer.tokens.1.token: me }
+    - match:  { detail.tokenizer.tokens.2.token: at }
+    - match:  { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
+
+---
+"whitespace":
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace"
+            explain: true
+            tokenizer:
+              type: whitespace
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
+    - match:  { detail.tokenizer.tokens.0.token: split }
+    - match:  { detail.tokenizer.tokens.1.token: by }
+    - match:  { detail.tokenizer.tokens.2.token: whitespace }
+
+    - do:
+        indices.analyze:
+          body:
+            text: "split by whitespace"
+            explain: true
+            tokenizer: whitespace
+    - length: { detail.tokenizer.tokens: 3 }
+    - match:  { detail.tokenizer.name: whitespace }
+    - match:  { detail.tokenizer.tokens.0.token: split }
+    - match:  { detail.tokenizer.tokens.1.token: by }
+    - match:  { detail.tokenizer.tokens.2.token: whitespace }

+ 30 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/indices.analyze/10_analyze.yml

@@ -67,3 +67,33 @@
             text: "<html>foo</html>"
     - length: { tokens: 1 }
     - match:  { tokens.0.token: "\nfoo\n" }
+
+---
+"Synonym filter with tokenizer":
+    - do:
+        indices.create:
+          index: test_synonym
+          body:
+            settings:
+              index:
+                analysis:
+                  tokenizer:
+                    trigram:
+                      type: nGram
+                      min_gram: 3
+                      max_gram: 3
+                  filter:
+                    synonym:
+                      type: synonym
+                      synonyms: ["kimchy => shay"]
+
+    - do:
+        indices.analyze:
+          index: test_synonym
+          body:
+            tokenizer: trigram
+            filter: [synonym]
+            text: kimchy
+    - length: { tokens: 2 }
+    - match:  { tokens.0.token: sha }
+    - match:  { tokens.1.token: hay }

+ 94 - 0
modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/20_ngram_search.yml

@@ -39,3 +39,97 @@
               text:
                 query: foa
   - match: {hits.total: 1}
+
+---
+"testNGramCopyField":
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            number_of_shards: 1
+            number_of_replicas: 0
+            max_ngram_diff: 9
+            analysis:
+              analyzer:
+                my_ngram_analyzer:
+                  tokenizer: my_ngram_tokenizer
+              tokenizer:
+                my_ngram_tokenizer:
+                  type: ngram
+                  min_gram: 1
+                  max_gram: 10
+                  token_chars: []
+          mappings:
+            doc:
+              properties:
+                origin:
+                  type: text
+                  copy_to: meta
+                meta:
+                  type: text
+                  analyzer: my_ngram_analyzer
+
+  - do:
+      index:
+        index: test
+        type:  doc
+        id:    1
+        body:  { "origin": "C.A1234.5678" }
+        refresh: true
+
+  - do:
+      search:
+        body:
+          query:
+            match:
+              meta:
+                query: 1234
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        body:
+          query:
+            match:
+              meta:
+                query: 1234.56
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        body:
+          query:
+            match:
+              meta:
+                query: A1234
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        body:
+          query:
+            term:
+              meta:
+                value: a1234
+  - match: {hits.total: 0}
+
+  - do:
+      search:
+        body:
+          query:
+            match:
+              meta:
+                query: A1234
+                analyzer: my_ngram_analyzer
+  - match: {hits.total: 1}
+
+  - do:
+      search:
+        body:
+          query:
+            match:
+              meta:
+                query: a1234
+                analyzer: my_ngram_analyzer
+  - match: {hits.total: 1}

+ 0 - 30
rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yml

@@ -76,36 +76,6 @@
     - match:  { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
     - match:  { detail.tokenfilters.0.tokens.0.token: bar }
 
----
-"Synonym filter with tokenizer":
-    - do:
-        indices.create:
-          index: test_synonym
-          body:
-            settings:
-              index:
-                analysis:
-                  tokenizer:
-                    trigram:
-                      type: nGram
-                      min_gram: 3
-                      max_gram: 3
-                  filter:
-                    synonym:
-                      type: synonym
-                      synonyms: ["kimchy => shay"]
-
-    - do:
-        indices.analyze:
-          index: test_synonym
-          body:
-            tokenizer: trigram
-            filter: [synonym]
-            text: kimchy
-    - length: { tokens: 2 }
-    - match:  { tokens.0.token: sha }
-    - match:  { tokens.1.token: hay }
-
 ---
 "Custom normalizer in request":
     - do:

+ 1 - 0
server/src/main/java/org/elasticsearch/index/analysis/KeywordTokenizerFactory.java

@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 
 public class KeywordTokenizerFactory extends AbstractTokenizerFactory {
 

+ 1 - 28
server/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java

@@ -39,11 +39,9 @@ import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
 import org.elasticsearch.index.analysis.CjkAnalyzerProvider;
-import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
 import org.elasticsearch.index.analysis.CzechAnalyzerProvider;
 import org.elasticsearch.index.analysis.DanishAnalyzerProvider;
 import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
-import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
 import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
 import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
 import org.elasticsearch.index.analysis.FinnishAnalyzerProvider;
@@ -60,14 +58,9 @@ import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
 import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
 import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
 import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
-import org.elasticsearch.index.analysis.LetterTokenizerFactory;
 import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
-import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
-import org.elasticsearch.index.analysis.NGramTokenizerFactory;
 import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
-import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
 import org.elasticsearch.index.analysis.PatternAnalyzerProvider;
-import org.elasticsearch.index.analysis.PatternTokenizerFactory;
 import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
 import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
@@ -88,13 +81,10 @@ import org.elasticsearch.index.analysis.StopAnalyzerProvider;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
 import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
-import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
-import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
-import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
 
 import java.io.IOException;
@@ -223,36 +213,19 @@ public final class AnalysisModule {
             }
             preConfiguredTokenizers.register(name, preConfigured);
         }
-        // Temporary shim for aliases. TODO deprecate after they are moved
-        preConfiguredTokenizers.register("nGram", preConfiguredTokenizers.getRegistry().get("ngram"));
-        preConfiguredTokenizers.register("edgeNGram", preConfiguredTokenizers.getRegistry().get("edge_ngram"));
-        preConfiguredTokenizers.register("PathHierarchy", preConfiguredTokenizers.getRegistry().get("path_hierarchy"));
-
         for (AnalysisPlugin plugin: plugins) {
             for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) {
                 preConfiguredTokenizers.register(tokenizer.getName(), tokenizer);
             }
         }
+
         return unmodifiableMap(preConfiguredTokenizers.getRegistry());
     }
 
     private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
         NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
         tokenizers.register("standard", StandardTokenizerFactory::new);
-        tokenizers.register("uax_url_email", UAX29URLEmailTokenizerFactory::new);
-        tokenizers.register("path_hierarchy", PathHierarchyTokenizerFactory::new);
-        tokenizers.register("PathHierarchy", PathHierarchyTokenizerFactory::new);
         tokenizers.register("keyword", KeywordTokenizerFactory::new);
-        tokenizers.register("letter", LetterTokenizerFactory::new);
-        tokenizers.register("lowercase", LowerCaseTokenizerFactory::new);
-        tokenizers.register("whitespace", WhitespaceTokenizerFactory::new);
-        tokenizers.register("nGram", NGramTokenizerFactory::new);
-        tokenizers.register("ngram", NGramTokenizerFactory::new);
-        tokenizers.register("edgeNGram", EdgeNGramTokenizerFactory::new);
-        tokenizers.register("edge_ngram", EdgeNGramTokenizerFactory::new);
-        tokenizers.register("pattern", PatternTokenizerFactory::new);
-        tokenizers.register("classic", ClassicTokenizerFactory::new);
-        tokenizers.register("thai", ThaiTokenizerFactory::new);
         tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers);
         return tokenizers;
     }
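The registrations deleted above move behind the AnalysisPlugin::getTokenizers extension point that extractAndRegister already consults. A minimal sketch, assuming the factory constructors are unchanged and now live in the analysis-common package; the actual CommonAnalysisPlugin in this change may differ in detail:

    import java.util.Map;
    import java.util.TreeMap;

    import org.elasticsearch.index.analysis.TokenizerFactory;
    import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
            // The moved factories sit in this same package, so no further imports are needed.
            Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
            tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
            tokenizers.put("ngram", NGramTokenizerFactory::new);
            tokenizers.put("nGram", NGramTokenizerFactory::new);                 // legacy alias
            tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
            tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);         // legacy alias
            tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
            tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new); // legacy alias
            // classic, letter, lowercase, pattern, thai and uax_url_email follow the same pattern.
            return tokenizers;
        }
    }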

+ 0 - 73
server/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java

@@ -19,18 +19,8 @@
 package org.elasticsearch.indices.analysis;
 
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.LetterTokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
-import org.apache.lucene.analysis.ngram.NGramTokenizer;
-import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
-import org.apache.lucene.analysis.pattern.PatternTokenizer;
-import org.apache.lucene.analysis.standard.ClassicTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
-import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.Version;
-import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 
@@ -41,69 +31,6 @@ public enum PreBuiltTokenizers {
         protected Tokenizer create(Version version) {
             return new StandardTokenizer();
         }
-    },
-
-    CLASSIC(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new ClassicTokenizer();
-        }
-    },
-
-    UAX_URL_EMAIL(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new UAX29URLEmailTokenizer();
-        }
-    },
-
-    PATH_HIERARCHY(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new PathHierarchyTokenizer();
-        }
-    },
-
-    LETTER(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new LetterTokenizer();
-        }
-    },
-
-    WHITESPACE(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new WhitespaceTokenizer();
-        }
-    },
-
-    NGRAM(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new NGramTokenizer();
-        }
-    },
-
-    EDGE_NGRAM(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
-        }
-    },
-
-    PATTERN(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new PatternTokenizer(Regex.compile("\\W+", null), -1);
-        }
-    },
-
-    THAI(CachingStrategy.ONE) {
-        @Override
-        protected Tokenizer create(Version version) {
-            return new ThaiTokenizer();
-        }
     }
 
     ;
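The deleted constants are superseded by pre-configured tokenizers contributed through AnalysisPlugin#getPreConfiguredTokenizers, the hook the AnalysisModule hunk above keeps. A hedged sketch of the replacement registration, mirroring the PreConfiguredTokenizer.singleton call that GetTermVectorsTests uses below; names and coverage are illustrative:

    import java.util.Arrays;
    import java.util.List;

    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.ngram.NGramTokenizer;
    import org.elasticsearch.index.analysis.PreConfiguredTokenizer;

    // Inside an AnalysisPlugin implementation:
    @Override
    public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
        return Arrays.asList(
            PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null),
            PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    }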

+ 3 - 3
server/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java

@@ -287,7 +287,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
         e = expectThrows(IllegalArgumentException.class,
             () -> TransportAnalyzeAction.analyze(
                 new AnalyzeRequest()
-                    .tokenizer("whitespace")
+                    .tokenizer("standard")
                     .addTokenFilter("foobar")
                     .text("the qu1ck brown fox"),
                 "text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
@@ -300,7 +300,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
         e = expectThrows(IllegalArgumentException.class,
             () -> TransportAnalyzeAction.analyze(
                 new AnalyzeRequest()
-                    .tokenizer("whitespace")
+                    .tokenizer("standard")
                     .addTokenFilter("lowercase")
                     .addCharFilter("foobar")
                     .text("the qu1ck brown fox"),
@@ -322,7 +322,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 
     public void testNonPreBuildTokenFilter() throws IOException {
         AnalyzeRequest request = new AnalyzeRequest();
-        request.tokenizer("whitespace");
+        request.tokenizer("standard");
         request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
         request.text("the quick brown fox");
         AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);

+ 3 - 3
server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsIT.java

@@ -188,7 +188,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
                 .addAlias(new Alias("alias"))
                 .setSettings(Settings.builder()
                         .put(indexSettings())
-                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
+                        .put("index.analysis.analyzer.tv_test.tokenizer", "standard")
                         .putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
         for (int i = 0; i < 10; i++) {
             client().prepareIndex("test", "type1", Integer.toString(i))
@@ -260,7 +260,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
                 .endObject().endObject();
         assertAcked(prepareCreate("test").addMapping("type1", mapping)
                 .setSettings(Settings.builder()
-                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
+                        .put("index.analysis.analyzer.tv_test.tokenizer", "standard")
                         .putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
         for (int i = 0; i < 10; i++) {
             client().prepareIndex("test", "type1", Integer.toString(i))
@@ -394,7 +394,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
                 .addMapping("type1", mapping)
                 .setSettings(Settings.builder()
                         .put(indexSettings())
-                        .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
+                        .put("index.analysis.analyzer.tv_test.tokenizer", "standard")
                         .putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
 
         ensureGreen();

+ 9 - 1
server/src/test/java/org/elasticsearch/action/termvectors/GetTermVectorsTests.java

@@ -18,6 +18,7 @@
  */
 package org.elasticsearch.action.termvectors;
 
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.payloads.FloatEncoder;
@@ -35,6 +36,7 @@ import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -93,6 +95,12 @@ public class GetTermVectorsTests extends ESSingleNodeTestCase {
             });
         }
 
+        @Override
+        public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
+            return Collections.singletonList(PreConfiguredTokenizer.singleton("mock-whitespace",
+                () -> new MockTokenizer(MockTokenizer.WHITESPACE, false), null));
+        }
+
         // Based on DelimitedPayloadTokenFilter:
         final class MockPayloadTokenFilter extends TokenFilter {
             private final char delimiter;
@@ -151,7 +159,7 @@ public class GetTermVectorsTests extends ESSingleNodeTestCase {
                 .startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads")
                 .field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
         Settings setting =  Settings.builder()
-            .put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
+            .put("index.analysis.analyzer.payload_test.tokenizer", "mock-whitespace")
             .putList("index.analysis.analyzer.payload_test.filter", "my_delimited_payload")
             .put("index.analysis.filter.my_delimited_payload.delimiter", delimiter)
             .put("index.analysis.filter.my_delimited_payload.encoding", encodingString)

+ 2 - 39
server/src/test/java/org/elasticsearch/indices/analyze/AnalyzeActionIT.java

@@ -35,10 +35,8 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
-import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.startsWith;
 
-
 public class AnalyzeActionIT extends ESIntegTestCase {
     public void testSimpleAnalyzerTests() throws Exception {
         assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
@@ -333,14 +331,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
         AnalyzeResponse analyzeResponse = client().admin().indices()
             .prepareAnalyze()
             .setText("Foo buzz test")
-            .setTokenizer("whitespace")
+            .setTokenizer("standard")
             .addTokenFilter("lowercase")
             .addTokenFilter(stopFilterSettings)
             .setExplain(true)
             .get();
 
         //tokenizer
-        assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("whitespace"));
+        assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("standard"));
         assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(3));
         assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("Foo"));
         assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
@@ -393,41 +391,6 @@ public class AnalyzeActionIT extends ESIntegTestCase {
         assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getPositionLength(), equalTo(1));
     }
 
-    public void testCustomTokenizerInRequest() throws Exception {
-        Map<String, Object> tokenizerSettings = new HashMap<>();
-        tokenizerSettings.put("type", "nGram");
-        tokenizerSettings.put("min_gram", 2);
-        tokenizerSettings.put("max_gram", 2);
-
-        AnalyzeResponse analyzeResponse = client().admin().indices()
-            .prepareAnalyze()
-            .setText("good")
-            .setTokenizer(tokenizerSettings)
-            .setExplain(true)
-            .get();
-
-        //tokenizer
-        assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("_anonymous_tokenizer"));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(3));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("go"));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getEndOffset(), equalTo(2));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPosition(), equalTo(0));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPositionLength(), equalTo(1));
-
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getTerm(), equalTo("oo"));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getStartOffset(), equalTo(1));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getEndOffset(), equalTo(3));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPosition(), equalTo(1));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPositionLength(), equalTo(1));
-
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getTerm(), equalTo("od"));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getStartOffset(), equalTo(2));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getEndOffset(), equalTo(4));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPosition(), equalTo(2));
-        assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPositionLength(), equalTo(1));
-    }
-
     public void testAnalyzeKeywordField() throws IOException {
         assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("test", "keyword", "type=keyword"));
         ensureGreen("test");

+ 1 - 1
server/src/test/java/org/elasticsearch/indices/template/SimpleIndexTemplateIT.java

@@ -677,7 +677,7 @@ public class SimpleIndexTemplateIT extends ESIntegTestCase {
                     "            \"analysis\" : {\n" +
                     "                \"analyzer\" : {\n" +
                     "                    \"custom_1\" : {\n" +
-                    "                        \"tokenizer\" : \"whitespace\"\n" +
+                    "                        \"tokenizer\" : \"standard\"\n" +
                     "                    }\n" +
                     "                }\n" +
                     "            }\n" +

+ 2 - 2
server/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java

@@ -1359,7 +1359,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
     public void testPhrasePrefix() throws IOException {
         Builder builder = Settings.builder()
                 .put(indexSettings())
-                .put("index.analysis.analyzer.synonym.tokenizer", "whitespace")
+                .put("index.analysis.analyzer.synonym.tokenizer", "standard")
                 .putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
                 .put("index.analysis.filter.synonym.type", "synonym")
                 .putList("index.analysis.filter.synonym.synonyms", "quick => fast");
@@ -2804,7 +2804,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
     public void testSynonyms() throws IOException {
         Builder builder = Settings.builder()
             .put(indexSettings())
-            .put("index.analysis.analyzer.synonym.tokenizer", "whitespace")
+            .put("index.analysis.analyzer.synonym.tokenizer", "standard")
             .putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
             .put("index.analysis.filter.synonym.type", "synonym")
             .putList("index.analysis.filter.synonym.synonyms", "fast,quick");

+ 3 - 3
server/src/test/java/org/elasticsearch/search/functionscore/QueryRescorerIT.java

@@ -156,7 +156,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
 
     public void testMoreDocs() throws Exception {
         Builder builder = Settings.builder();
-        builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
+        builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
         builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
         builder.put("index.analysis.filter.synonym.type", "synonym");
         builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");
@@ -234,7 +234,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
     // Tests a rescore window smaller than number of hits:
     public void testSmallRescoreWindow() throws Exception {
         Builder builder = Settings.builder();
-        builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
+        builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
         builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
         builder.put("index.analysis.filter.synonym.type", "synonym");
         builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");
@@ -306,7 +306,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
     // Tests a rescorer that penalizes the scores:
     public void testRescorerMadeScoresWorse() throws Exception {
         Builder builder = Settings.builder();
-        builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
+        builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
         builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
         builder.put("index.analysis.filter.synonym.type", "synonym");
         builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");

+ 1 - 1
server/src/test/java/org/elasticsearch/search/query/MultiMatchQueryIT.java

@@ -82,7 +82,7 @@ public class MultiMatchQueryIT extends ESIntegTestCase {
                 .put("index.analysis.analyzer.perfect_match.tokenizer", "keyword")
                 .put("index.analysis.analyzer.perfect_match.filter", "lowercase")
                 .put("index.analysis.analyzer.category.type", "custom")
-                .put("index.analysis.analyzer.category.tokenizer", "whitespace")
+                .put("index.analysis.analyzer.category.tokenizer", "standard")
                 .put("index.analysis.analyzer.category.filter", "lowercase")
         );
         assertAcked(builder.addMapping("test", createMapping()));

+ 1 - 53
server/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java

@@ -20,7 +20,6 @@
 package org.elasticsearch.search.query;
 
 import org.apache.lucene.util.English;
-import org.elasticsearch.Version;
 import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
 import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.search.SearchPhaseExecutionException;
@@ -30,7 +29,6 @@ import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentType;
-import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.query.BoolQueryBuilder;
 import org.elasticsearch.index.query.MatchQueryBuilder;
 import org.elasticsearch.index.query.MultiMatchQueryBuilder;
@@ -351,7 +349,7 @@ public class SearchQueryIT extends ESIntegTestCase {
                         .put(SETTING_NUMBER_OF_SHARDS,1)
                         .put("index.analysis.filter.syns.type","synonym")
                         .putList("index.analysis.filter.syns.synonyms","quick,fast")
-                        .put("index.analysis.analyzer.syns.tokenizer","whitespace")
+                        .put("index.analysis.analyzer.syns.tokenizer","standard")
                         .put("index.analysis.analyzer.syns.filter","syns")
                         )
                 .addMapping("type1", "field1", "type=text,analyzer=syns", "field2", "type=text,analyzer=syns"));
@@ -1764,56 +1762,6 @@ public class SearchQueryIT extends ESIntegTestCase {
         assertHitCount(client().prepareSearch().setQuery(matchAllQuery()).get(), 1L);
     }
 
-    // see #5120
-    public void testNGramCopyField() {
-        CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
-                .put(indexSettings())
-                .put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
-                .put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
-                .put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
-                .put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")
-                .put("index.analysis.tokenizer.my_ngram_tokenizer.min_gram", "1")
-                .put("index.analysis.tokenizer.my_ngram_tokenizer.max_gram", "10")
-                .putList("index.analysis.tokenizer.my_ngram_tokenizer.token_chars", new String[0]));
-        assertAcked(builder.addMapping("test", "origin", "type=text,copy_to=meta", "meta", "type=text,analyzer=my_ngram_analyzer"));
-        // we only have ngrams as the index analyzer so searches will get standard analyzer
-
-
-        client().prepareIndex("test", "test", "1").setSource("origin", "C.A1234.5678")
-                .setRefreshPolicy(IMMEDIATE)
-                .get();
-
-        SearchResponse searchResponse = client().prepareSearch("test")
-                .setQuery(matchQuery("meta", "1234"))
-                .get();
-        assertHitCount(searchResponse, 1L);
-
-        searchResponse = client().prepareSearch("test")
-                .setQuery(matchQuery("meta", "1234.56"))
-                .get();
-        assertHitCount(searchResponse, 1L);
-
-        searchResponse = client().prepareSearch("test")
-                .setQuery(termQuery("meta", "A1234"))
-                .get();
-        assertHitCount(searchResponse, 1L);
-
-        searchResponse = client().prepareSearch("test")
-                .setQuery(termQuery("meta", "a1234"))
-                .get();
-        assertHitCount(searchResponse, 0L); // it's upper case
-
-        searchResponse = client().prepareSearch("test")
-                .setQuery(matchQuery("meta", "A1234").analyzer("my_ngram_analyzer"))
-                .get(); // force ngram analyzer
-        assertHitCount(searchResponse, 1L);
-
-        searchResponse = client().prepareSearch("test")
-                .setQuery(matchQuery("meta", "a1234").analyzer("my_ngram_analyzer"))
-                .get(); // this one returns a hit since it's default operator is OR
-        assertHitCount(searchResponse, 1L);
-    }
-
     public void testMatchPhrasePrefixQuery() throws ExecutionException, InterruptedException {
         createIndex("test1");
         indexRandom(true, client().prepareIndex("test1", "type1", "1").setSource("field", "Johnnie Walker Black Label"),

+ 1 - 1
server/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java

@@ -427,7 +427,7 @@ public class SuggestSearchIT extends ESIntegTestCase {
     public void testStopwordsOnlyPhraseSuggest() throws IOException {
         assertAcked(prepareCreate("test").addMapping("typ1", "body", "type=text,analyzer=stopwd").setSettings(
                 Settings.builder()
-                        .put("index.analysis.analyzer.stopwd.tokenizer", "whitespace")
+                        .put("index.analysis.analyzer.stopwd.tokenizer", "standard")
                         .putList("index.analysis.analyzer.stopwd.filter", "stop")
         ));
         ensureGreen();

+ 11 - 37
test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java

@@ -22,18 +22,10 @@ package org.elasticsearch.indices.analysis;
 import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
-import org.elasticsearch.Version;
 import org.elasticsearch.common.collect.MapBuilder;
-import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
-import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
 import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
 import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
-import org.elasticsearch.index.analysis.LetterTokenizerFactory;
-import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
 import org.elasticsearch.index.analysis.MultiTermAwareComponent;
-import org.elasticsearch.index.analysis.NGramTokenizerFactory;
-import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
-import org.elasticsearch.index.analysis.PatternTokenizerFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
@@ -43,9 +35,6 @@ import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
 import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
-import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
-import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
-import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 
@@ -88,20 +77,20 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
 
     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
-        .put("classic", ClassicTokenizerFactory.class)
-        .put("edgengram", EdgeNGramTokenizerFactory.class)
+        .put("classic", MovedToAnalysisCommon.class)
+        .put("edgengram", MovedToAnalysisCommon.class)
         .put("keyword", KeywordTokenizerFactory.class)
-        .put("letter", LetterTokenizerFactory.class)
-        .put("lowercase", LowerCaseTokenizerFactory.class)
-        .put("ngram", NGramTokenizerFactory.class)
-        .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
-        .put("pattern", PatternTokenizerFactory.class)
+        .put("letter", MovedToAnalysisCommon.class)
+        .put("lowercase", MovedToAnalysisCommon.class)
+        .put("ngram", MovedToAnalysisCommon.class)
+        .put("pathhierarchy", MovedToAnalysisCommon.class)
+        .put("pattern", MovedToAnalysisCommon.class)
         .put("simplepattern", MovedToAnalysisCommon.class)
         .put("simplepatternsplit", MovedToAnalysisCommon.class)
         .put("standard", StandardTokenizerFactory.class)
-        .put("thai", ThaiTokenizerFactory.class)
-        .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
-        .put("whitespace", WhitespaceTokenizerFactory.class)
+        .put("thai", MovedToAnalysisCommon.class)
+        .put("uax29urlemail", MovedToAnalysisCommon.class)
+        .put("whitespace", MovedToAnalysisCommon.class)
 
         // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
         .put("wikipedia", Void.class)
@@ -292,23 +281,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         Map<String, Class<?>> tokenizers = new HashMap<>();
         // TODO drop this temporary shim when all the old style tokenizers have been migrated to new style
         for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
-            final Class<?> luceneFactoryClazz;
-            switch (tokenizer) {
-            case UAX_URL_EMAIL:
-                luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
-                break;
-            case PATH_HIERARCHY:
-                luceneFactoryClazz = Void.class;
-                break;
-            default:
-                luceneFactoryClazz = null;
-            }
-            tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClazz);
+            tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), null);
         }
-        // TODO drop aliases once they are moved to module
-        tokenizers.put("nGram", tokenizers.get("ngram"));
-        tokenizers.put("edgeNGram", tokenizers.get("edge_ngram"));
-        tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy"));
         return tokenizers;
     }
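For context, MovedToAnalysisCommon is the pre-existing marker class this hunk leans on; a sketch reconstructed from its usage here, the actual declaration and javadoc may differ:

    /**
     * Marker for analysis components whose factories moved to the
     * analysis-common module and are no longer visible to server.
     */
    protected static final class MovedToAnalysisCommon {
        private MovedToAnalysisCommon() {}
    }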