8 years ago · 48696ab544
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -97,6 +97,18 @@ The `pattern` tokenizer uses a regular expression to either split text into
 
				 terms whenever it matches a word separator, or to capture matching text as
			
 
				 terms.
			
 
				 
			
 
				+<<analysis-simplepattern-tokenizer,Simple Pattern Tokenizer>>::
			
 
				+
			
 
				+The `simplepattern` tokenizer uses a regular expression to capture matching
			
 
				+text as terms. It uses a restricted subset of regular expression features
			
 
				+and is generally faster than the `pattern` tokenizer.
			
 
				+
			
 
				+<<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
			
 
				+
			
 
				+The `simplepatternsplit` tokenizer uses the same restricted regular expression
			
 
				+subset as the `simplepattern` tokenizer, but splits the input at matches rather
			
 
				+than returning the matches as terms.
			
 
				+
			
 
				 <<analysis-pathhierarchy-tokenizer,Path Tokenizer>>::
			
 
				 
			
 
				 The `path_hierarchy` tokenizer takes a hierarchical value like a filesystem
			
@@ -131,6 +143,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[]
 
				 
			
 
				 include::tokenizers/pattern-tokenizer.asciidoc[]
			
 
				 
			
 
				-include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
			
 
				+include::tokenizers/simplepattern-tokenizer.asciidoc[]
			
 
				 
			
 
				+include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]
			
 
				 
			
 
				+include::tokenizers/pathhierarchy-tokenizer.asciidoc[]
			
--- a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
@@ -0,0 +1,105 @@
 
				+[[analysis-simplepattern-tokenizer]]
			
 
				+=== Simple Pattern Tokenizer
			
 
				+
			
 
				+experimental[]
			
 
				+
			
 
				+The `simplepattern` tokenizer uses a regular expression to capture matching
			
 
				+text as terms. The set of regular expression features it supports is more
			
 
				+limited than that of the <<analysis-pattern-tokenizer,`pattern`>> tokenizer, but the
			
 
				+tokenization is generally faster.
			
 
				+
			
 
				+This tokenizer does not support splitting the input on a pattern match, unlike
			
 
				+the <<analysis-pattern-tokenizer,`pattern`>> tokenizer. To split on pattern
			
 
				+matches using the same restricted regular expression subset, see the
			
 
				+<<analysis-simplepatternsplit-tokenizer,`simplepatternsplit`>> tokenizer.
			
 
				+
			
 
				+This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
			
 
				+For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.
			
 
				+
			
 
				+The default pattern is the empty string, which produces no terms. This
			
 
				+tokenizer should always be configured with a non-default pattern.
			
 
				+
			
 
				+[float]
			
 
				+=== Configuration
			
 
				+
			
 
				+The `simplepattern` tokenizer accepts the following parameters:
			
 
				+
			
 
				+[horizontal]
			
 
				+`pattern`::
			
 
				+    A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
			
 
				+
			
 
				+[float]
			
 
				+=== Example configuration
			
 
				+
			
 
				+This example configures the `simplepattern` tokenizer to produce terms that are
			
 
				+three-digit numbers.
			
 
				+
			
 
				+[source,js]
			
 
				+----------------------------
			
 
				+PUT my_index
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "my_analyzer": {
			
 
				+          "tokenizer": "my_tokenizer"
			
 
				+        }
			
 
				+      },
			
 
				+      "tokenizer": {
			
 
				+        "my_tokenizer": {
			
 
				+          "type": "simplepattern",
			
 
				+          "pattern": "[0123456789]{3}"
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+POST my_index/_analyze
			
 
				+{
			
 
				+  "analyzer": "my_analyzer",
			
 
				+  "text": "fd-786-335-514-x"
			
 
				+}
			
 
				+----------------------------
			
 
				+// CONSOLE
			
 
				+
			
 
				+/////////////////////
			
 
				+
			
 
				+[source,js]
			
 
				+----------------------------
			
 
				+{
			
 
				+  "tokens" : [
			
 
				+    {
			
 
				+      "token" : "786",
			
 
				+      "start_offset" : 3,
			
 
				+      "end_offset" : 6,
			
 
				+      "type" : "word",
			
 
				+      "position" : 0
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "335",
			
 
				+      "start_offset" : 7,
			
 
				+      "end_offset" : 10,
			
 
				+      "type" : "word",
			
 
				+      "position" : 1
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "514",
			
 
				+      "start_offset" : 11,
			
 
				+      "end_offset" : 14,
			
 
				+      "type" : "word",
			
 
				+      "position" : 2
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
 
				+----------------------------
			
 
				+// TESTRESPONSE
			
 
				+
			
 
				+/////////////////////
			
 
				+
			
 
				+The above example produces these terms:
			
 
				+
			
 
				+[source,text]
			
 
				+---------------------------
			
 
				+[ 786, 335, 514 ]
			
 
				+---------------------------
			
--- a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
@@ -0,0 +1,106 @@
 
				+[[analysis-simplepatternsplit-tokenizer]]
			
 
				+=== Simple Pattern Split Tokenizer
			
 
				+
			
 
				+experimental[]
			
 
				+
			
 
				+The `simplepatternsplit` tokenizer uses a regular expression to split the
			
 
				+input into terms at pattern matches. The set of regular expression features it
			
 
				+supports is more limited than that of the <<analysis-pattern-tokenizer,`pattern`>>
			
 
				+tokenizer, but the tokenization is generally faster.
			
 
				+
			
 
				+This tokenizer does not produce terms from the matches themselves. To produce
			
 
				+terms from matches using patterns in the same restricted regular expression
			
 
				+subset, see the <<analysis-simplepattern-tokenizer,`simplepattern`>>
			
 
				+tokenizer.
			
 
				+
			
 
				+This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
			
 
				+For an explanation of the supported features and syntax, see <<regexp-syntax,Regular Expression Syntax>>.
			
 
				+
			
 
				+The default pattern is the empty string, which produces one term containing the
			
 
				+full input. This tokenizer should always be configured with a non-default
			
 
				+pattern.
			
 
				+
			
 
				+[float]
			
 
				+=== Configuration
			
 
				+
			
 
				+The `simplepatternsplit` tokenizer accepts the following parameters:
			
 
				+
			
 
				+[horizontal]
			
 
				+`pattern`::
			
 
				+    A {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expression], defaults to the empty string.
			
 
				+
			
 
				+[float]
			
 
				+=== Example configuration
			
 
				+
			
 
				+This example configures the `simplepatternsplit` tokenizer to split the input
			
 
				+text on underscores.
			
 
				+
			
 
				+[source,js]
			
 
				+----------------------------
			
 
				+PUT my_index
			
 
				+{
			
 
				+  "settings": {
			
 
				+    "analysis": {
			
 
				+      "analyzer": {
			
 
				+        "my_analyzer": {
			
 
				+          "tokenizer": "my_tokenizer"
			
 
				+        }
			
 
				+      },
			
 
				+      "tokenizer": {
			
 
				+        "my_tokenizer": {
			
 
				+          "type": "simplepatternsplit",
			
 
				+          "pattern": "_"
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+POST my_index/_analyze
			
 
				+{
			
 
				+  "analyzer": "my_analyzer",
			
 
				+  "text": "an_underscored_phrase"
			
 
				+}
			
 
				+----------------------------
			
 
				+// CONSOLE
			
 
				+
			
 
				+/////////////////////
			
 
				+
			
 
				+[source,js]
			
 
				+----------------------------
			
 
				+{
			
 
				+  "tokens" : [
			
 
				+    {
			
 
				+      "token" : "an",
			
 
				+      "start_offset" : 0,
			
 
				+      "end_offset" : 2,
			
 
				+      "type" : "word",
			
 
				+      "position" : 0
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "underscored",
			
 
				+      "start_offset" : 3,
			
 
				+      "end_offset" : 14,
			
 
				+      "type" : "word",
			
 
				+      "position" : 1
			
 
				+    },
			
 
				+    {
			
 
				+      "token" : "phrase",
			
 
				+      "start_offset" : 15,
			
 
				+      "end_offset" : 21,
			
 
				+      "type" : "word",
			
 
				+      "position" : 2
			
 
				+    }
			
 
				+  ]
			
 
				+}
			
 
				+----------------------------
			
 
				+// TESTRESPONSE
			
 
				+
			
 
				+/////////////////////
			
 
				+
			
 
				+The above example produces these terms:
			
 
				+
			
 
				+[source,text]
			
 
				+---------------------------
			
 
				+[ an, underscored, phrase ]
			
 
				+---------------------------
			
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -73,6 +73,7 @@ import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 
				 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
			
 
				 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
			
 
				 import org.elasticsearch.index.analysis.TokenFilterFactory;
			
 
				+import org.elasticsearch.index.analysis.TokenizerFactory;
			
 
				 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
			
 
				 import org.elasticsearch.plugins.AnalysisPlugin;
			
 
				 import org.elasticsearch.plugins.Plugin;
			
@@ -100,6 +101,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
 
				         return filters;
			
 
				     }
			
 
				 
			
 
				+    @Override
			
 
				     public Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
			
 
				         Map<String, AnalysisProvider<CharFilterFactory>> filters = new TreeMap<>();
			
 
				         filters.put("html_strip", HtmlStripCharFilterFactory::new);
			
@@ -108,6 +110,14 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
 
				         return filters;
			
 
				     }
			
 
				 
			
 
				+    @Override
			
 
				+    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
			
 
				+        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
			
 
				+        tokenizers.put("simplepattern", SimplePatternTokenizerFactory::new);
			
 
				+        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory::new);
			
 
				+        return tokenizers;
			
 
				+    }
			
 
				+
			
 
				     @Override
			
 
				     public List<PreConfiguredCharFilter> getPreConfiguredCharFilters() {
			
 
				         List<PreConfiguredCharFilter> filters = new ArrayList<>();
			
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternSplitTokenizerFactory.java
@@ -0,0 +1,43 @@
 
				+/*
			
 
				+ * Licensed to Elasticsearch under one or more contributor
			
 
				+ * license agreements. See the NOTICE file distributed with
			
 
				+ * this work for additional information regarding copyright
			
 
				+ * ownership. Elasticsearch licenses this file to you under
			
 
				+ * the Apache License, Version 2.0 (the "License"); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing,
			
 
				+ * software distributed under the License is distributed on an
			
 
				+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
			
 
				+ * KIND, either express or implied.  See the License for the
			
 
				+ * specific language governing permissions and limitations
			
 
				+ * under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.elasticsearch.analysis.common;
			
 
				+
			
 
				+import org.apache.lucene.analysis.Tokenizer;
			
 
				+import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
			
 
				+import org.elasticsearch.common.settings.Settings;
			
 
				+import org.elasticsearch.env.Environment;
			
 
				+import org.elasticsearch.index.IndexSettings;
			
 
				+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
			
 
				+
			
 
				+public class SimplePatternSplitTokenizerFactory extends AbstractTokenizerFactory {
			
 
				+
			
 
				+    private final String pattern;
			
 
				+
			
 
				+    public SimplePatternSplitTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
			
 
				+        super(indexSettings, name, settings);
			
 
				+
			
 
				+        pattern = settings.get("pattern", "");
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public Tokenizer create() {
			
 
				+        return new SimplePatternSplitTokenizer(pattern);
			
 
				+    }
			
 
				+}
			
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SimplePatternTokenizerFactory.java
@@ -0,0 +1,43 @@
 
				+/*
			
 
				+ * Licensed to Elasticsearch under one or more contributor
			
 
				+ * license agreements. See the NOTICE file distributed with
			
 
				+ * this work for additional information regarding copyright
			
 
				+ * ownership. Elasticsearch licenses this file to you under
			
 
				+ * the Apache License, Version 2.0 (the "License"); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing,
			
 
				+ * software distributed under the License is distributed on an
			
 
				+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
			
 
				+ * KIND, either express or implied.  See the License for the
			
 
				+ * specific language governing permissions and limitations
			
 
				+ * under the License.
			
 
				+ */
			
 
				+
			
 
				+package org.elasticsearch.analysis.common;
			
 
				+
			
 
				+import org.apache.lucene.analysis.Tokenizer;
			
 
				+import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
			
 
				+import org.elasticsearch.common.settings.Settings;
			
 
				+import org.elasticsearch.env.Environment;
			
 
				+import org.elasticsearch.index.IndexSettings;
			
 
				+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
			
 
				+
			
 
				+public class SimplePatternTokenizerFactory extends AbstractTokenizerFactory {
			
 
				+
			
 
				+    private final String pattern;
			
 
				+
			
 
				+    public SimplePatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
			
 
				+        super(indexSettings, name, settings);
			
 
				+
			
 
				+        pattern = settings.get("pattern", "");
			
 
				+    }
			
 
				+
			
 
				+    @Override
			
 
				+    public Tokenizer create() {
			
 
				+        return new SimplePatternTokenizer(pattern);
			
 
				+    }
			
 
				+}
			
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
@@ -43,6 +43,8 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
 
				     @Override
			
 
				     protected Map<String, Class<?>> getTokenizers() {
			
 
				         Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
			
 
				+        tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class);
			
 
				+        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class);
			
 
				         return tokenizers;
			
 
				     }
			
 
				 
			
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
@@ -25,3 +25,33 @@
 
				     - match:  { detail.tokenizer.tokens.0.token: go }
			
 
				     - match:  { detail.tokenizer.tokens.1.token: oo }
			
 
				     - match:  { detail.tokenizer.tokens.2.token: od }
			
 
				+
			
 
				+---
			
 
				+"simplepattern":
			
 
				+    - do:
			
 
				+        indices.analyze:
			
 
				+          body:
			
 
				+            text: "a6bf fooo ff61"
			
 
				+            explain: true
			
 
				+            tokenizer:
			
 
				+              type: simplepattern
			
 
				+              pattern: "[abcdef0123456789]{4}"
			
 
				+    - length: { detail.tokenizer.tokens: 2 }
			
 
				+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
			
 
				+    - match:  { detail.tokenizer.tokens.0.token: a6bf }
			
 
				+    - match:  { detail.tokenizer.tokens.1.token: ff61 }
			
 
				+
			
 
				+---
			
 
				+"simplepatternsplit":
			
 
				+    - do:
			
 
				+        indices.analyze:
			
 
				+          body:
			
 
				+            text: "foo==bar"
			
 
				+            explain: true
			
 
				+            tokenizer:
			
 
				+              type: simplepatternsplit
			
 
				+              pattern: ==
			
 
				+    - length: { detail.tokenizer.tokens: 2 }
			
 
				+    - match:  { detail.tokenizer.name: _anonymous_tokenizer }
			
 
				+    - match:  { detail.tokenizer.tokens.0.token: foo }
			
 
				+    - match:  { detail.tokenizer.tokens.1.token: bar }
			
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -129,25 +129,23 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
 
				 
			
 
				     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
			
 
				         // exposed in ES
			
 
				-        .put("classic",       ClassicTokenizerFactory.class)
			
 
				-        .put("edgengram",     EdgeNGramTokenizerFactory.class)
			
 
				-        .put("keyword",       KeywordTokenizerFactory.class)
			
 
				-        .put("letter",        LetterTokenizerFactory.class)
			
 
				-        .put("lowercase",     LowerCaseTokenizerFactory.class)
			
 
				-        .put("ngram",         NGramTokenizerFactory.class)
			
 
				+        .put("classic", ClassicTokenizerFactory.class)
			
 
				+        .put("edgengram", EdgeNGramTokenizerFactory.class)
			
 
				+        .put("keyword", KeywordTokenizerFactory.class)
			
 
				+        .put("letter", LetterTokenizerFactory.class)
			
 
				+        .put("lowercase", LowerCaseTokenizerFactory.class)
			
 
				+        .put("ngram", NGramTokenizerFactory.class)
			
 
				         .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
			
 
				-        .put("pattern",       PatternTokenizerFactory.class)
			
 
				-        .put("standard",      StandardTokenizerFactory.class)
			
 
				-        .put("thai",          ThaiTokenizerFactory.class)
			
 
				+        .put("pattern", PatternTokenizerFactory.class)
			
 
				+        .put("simplepattern", MovedToAnalysisCommon.class)
			
 
				+        .put("simplepatternsplit", MovedToAnalysisCommon.class)
			
 
				+        .put("standard", StandardTokenizerFactory.class)
			
 
				+        .put("thai", ThaiTokenizerFactory.class)
			
 
				         .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
			
 
				-        .put("whitespace",    WhitespaceTokenizerFactory.class)
			
 
				+        .put("whitespace", WhitespaceTokenizerFactory.class)
			
 
				 
			
 
				         // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
			
 
				-        .put("wikipedia",     Void.class)
			
 
				-
			
 
				-        // TODO: expose these
			
 
				-        .put("simplepattern",    Void.class)
			
 
				-        .put("simplepatternsplit",    Void.class)
			
 
				+        .put("wikipedia", Void.class)
			
 
				         .immutableMap();
			
 
				 
			
 
				     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()