[Feature] Adding a char_group tokenizer (#24186)

=== Char Group Tokenizer

The `char_group` tokenizer breaks text into terms whenever it encounters a
character which is in a defined set. It is mostly useful for cases where a
simple custom tokenization is desired, and the overhead of using the
<<analysis-pattern-tokenizer, `pattern` tokenizer>> is not acceptable.

=== Configuration

The `char_group` tokenizer accepts one parameter:

`tokenize_on_chars`::
    A string containing a list of characters to tokenize the string on.
    Whenever a character from this list is encountered, a new token is
    started. Also supports escaped values like `\\n` and `\\f`, and in
    addition `\\s` to represent whitespace, `\\d` to represent digits and
    `\\w` to represent letters. Defaults to an empty list.

=== Example output

```The 2 QUICK Brown-Foxes jumped over the lazy dog's bone for $2```

When the configuration `\\s-:<>` is used for `tokenize_on_chars`, the
above sentence would produce the following terms:

```[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone, for, $2 ]```
Itamar Syn-Hershko, 7 years ago
revision 5f172b6795

+ 7 - 0
docs/reference/analysis/tokenizers.asciidoc

@@ -103,6 +103,11 @@ The `simple_pattern` tokenizer uses a regular expression to capture matching
 text as terms. It uses a restricted subset of regular expression features
 and is generally faster than the `pattern` tokenizer.
 
+<<analysis-chargroup-tokenizer,Char Group Tokenizer>>::
+
+The `char_group` tokenizer is configurable through sets of characters to split
+on, which is usually less expensive than running regular expressions.
+
 <<analysis-simplepatternsplit-tokenizer,Simple Pattern Split Tokenizer>>::
 
 The `simple_pattern_split` tokenizer uses the same restricted regular expression
@@ -143,6 +148,8 @@ include::tokenizers/keyword-tokenizer.asciidoc[]
 
 include::tokenizers/pattern-tokenizer.asciidoc[]
 
+include::tokenizers/chargroup-tokenizer.asciidoc[]
+
 include::tokenizers/simplepattern-tokenizer.asciidoc[]
 
 include::tokenizers/simplepatternsplit-tokenizer.asciidoc[]

+ 80 - 0
docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc

@@ -0,0 +1,80 @@
+[[analysis-chargroup-tokenizer]]
+=== Char Group Tokenizer
+
+The `char_group` tokenizer breaks text into terms whenever it encounters a
+character which is in a defined set. It is mostly useful for cases where a simple
+custom tokenization is desired, and the overhead of using the <<analysis-pattern-tokenizer, `pattern` tokenizer>>
+is not acceptable.
+
+[float]
+=== Configuration
+
+The `char_group` tokenizer accepts one parameter:
+
+[horizontal]
+`tokenize_on_chars`::
+    A list of characters to tokenize the string on. Whenever a character
+    from this list is encountered, a new token is started. The list accepts
+    either single characters such as `-`, or character groups: `whitespace`,
+    `letter`, `digit`, `punctuation`, `symbol`.
+
+
+[float]
+=== Example output
+
+[source,js]
+---------------------------
+POST _analyze
+{
+  "tokenizer": {
+    "type": "char_group",
+    "tokenize_on_chars": [
+      "whitespace",
+      "-",
+      "\n"
+    ]
+  },
+  "text": "The QUICK brown-fox"
+}
+---------------------------
+// CONSOLE
+
+returns
+
+[source,js]
+---------------------------
+{
+  "tokens": [
+    {
+      "token": "The",
+      "start_offset": 0,
+      "end_offset": 3,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "QUICK",
+      "start_offset": 4,
+      "end_offset": 9,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "brown",
+      "start_offset": 10,
+      "end_offset": 15,
+      "type": "word",
+      "position": 2
+    },
+    {
+      "token": "fox",
+      "start_offset": 16,
+      "end_offset": 19,
+      "type": "word",
+      "position": 3
+    }
+  ]
+}
+---------------------------
+// TESTRESPONSE
+

+ 135 - 0
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java

@@ -0,0 +1,135 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.CharTokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
+import org.elasticsearch.index.analysis.CharMatcher;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class CharGroupTokenizerFactory extends AbstractTokenizerFactory {
+
+    private final Set<Integer> tokenizeOnChars = new HashSet<>();
+    private boolean tokenizeOnSpace = false;
+    private boolean tokenizeOnLetter = false;
+    private boolean tokenizeOnDigit = false;
+    private boolean tokenizeOnPunctuation = false;
+    private boolean tokenizeOnSymbol = false;
+
+    public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
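+        // Each configured value is either a single literal character, a backslash
+        // escape such as \n or \u0024, or a named character group: "letter",
+        // "digit", "whitespace", "punctuation" or "symbol".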
+        for (final String c : settings.getAsList("tokenize_on_chars")) {
+            if (c == null || c.length() == 0) {
+                throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
+            }
+
+            if (c.length() == 1) {
+                tokenizeOnChars.add((int) c.charAt(0));
+            }
+            else if (c.charAt(0) == '\\') {
+                tokenizeOnChars.add((int) parseEscapedChar(c));
+            } else {
+                switch (c) {
+                    case "letter":
+                        tokenizeOnLetter = true;
+                        break;
+                    case "digit":
+                        tokenizeOnDigit = true;
+                        break;
+                    case "whitespace":
+                        tokenizeOnSpace = true;
+                        break;
+                    case "punctuation":
+                        tokenizeOnPunctuation = true;
+                        break;
+                    case "symbol":
+                        tokenizeOnSymbol = true;
+                        break;
+                    default:
+                        throw new RuntimeException("Invalid escaped char in [" + c + "]");
+                }
+            }
+        }
+    }
+
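+    // Translates a backslash escape (\\, \n, \t, \r, \b, \f or \uXXXX) into the
+    // character it denotes; anything else is rejected with a RuntimeException.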
+    private char parseEscapedChar(final String s) {
+        int len = s.length();
+        char c = s.charAt(0);
+        if (c == '\\') {
+            if (1 >= len)
+                throw new RuntimeException("Invalid escaped char in [" + s + "]");
+            c = s.charAt(1);
+            switch (c) {
+                case '\\':
+                    return '\\';
+                case 'n':
+                    return '\n';
+                case 't':
+                    return '\t';
+                case 'r':
+                    return '\r';
+                case 'b':
+                    return '\b';
+                case 'f':
+                    return '\f';
+                case 'u':
+                    if (len > 6) {
+                        throw new RuntimeException("Invalid escaped char in [" + s + "]");
+                    }
+                    return (char) Integer.parseInt(s.substring(2), 16);
+                default:
+                    throw new RuntimeException("Invalid escaped char " + c + " in [" + s + "]");
+            }
+        } else {
+            throw new RuntimeException("Invalid escaped char [" + s + "]");
+        }
+    }
+
+    @Override
+    public Tokenizer create() {
+        return new CharTokenizer() {
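+            // CharTokenizer emits maximal runs of characters for which isTokenChar
+            // returns true; returning false here marks the character as a
+            // separator, so the stream is split at it.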
+            @Override
+            protected boolean isTokenChar(int c) {
+                if (tokenizeOnSpace && Character.isWhitespace(c)) {
+                    return false;
+                }
+                if (tokenizeOnLetter && Character.isLetter(c)) {
+                    return false;
+                }
+                if (tokenizeOnDigit && Character.isDigit(c)) {
+                    return false;
+                }
+                if (tokenizeOnPunctuation && CharMatcher.Basic.PUNCTUATION.isTokenChar(c)) {
+                    return false;
+                }
+                if (tokenizeOnSymbol && CharMatcher.Basic.SYMBOL.isTokenChar(c)) {
+                    return false;
+                }
+                return !tokenizeOnChars.contains(c);
+            }
+        };
+    }
+}
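
Not part of this commit: for readers who want to see the splitting contract in isolation, below is a minimal, self-contained sketch that reproduces the factory's behavior directly against Lucene's `CharTokenizer`. The class name `CharGroupDemo` and the hard-coded separator set are illustrative assumptions, not code from this change.

```java
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class CharGroupDemo {
    public static void main(String[] args) throws Exception {
        // Split on whitespace plus the literal characters ':' and '$',
        // mirroring the settings used in testTokenization below.
        Set<Integer> splitOn = new HashSet<>(Arrays.asList((int) ':', (int) '$'));

        Tokenizer tokenizer = new CharTokenizer() {
            @Override
            protected boolean isTokenChar(int c) {
                // Keep the character unless it is whitespace or a configured separator.
                return !Character.isWhitespace(c) && !splitOn.contains(c);
            }
        };
        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));

        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // foo, bar, 34, test, test2
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```

Running it prints the same five tokens the factory test below asserts.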

+ 1 - 0
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java

@@ -184,6 +184,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
         tokenizers.put("ngram", NGramTokenizerFactory::new);
         tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
         tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
+        tokenizers.put("char_group", CharGroupTokenizerFactory::new);
         tokenizers.put("classic", ClassicTokenizerFactory::new);
         tokenizers.put("letter", LetterTokenizerFactory::new);
         tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);

+ 74 - 0
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java

@@ -0,0 +1,74 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Arrays;
+
+
+public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
+    public void testParseTokenChars() {
+        final Index index = new Index("test", "_na_");
+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
+        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
+        final String name = "cg";
+        for (String[] conf : Arrays.asList(
+                new String[] { "\\v" },
+                new String[] { "\\u00245" },
+                new String[] { "commas" },
+                new String[] { "a", "b", "c", "\\$" })) {
+            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", conf).build();
+            expectThrows(RuntimeException.class, () -> new CharGroupTokenizerFactory(indexProperties, null, name, settings).create());
+        }
+
+        for (String[] conf : Arrays.asList(
+                new String[0],
+                new String[] { "\\n" },
+                new String[] { "\\u0024" },
+                new String[] { "whitespace" },
+                new String[] { "a", "b", "c" },
+                new String[] { "a", "b", "c", "\\r" },
+                new String[] { "\\r" },
+                new String[] { "f", "o", "o", "symbol" })) {
+            final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", Arrays.asList(conf)).build();
+            new CharGroupTokenizerFactory(indexProperties, null, name, settings).create();
+            // no exception
+        }
+    }
+
+    public void testTokenization() throws IOException {
+        final Index index = new Index("test", "_na_");
+        final String name = "cg";
+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
+        final Settings settings = newAnalysisSettingsBuilder().putList("tokenize_on_chars", "whitespace", ":", "\\u0024").build();
+        Tokenizer tokenizer = new CharGroupTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings),
+                null, name, settings).create();
+        tokenizer.setReader(new StringReader("foo bar $34 test:test2"));
+        assertTokenStreamContents(tokenizer, new String[] {"foo", "bar", "34", "test", "test2"});
+    }
+}