5 years ago · da31b4b83d
--- a/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/chargroup-tokenizer.asciidoc
@@ -13,11 +13,15 @@ The `char_group` tokenizer accepts one parameter:
 
				 
			
 
				 [horizontal]
			
 
				 `tokenize_on_chars`::
			
 
				-    A list containing a list of characters to tokenize the string on. Whenever a character 
			
 
				+    A list of characters to tokenize the string on. Whenever a character
			
 
				     from this list is encountered, a new token is started. This accepts either single
			
 
				     characters like e.g. `-`, or character groups: `whitespace`, `letter`, `digit`,
			
 
				     `punctuation`, `symbol`.
			
 
				 
			
 
				+`max_token_length`::
			
 
				+    The maximum token length. If a token exceeds this length, then
			
 
				+    it is split at `max_token_length` intervals. Defaults to `255`.
			
 
				+
			
 
				 
			
 
				 [float]
			
 
				 === Example output
			
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactory.java
@@ -21,6 +21,7 @@ package org.elasticsearch.analysis.common;
 
				 
			
 
				 import org.apache.lucene.analysis.Tokenizer;
			
 
				 import org.apache.lucene.analysis.util.CharTokenizer;
			
 
				+import org.apache.lucene.util.AttributeFactory;
			
 
				 import org.elasticsearch.common.settings.Settings;
			
 
				 import org.elasticsearch.env.Environment;
			
 
				 import org.elasticsearch.index.IndexSettings;
			
@@ -31,7 +32,10 @@ import java.util.Set;
 
				 
			
 
				 public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
			
 
				 
			
 
				+    static final String MAX_TOKEN_LENGTH = "max_token_length";
			
 
				+
			
 
				     private final Set<Integer> tokenizeOnChars = new HashSet<>();
			
 
				+    private final Integer maxTokenLength;
			
 
				     private boolean tokenizeOnSpace = false;
			
 
				     private boolean tokenizeOnLetter = false;
			
 
				     private boolean tokenizeOnDigit = false;
			
@@ -41,6 +45,8 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
 
				     public CharGroupTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
			
 
				         super(indexSettings, settings, name);
			
 
				 
			
 
				+        maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, CharTokenizer.DEFAULT_MAX_WORD_LEN);
			
 
				+
			
 
				         for (final String c : settings.getAsList("tokenize_on_chars")) {
			
 
				             if (c == null || c.length() == 0) {
			
 
				                 throw new RuntimeException("[tokenize_on_chars] cannot contain empty characters");
			
@@ -110,7 +116,7 @@ public class CharGroupTokenizerFactory extends AbstractTokenizerFactory{
 
				 
			
 
				     @Override
			
 
				     public Tokenizer create() {
			
 
				-        return new CharTokenizer() {
			
 
				+        return new CharTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, maxTokenLength) {
			
 
				             @Override
			
 
				             protected boolean isTokenChar(int c) {
			
 
				                 if (tokenizeOnSpace && Character.isWhitespace(c)) {
			
--- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CharGroupTokenizerFactoryTests.java
@@ -19,7 +19,9 @@
 
				 
			
 
				 package org.elasticsearch.analysis.common;
			
 
				 
			
 
				+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
			
 
				 import org.apache.lucene.analysis.Tokenizer;
			
 
				+import org.apache.lucene.analysis.util.CharTokenizer;
			
 
				 import org.elasticsearch.common.settings.Settings;
			
 
				 import org.elasticsearch.index.Index;
			
 
				 import org.elasticsearch.index.IndexSettings;
			
@@ -27,11 +29,12 @@ import org.elasticsearch.test.ESTokenStreamTestCase;
 
				 import org.elasticsearch.test.IndexSettingsModule;
			
 
				 
			
 
				 import java.io.IOException;
			
 
				+import java.io.Reader;
			
 
				 import java.io.StringReader;
			
 
				 import java.util.Arrays;
			
 
				 
			
 
				-
			
 
				 public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
			
 
				+
			
 
				     public void testParseTokenChars() {
			
 
				         final Index index = new Index("test", "_na_");
			
 
				         final Settings indexSettings = newAnalysisSettingsBuilder().build();
			
@@ -61,6 +64,53 @@ public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {
 
				         }
			
 
				     }
			
 
				 
			
 
				+    public void testMaxTokenLength() throws IOException {
			
 
				+        final Index index = new Index("test", "_na_");
			
 
				+        final Settings indexSettings = newAnalysisSettingsBuilder().build();
			
 
				+        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
			
 
				+        final String name = "cg";
			
 
				+
			
 
				+        String[] conf = new String[] {"-"};
			
 
				+
			
 
				+        final Settings defaultLengthSettings = newAnalysisSettingsBuilder()
			
 
				+            .putList("tokenize_on_chars", conf)
			
 
				+            .build();
			
 
				+        CharTokenizer tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, defaultLengthSettings)
			
 
				+            .create();
			
 
				+        String textWithVeryLongToken = RandomStrings.randomAsciiAlphanumOfLength(random(), 256).concat("-trailing");
			
 
				+        try (Reader reader = new StringReader(textWithVeryLongToken)) {
			
 
				+            tokenizer.setReader(reader);
			
 
				+            assertTokenStreamContents(tokenizer, new String[] { textWithVeryLongToken.substring(0, 255),
			
 
				+                textWithVeryLongToken.substring(255, 256), "trailing"});
			
 
				+        }
			
 
				+
			
 
				+        final Settings analysisSettings = newAnalysisSettingsBuilder()
			
 
				+            .putList("tokenize_on_chars", conf)
			
 
				+            .put("max_token_length", 2)
			
 
				+            .build();
			
 
				+        tokenizer = (CharTokenizer) new CharGroupTokenizerFactory(indexProperties, null, name, analysisSettings).create();
			
 
				+        try (Reader reader = new StringReader("one-two-three")) {
			
 
				+            tokenizer.setReader(reader);
			
 
				+            assertTokenStreamContents(tokenizer, new String[] { "on", "e", "tw", "o", "th", "re", "e" });
			
 
				+        }
			
 
				+
			
 
				+        final Settings tooLongLengthSettings = newAnalysisSettingsBuilder()
			
 
				+            .putList("tokenize_on_chars", conf)
			
 
				+            .put("max_token_length", 1024 * 1024 + 1)
			
 
				+            .build();
			
 
				+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
			
 
				+            () -> new CharGroupTokenizerFactory(indexProperties, null, name, tooLongLengthSettings).create());
			
 
				+        assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 1048577", e.getMessage());
			
 
				+
			
 
				+        final Settings negativeLengthSettings = newAnalysisSettingsBuilder()
			
 
				+            .putList("tokenize_on_chars", conf)
			
 
				+            .put("max_token_length", -1)
			
 
				+            .build();
			
 
				+        e = expectThrows(IllegalArgumentException.class,
			
 
				+            () -> new CharGroupTokenizerFactory(indexProperties, null, name, negativeLengthSettings).create());
			
 
				+        assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
			
 
				+    }
			
 
				+
			
 
				     public void testTokenization() throws IOException {
			
 
				         final Index index = new Index("test", "_na_");
			
 
				         final String name = "cg";