
Add support for customizing the rule file in ICU tokenizer

Lucene allows creating an ICUTokenizer with a special config argument that
enables customization of the rule-based break iterator by providing
custom rule files.

This commit enables this feature. Users can provide a list of RBBI rule
files to the ICU tokenizer.

closes #13146
xuzha committed cd527c5b92 10 years ago
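
For context, here is a minimal sketch (not part of this commit) of the Lucene API the commit message refers to; the class name, rule string, and sample text are illustrative. It mirrors the approach `IcuTokenizerFactory` takes below.

[source,java]
--------------------------------------------------
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class CustomRbbiSketch {
    public static void main(String[] args) throws Exception {
        // Compile an RBBI rule set; this one keeps the whole input as a single token.
        BreakIterator keyword = new RuleBasedBreakIterator(".+ {200};");

        // Use the custom break iterator for Latin script, fall back to the defaults otherwise.
        ICUTokenizerConfig config = new DefaultICUTokenizerConfig(true) {
            @Override
            public BreakIterator getBreakIterator(int script) {
                return script == UScript.LATIN
                    ? (BreakIterator) keyword.clone()
                    : super.getBreakIterator(script);
            }
        };

        // Tokenize some text with the customized configuration.
        try (ICUTokenizer tokenizer = new ICUTokenizer(config)) {
            tokenizer.setReader(new StringReader("Elasticsearch. Wow!"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term.toString()); // "Elasticsearch. Wow!"
            }
            tokenizer.end();
        }
    }
}
--------------------------------------------------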

+ 68 - 0
docs/plugins/analysis-icu.asciidoc

@@ -115,6 +115,74 @@ PUT icu_sample
 --------------------------------------------------
 // AUTOSENSE
 
+===== Rules customization
+
+experimental[]
+
+You can customize the `icu_tokenizer` behavior by specifying per-script rule files; see the
+http://userguide.icu-project.org/boundaryanalysis#TOC-RBBI-Rules[RBBI rules syntax reference]
+for a more detailed explanation.
+
+To add ICU tokenizer rules, set the `rule_files` setting, which should contain a comma-separated list of
+`code:rulefile` pairs in the following format:
+http://unicode.org/iso15924/iso15924-codes.html[four-letter ISO 15924 script code],
+followed by a colon, then a rule file name. Rule files are placed in the `$ES_HOME/config` directory.
+
+As a demonstration of how the rule files can be used, save the following rule file to `$ES_HOME/config/KeywordTokenizer.rbbi`:
+
+[source,text]
+-----------------------
+.+ {200};
+-----------------------
+
+Then create an analyzer to use this rule file as follows:
+
+[source,json]
+--------------------------------------------------
+PUT icu_sample
+{
+    "settings": {
+        "index":{
+            "analysis":{
+                "tokenizer" : {
+                    "icu_user_file" : {
+                       "type" : "icu_tokenizer",
+                       "rule_files" : "Latn:KeywordTokenizer.rbbi"
+                    }
+                },
+                "analyzer" : {
+                    "my_analyzer" : {
+                        "type" : "custom",
+                        "tokenizer" : "icu_user_file"
+                    }
+                }
+            }
+        }
+    }
+}
+
+POST icu_sample/_analyze?analyzer=my_analyzer&text=Elasticsearch. Wow!
+--------------------------------------------------
+// AUTOSENSE
+
+The above `analyze` request returns the following:
+
+[source,json]
+--------------------------------------------------
+# Result
+{
+   "tokens": [
+      {
+         "token": "Elasticsearch. Wow!",
+         "start_offset": 0,
+         "end_offset": 19,
+         "type": "<ALPHANUM>",
+         "position": 0
+      }
+   ]
+}
+--------------------------------------------------
+
 
 [[analysis-icu-normalization]]
 ==== ICU Normalization Token Filter
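
The docs hunk above only demonstrates a single rule file. Since `rule_files` is a list setting (see the test fixture at the end of this commit), one file per script can be supplied; a hypothetical request with placeholder file names might look like this:

[source,json]
--------------------------------------------------
PUT icu_sample
{
    "settings": {
        "index": {
            "analysis": {
                "tokenizer": {
                    "icu_multi_file": {
                        "type": "icu_tokenizer",
                        "rule_files": ["Latn:MyLatinRules.rbbi", "Cyrl:MyCyrillicRules.rbbi"]
                    }
                }
            }
        }
    }
}
--------------------------------------------------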

+ 87 - 1
plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTokenizerFactory.java

@@ -19,23 +19,109 @@
 
 package org.elasticsearch.index.analysis;
 
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
 import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+
 /**
  */
 public class IcuTokenizerFactory extends AbstractTokenizerFactory {
 
+    private final ICUTokenizerConfig config;
+    private static final String RULE_FILES = "rule_files";
+
+    public static final Setting<List<String>> SETTING_RULE_FILES =
+        Setting.listSetting(RULE_FILES, Collections.emptyList(), Function.identity(), Setting.Property.IndexScope);
+
     public IcuTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
+        config = getIcuConfig(environment, settings);
     }
 
     @Override
     public Tokenizer create() {
-        return new ICUTokenizer();
+        if (config == null) {
+            return new ICUTokenizer();
+        } else {
+            return new ICUTokenizer(config);
+        }
     }
 
+    private ICUTokenizerConfig getIcuConfig(Environment env, Settings settings) {
+        Map<Integer, String> tailored = new HashMap<>();
+
+        try {
+            List<String> ruleFiles = SETTING_RULE_FILES.get(settings);
+
+            for (String scriptAndResourcePath : ruleFiles) {
+                int colonPos = scriptAndResourcePath.indexOf(":");
+                if (colonPos == -1 || colonPos == scriptAndResourcePath.length() - 1) {
+                    throw new IllegalArgumentException(RULE_FILES + " should contain comma-separated \"code:rulefile\" pairs");
+                }
+
+                String scriptCode = scriptAndResourcePath.substring(0, colonPos).trim();
+                String resourcePath = scriptAndResourcePath.substring(colonPos + 1).trim();
+                tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
+            }
+
+            if (tailored.isEmpty()) {
+                return null;
+            } else {
+                final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
+                for (Map.Entry<Integer, String> entry : tailored.entrySet()) {
+                    int code = entry.getKey();
+                    String resourcePath = entry.getValue();
+                    breakers[code] = parseRules(resourcePath, env);
+                }
+                // cjkAsWords is not configurable yet.
+                ICUTokenizerConfig config = new DefaultICUTokenizerConfig(true) {
+                    @Override
+                    public BreakIterator getBreakIterator(int script) {
+                        if (breakers[script] != null) {
+                            return (BreakIterator) breakers[script].clone();
+                        } else {
+                            return super.getBreakIterator(script);
+                        }
+                    }
+                };
+                return config;
+            }
+        } catch (Throwable t) {
+            throw new ElasticsearchException("failed to load ICU rule files", t);
+        }
+    }
+
+    // parse a single RBBI rule file
+    private BreakIterator parseRules(String filename, Environment env) throws IOException {
+
+        final Path path = env.configFile().resolve(filename);
+        String rules = Files.readAllLines(path)
+            .stream()
+            .filter((v) -> v.startsWith("#") == false)
+            .collect(Collectors.joining("\n"));
+
+        return new RuleBasedBreakIterator(rules);
+    }
 }
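
The factory above resolves each script code with `UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode)`. A small illustrative sketch of that ICU4J call follows (the class name is made up for the example); ICU matches property value names loosely, which is why the test fixture below can use the lowercase code `thai`.

[source,java]
--------------------------------------------------
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;

public class ScriptCodeSketch {
    public static void main(String[] args) {
        // "Latn" (ISO 15924) resolves to the UScript constant used to index the breakers array.
        int latin = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, "Latn");
        System.out.println(latin == UScript.LATIN);   // true

        // Matching is loose, so the lowercase "thai" from the test fixture also resolves.
        int thai = UCharacter.getPropertyValueEnum(UProperty.SCRIPT, "thai");
        System.out.println(thai == UScript.THAI);     // true

        // An unknown code throws an IllegalArgumentException, which the factory wraps
        // in ElasticsearchException("failed to load ICU rule files", ...).
        // UCharacter.getPropertyValueEnum(UProperty.SCRIPT, "nope");
    }
}
--------------------------------------------------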

+ 5 - 0
plugins/analysis-icu/src/main/java/org/elasticsearch/plugin/analysis/icu/AnalysisICUPlugin.java

@@ -19,6 +19,7 @@
 
 package org.elasticsearch.plugin.analysis.icu;
 
+import org.elasticsearch.common.settings.SettingsModule;
 import org.elasticsearch.index.analysis.IcuCollationTokenFilterFactory;
 import org.elasticsearch.index.analysis.IcuFoldingTokenFilterFactory;
 import org.elasticsearch.index.analysis.IcuNormalizerCharFilterFactory;
@@ -54,4 +55,8 @@ public class AnalysisICUPlugin extends Plugin {
         module.registerTokenFilter("icu_collation", IcuCollationTokenFilterFactory::new);
         module.registerTokenFilter("icu_transform", IcuTransformTokenFilterFactory::new);
     }
+
+    public void onModule(SettingsModule settingsModule) {
+        settingsModule.registerSetting(IcuTokenizerFactory.SETTING_RULE_FILES);
+    }
 }

+ 107 - 0
plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/IcuTokenizerFactoryTests.java

@@ -0,0 +1,107 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin;
+import org.elasticsearch.test.ESTestCase;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
+
+/**
+ */
+public class IcuTokenizerFactoryTests extends ESTestCase {
+
+    public void testSimpleIcuTokenizer() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("icu_tokenizer");
+        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
+
+        Reader reader = new StringReader("向日葵, one-two");
+        tokenizer.setReader(reader);
+        assertTokenStreamContents(tokenizer, new String[]{"向日葵", "one", "two"});
+    }
+
+    public void testIcuCustomizeRuleFile() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+
+        // test the tokenizer with single rule file
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("user_rule_tokenizer");
+        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
+        Reader reader = new StringReader
+            ("One-two punch.  Brang-, not brung-it.  This one--not that one--is the right one, -ish.");
+
+        tokenizer.setReader(reader);
+        assertTokenStreamContents(tokenizer,
+            new String[]{"One-two", "punch", "Brang", "not", "brung-it",
+                "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish"});
+    }
+
+    public void testMultipleIcuCustomizeRuleFiles() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+
+        // test the tokenizer with two rule files
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("multi_rule_tokenizer");
+        ICUTokenizer tokenizer = (ICUTokenizer) tokenizerFactory.create();
+        StringReader reader = new StringReader
+            ("Some English.  Немного русский.  ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  More English.");
+
+        tokenizer.setReader(reader);
+        assertTokenStreamContents(tokenizer, new String[]{"Some", "English",
+            "Немного русский.  ",
+            "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ  ",
+            "More", "English"});
+    }
+
+
+    private static AnalysisService createAnalysisService() throws IOException {
+        InputStream keywords = IcuTokenizerFactoryTests.class.getResourceAsStream("KeywordTokenizer.rbbi");
+        InputStream latin = IcuTokenizerFactoryTests.class.getResourceAsStream("Latin-dont-break-on-hyphens.rbbi");
+
+        Path home = createTempDir();
+        Path config = home.resolve("config");
+        Files.createDirectory(config);
+        Files.copy(keywords, config.resolve("KeywordTokenizer.rbbi"));
+        Files.copy(latin, config.resolve("Latin-dont-break-on-hyphens.rbbi"));
+
+        String json = "/org/elasticsearch/index/analysis/icu_analysis.json";
+
+        Settings settings = Settings.builder()
+            .loadFromStream(json, IcuTokenizerFactoryTests.class.getResourceAsStream(json))
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .build();
+        Settings nodeSettings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), home).build();
+
+        return createAnalysisService(new Index("test", "_na_"), nodeSettings, settings, new AnalysisICUPlugin()::onModule);
+    }
+}

+ 21 - 0
plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/KeywordTokenizer.rbbi

@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# RBBI Keyword tokenizer: keep everything as a single token.
+
+# Apply rule status {200}=RBBI.WORD_LETTER, which is mapped
+# to <ALPHANUM> token type by DefaultICUTokenizerConfig.
+.+ {200};

+ 135 - 0
plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/Latin-dont-break-on-hyphens.rbbi

@@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
+# Added dashes to $MidLetter, so that words aren't broken on single dashes.
+#
+
+!!chain;
+
+#
+#  Character Class Definitions.
+#
+
+$CR           = [\p{Word_Break = CR}];
+$LF           = [\p{Word_Break = LF}];
+$Newline      = [\p{Word_Break = Newline}];
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$Katakana     = [\p{Word_Break = Katakana}];
+$ALetter      = [\p{Word_Break = ALetter}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
+$Dash         = [\N{HYPHEN-MINUS}
+                 \N{HYPHEN}
+                 \N{EN DASH}
+                 \N{MINUS SIGN}
+                 \N{SMALL HYPHEN-MINUS}
+                 \N{FULLWIDTH HYPHEN-MINUS}];
+$MidLetter    = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+
+
+#   Dictionary character set, for triggering language-based break engines. Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include all
+#   characters requiring dictionary break.
+
+$dictionary   = [:LineBreak = Complex_Context:];
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
+                                                             #  include the dictionary characters.
+
+#
+#  Rules 4    Ignore Format and Extend characters,
+#             except when they appear at the beginning of a region of text.
+#
+$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
+$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+$Hiragana       = [\p{script=Hiragana}];
+$Ideographic    = [\p{Ideographic}];
+$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
+$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
+
+## -------------------------------------------------
+
+!!forward;
+
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
+#          of a region of Text.   The rule here comes into play when the start of text
+#          begins with a group of Format chars, or with a "word" consisting of a single
+#          char that is not in any of the listed word break categories followed by
+#          format char(s).
+[^$CR $LF $Newline]? ($Extend |  $Format)+;
+
+$NumericEx {100};
+$ALetterEx {200};
+$KatakanaEx {300};       # note:  these status values override those from rule 5
+$HiraganaEx {300};       #        by virtue of being numerically larger.
+$IdeographicEx {400};    #
+
+#
+# rule 5
+#    Do not break between most letters.
+#
+$ALetterEx $ALetterEx {200};
+
+# rule 6 and 7
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+
+# rule 8
+
+$NumericEx $NumericEx {100};
+
+# rule 9
+
+$ALetterEx $NumericEx {200};
+
+# rule 10
+
+$NumericEx $ALetterEx {200};
+
+# rule 11 and 12
+
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+
+# rule 13
+
+$KatakanaEx  $KatakanaEx {300};
+
+# rule 13a/b
+
+$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
+$NumericEx      $ExtendNumLetEx {100};    #  (13a)
+$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
+$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
+
+$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
+$ExtendNumLetEx $NumericEx  {100};    #  (13b)
+$ExtendNumLetEx $KatakanaEx {300};    #  (13b)

+ 20 - 0
plugins/analysis-icu/src/test/resources/org/elasticsearch/index/analysis/icu_analysis.json

@@ -0,0 +1,20 @@
+{
+    "index":{
+        "analysis":{
+            "tokenizer" : {
+                "icu_tokenizer" : {
+                    "type":"icu_tokenizer"
+                },
+                "user_rule_tokenizer" : {
+                    "type":"icu_tokenizer",
+                    "rule_files":"Latn:Latin-dont-break-on-hyphens.rbbi"
+                },
+                "multi_rule_tokenizer" : {
+                    "type":"icu_tokenizer",
+                    "rule_files":["Cyrl:KeywordTokenizer.rbbi", "thai:KeywordTokenizer.rbbi"]
+
+                }
+            }
+        }
+    }
+}