Browse Source

Added Duplicate Word Check Feature to Analysis Nori (#103325)

Added Duplicate Word Check Feature to Analysis Nori

---------

Co-authored-by: Jim Ferenczi <jim.ferenczi@elastic.co>
twosom 1 year ago
parent
commit
e1cd77e687

+ 6 - 0
docs/changelog/103325.yaml

@@ -0,0 +1,6 @@
+pr: 103325
+summary: Added Duplicate Word Check Feature to Analysis Nori
+area: Search
+type: feature
+issues:
+ - 103321

+ 1 - 22
plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiTokenizerFactory.java

@@ -12,7 +12,6 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
 import org.apache.lucene.analysis.ja.dict.UserDictionary;
-import org.apache.lucene.analysis.util.CSVUtil;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
@@ -23,10 +22,8 @@ import org.elasticsearch.index.analysis.Analysis;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
-import java.util.Set;
 
 public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
 
@@ -60,11 +57,10 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
                 "It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"
             );
         }
-        List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false);
+        List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false, true);
         if (ruleList == null || ruleList.isEmpty()) {
             return null;
         }
-        validateDuplicatedWords(ruleList);
         StringBuilder sb = new StringBuilder();
         for (String line : ruleList) {
             sb.append(line).append(System.lineSeparator());
@@ -76,23 +72,6 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
         }
     }
 
-    private static void validateDuplicatedWords(List<String> ruleList) {
-        Set<String> dup = new HashSet<>();
-        int lineNum = 0;
-        for (String line : ruleList) {
-            // ignore comments
-            if (line.startsWith("#") == false) {
-                String[] values = CSVUtil.parse(line);
-                if (dup.add(values[0]) == false) {
-                    throw new IllegalArgumentException(
-                        "Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + lineNum + "]"
-                    );
-                }
-            }
-            ++lineNum;
-        }
-    }
-
     public static JapaneseTokenizer.Mode getMode(Settings settings) {
         String modeSetting = settings.get("mode", JapaneseTokenizer.DEFAULT_MODE.name());
         return JapaneseTokenizer.Mode.valueOf(modeSetting.toUpperCase(Locale.ENGLISH));

+ 1 - 1
plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/NoriAnalyzerProvider.java

@@ -30,7 +30,7 @@ public class NoriAnalyzerProvider extends AbstractIndexAnalyzerProvider<KoreanAn
     public NoriAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(name, settings);
         final KoreanTokenizer.DecompoundMode mode = NoriTokenizerFactory.getMode(settings);
-        final UserDictionary userDictionary = NoriTokenizerFactory.getUserDictionary(env, settings);
+        final UserDictionary userDictionary = NoriTokenizerFactory.getUserDictionary(env, settings, indexSettings);
         final List<String> tagList = Analysis.getWordList(env, settings, "stoptags");
         final Set<POS.Tag> stopTags = tagList != null ? resolvePOSList(tagList) : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS;
         analyzer = new KoreanAnalyzer(userDictionary, mode, stopTags, false);

+ 26 - 3
plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/NoriTokenizerFactory.java

@@ -15,6 +15,7 @@ import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
 import org.elasticsearch.index.analysis.Analysis;
 
@@ -35,17 +36,24 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
     public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, settings, name);
         decompoundMode = getMode(settings);
-        userDictionary = getUserDictionary(env, settings);
+        userDictionary = getUserDictionary(env, settings, indexSettings);
         discardPunctuation = settings.getAsBoolean("discard_punctuation", true);
     }
 
-    public static UserDictionary getUserDictionary(Environment env, Settings settings) {
+    public static UserDictionary getUserDictionary(Environment env, Settings settings, IndexSettings indexSettings) {
         if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
             throw new IllegalArgumentException(
                 "It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" + " with [" + USER_DICT_RULES_OPTION + "]"
             );
         }
-        List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, true);
+        List<String> ruleList = Analysis.getWordList(
+            env,
+            settings,
+            USER_DICT_PATH_OPTION,
+            USER_DICT_RULES_OPTION,
+            true,
+            isSupportDuplicateCheck(indexSettings)
+        );
         if (ruleList == null || ruleList.isEmpty()) {
             return null;
         }
@@ -60,6 +68,21 @@ public class NoriTokenizerFactory extends AbstractTokenizerFactory {
         }
     }
 
+    /**
+     * Determines if the specified index version supports duplicate checks.
+     * This method checks whether the index was created on version 8.13.0
+     * or later.
+     * The duplicate-check feature was introduced in version 8.13.0,
+     * so indices created on earlier versions do not support it.
+     *
+     * @param indexSettings The settings of the index in question.
+     * @return true if the index was created on version 8.13.0 or later,
+     * meaning the duplicate-check feature is supported.
+     */
+    private static boolean isSupportDuplicateCheck(IndexSettings indexSettings) {
+        return indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.ES_VERSION_8_13);
+    }
+
     public static KoreanTokenizer.DecompoundMode getMode(Settings settings) {
         String modeSetting = settings.get("decompound_mode", KoreanTokenizer.DEFAULT_DECOMPOUND.name());
         return KoreanTokenizer.DecompoundMode.valueOf(modeSetting.toUpperCase(Locale.ENGLISH));

+ 26 - 0
plugins/analysis-nori/src/test/java/org/elasticsearch/plugin/analysis/nori/NoriAnalysisTests.java

@@ -17,6 +17,7 @@ import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexVersion;
+import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.analysis.AnalysisTestsHelper;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
@@ -117,6 +118,31 @@ public class NoriAnalysisTests extends ESTokenStreamTestCase {
         );
     }
 
+    public void testNoriAnalyzerDuplicateUserDictRule() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "nori")
+            .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersions.ES_VERSION_8_13)
+            .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C쁠쁠", "세종", "세종", "세종시 세종 시")
+            .build();
+
+        final IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
+        assertThat(exc.getMessage(), containsString("[세종] in user dictionary at line [3]"));
+    }
+
+    public void testNoriAnalyzerDuplicateUserDictRuleWithLegacyVersion() throws IOException {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "nori")
+            .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersions.V_8_10_0)
+            .putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++", "C쁠쁠", "세종", "세종", "세종시 세종 시")
+            .build();
+
+        final TestAnalysis analysis = createTestAnalysis(settings);
+        Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        try (TokenStream stream = analyzer.tokenStream("", "세종")) {
+            assertTokenStreamContents(stream, new String[] { "세종" });
+        }
+    }
+
     public void testNoriTokenizer() throws Exception {
         Settings settings = Settings.builder()
             .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")

+ 48 - 0
server/src/main/java/org/elasticsearch/index/analysis/Analysis.java

@@ -44,6 +44,7 @@ import org.apache.lucene.analysis.sr.SerbianAnalyzer;
 import org.apache.lucene.analysis.sv.SwedishAnalyzer;
 import org.apache.lucene.analysis.th.ThaiAnalyzer;
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
+import org.apache.lucene.analysis.util.CSVUtil;
 import org.elasticsearch.action.support.PlainActionFuture;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.logging.DeprecationCategory;
@@ -64,6 +65,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -257,6 +259,52 @@ public class Analysis {
         }
     }
 
+    public static List<String> getWordList(
+        Environment env,
+        Settings settings,
+        String settingPath,
+        String settingList,
+        boolean removeComments,
+        boolean checkDuplicate
+    ) {
+        final List<String> ruleList = getWordList(env, settings, settingPath, settingList, removeComments);
+        if (ruleList != null && ruleList.isEmpty() == false && checkDuplicate) {
+            checkDuplicateRules(ruleList);
+        }
+        return ruleList;
+    }
+
+    /**
+     * This method checks for any duplicate rules in the provided ruleList. Each rule in the list is parsed with CSVUtil.parse
+     * to separate the rule into individual components, represented as a String array. Only the first component from each rule
+     * is considered in the duplication check.
+     *
+     * The method will ignore any line that starts with a '#' character, treating it as a comment.
+     *
+     * The check is performed by adding the first component of each rule into a HashSet (dup), which does not allow duplicates.
+     * If the addition returns false, the term was already present in the set, indicating a duplicate;
+     * in that case an IllegalArgumentException is thrown naming the duplicate term and its line number in the original list.
+     *
+     * @param ruleList The list of rules to check for duplicates.
+     * @throws IllegalArgumentException If a duplicate rule is found.
+     */
+    private static void checkDuplicateRules(List<String> ruleList) {
+        Set<String> dup = new HashSet<>();
+        int lineNum = 0;
+        for (String line : ruleList) {
+            // ignore comments
+            if (line.startsWith("#") == false) {
+                String[] values = CSVUtil.parse(line);
+                if (dup.add(values[0]) == false) {
+                    throw new IllegalArgumentException(
+                        "Found duplicate term [" + values[0] + "] in user dictionary " + "at line [" + lineNum + "]"
+                    );
+                }
+            }
+            ++lineNum;
+        }
+    }
+
     private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
         final List<String> result = new ArrayList<>();
         try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {