Browse Source

Add support for pattern replace filter in normalizers (#96588)

This change adds support for using `pattern_replace` token filters in custom normalizers.

Closes #83005
Marantidis Kiriakos 2 years ago
parent
commit
a8cf4d6006

+ 5 - 0
docs/changelog/96588.yaml

@@ -0,0 +1,5 @@
+pr: 96588
+summary: Support for pattern_replace filter in keyword normalizer
+area: Search
+type: enhancement
+issues: []

+ 1 - 1
docs/reference/analysis/normalizers.asciidoc

@@ -9,7 +9,7 @@ allowed, but not a stemming filter, which needs to look at the keyword as a
 whole. The current list of filters that can be used in a normalizer is as
 follows: `arabic_normalization`, `asciifolding`, `bengali_normalization`,
 `cjk_width`, `decimal_digit`, `elision`, `german_normalization`,
-`hindi_normalization`, `indic_normalization`, `lowercase`,
+`hindi_normalization`, `indic_normalization`, `lowercase`, `pattern_replace`,
 `persian_normalization`, `scandinavian_folding`, `serbian_normalization`,
 `sorani_normalization`, `uppercase`.
 

+ 2 - 1
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PatternReplaceTokenFilterFactory.java

@@ -15,10 +15,11 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.NormalizingTokenFilterFactory;
 
 import java.util.regex.Pattern;
 
-public class PatternReplaceTokenFilterFactory extends AbstractTokenFilterFactory {
+public class PatternReplaceTokenFilterFactory extends AbstractTokenFilterFactory implements NormalizingTokenFilterFactory {
 
     private final Pattern pattern;
     private final String replacement;

+ 41 - 0
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/PatternReplaceTokenFilterTests.java

@@ -0,0 +1,41 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+import java.io.IOException;
+
+public class PatternReplaceTokenFilterTests extends ESTokenStreamTestCase { // covers a pattern_replace filter used inside a custom normalizer
+
+    public void testNormalizer() throws IOException { // builds "my_normalizer" from settings and checks what it produces for "0000111"
+        Settings settings = Settings.builder()
+            .putList("index.analysis.normalizer.my_normalizer.filter", "replace_zeros") // the normalizer's only filter is "replace_zeros"
+            .put("index.analysis.filter.replace_zeros.type", "pattern_replace")
+            .put("index.analysis.filter.replace_zeros.pattern", "0+") // regex: one or more consecutive '0' characters
+            .put("index.analysis.filter.replace_zeros.replacement", "") // matched text is replaced by the empty string
+            .put("index.analysis.filter.replace_zeros.all", true) // presumably replaces every match, not just the first -- TODO confirm
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) // home path setting points at a fresh temp directory
+            .build();
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        assertNull(analysis.indexAnalyzers.get("my_normalizer")); // the name must not resolve through the plain analyzer lookup
+        NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer"); // it is expected via the normalizer lookup instead
+        assertNotNull(normalizer);
+        assertEquals("my_normalizer", normalizer.name());
+        assertTokenStreamContents(normalizer.tokenStream("foo", "0000111"), new String[] { "111" }); // single token with the zeros removed
+        assertEquals(new BytesRef("111"), normalizer.normalize("foo", "0000111")); // normalize() must agree with the token stream output
+    }
+
+}

+ 41 - 0
modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml

@@ -1683,3 +1683,44 @@
     - length: { tokens: 6 }
     - match: { tokens.0.token: the }
     - match: { tokens.1.token: THE }
+
+---
+"pattern_replace_filter": # REST test: a pattern_replace filter used by a custom normalizer on a keyword field
+  - do:
+      indices.create:
+        index: test
+        body:
+          settings:
+            analysis:
+              normalizer:
+                my_normalizer: # custom normalizer whose only filter is replace_zeros
+                  type: custom
+                  filter: ["replace_zeros"]
+              filter:
+                replace_zeros:
+                  type: pattern_replace
+                  pattern: "0+" # regex: one or more consecutive '0' characters
+                  replacement: "" # matched text is replaced by the empty string
+                  all: true # presumably replaces every match, not just the first -- TODO confirm
+          mappings:
+            properties:
+              pagerank: # keyword field whose values go through my_normalizer
+                type: keyword
+                normalizer: my_normalizer
+
+  - do:
+      index:
+        index:  test
+        id:     "1"
+        body:   { pagerank: "000000111"} # value consisting of leading zeros followed by 111
+
+  - do:
+      indices.refresh: # refresh the index before searching
+        index: [ test ]
+
+  - do:
+      search:
+        index: test
+        q: pagerank:111 # query uses the zero-stripped form of the indexed value
+
+  - match: {hits.total.value: 1} # exactly one hit is expected