Browse Source

Add exclusion option to `keep_types` token filter (#32012)

Currently the `keep_types` token filter includes all token types specified using
its `types` parameter. Lucene's TypeTokenFilter also provides a second mode where,
instead of keeping the specified tokens (include), they are filtered out
(exclude). This change exposes this option as a new `mode` parameter that can
take either the value `include` (the default, if not specified) or `exclude`.

Closes #29277
Christoph Büscher 7 years ago
parent
commit
61486680a2

+ 71 - 3
docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc

@@ -8,8 +8,9 @@ contained in a predefined set.
 [float]
 === Options
 [horizontal]
-types:: a list of types to keep
-
+types:: a list of types to include (default mode) or exclude
+mode:: if set to `include` (default), the specified token types will be kept;
+if set to `exclude`, the specified token types will be removed from the stream
 
 [float]
 === Settings example
@@ -53,7 +54,7 @@ POST /keep_types_example/_analyze
 // CONSOLE
 // TEST[continued]
 
-And it'd respond:
+The response will be:
 
 [source,js]
 --------------------------------------------------
@@ -72,3 +73,70 @@ And it'd respond:
 // TESTRESPONSE
 
 Note how only the `<NUM>` token is in the output.
+
+=== Exclude mode settings example
+
+Setting the `mode` parameter to `exclude`, as in the following example, removes the specified token types from the stream:
+
+[source,js]
+--------------------------------------------------
+PUT /keep_types_exclude_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_analyzer" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "lowercase", "remove_numbers"]
+                }
+            },
+            "filter" : {
+                "remove_numbers" : {
+                    "type" : "keep_types",
+                    "mode" : "exclude",
+                    "types" : [ "<NUM>" ]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+
+You can then test it as follows:
+
+[source,js]
+--------------------------------------------------
+POST /keep_types_exclude_example/_analyze
+{
+  "analyzer" : "my_analyzer",
+  "text" : "hello 101 world"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+The response will be:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "hello",
+      "start_offset": 0,
+      "end_offset": 5,
+      "type": "<ALPHANUM>",
+      "position": 0
+    }, 
+    {
+      "token": "world",
+      "start_offset": 10,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 2
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE

+ 31 - 5
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/KeepTypesFilterFactory.java

@@ -29,21 +29,47 @@ import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Set;
 
 /**
  * A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only
  * keep tokens that are contained in the set configured via
- * {@value #KEEP_TYPES_KEY} setting.
+ * {@value #KEEP_TYPES_KEY} setting, or drops them when {@value #KEEP_TYPES_MODE_KEY} is set to "exclude".
  * <p>
  * Configuration options:
  * <ul>
- * <li>{@value #KEEP_TYPES_KEY} the array of words / tokens to keep.</li>
+ * <li>{@value #KEEP_TYPES_KEY} the array of token types to keep or discard.</li>
+ * <li>{@value #KEEP_TYPES_MODE_KEY} whether to keep ("include") or discard
+ * ("exclude") the specified token types.</li>
  * </ul>
  */
 public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
     private final Set<String> keepTypes;
-    private static final String KEEP_TYPES_KEY = "types";
+    private final KeepTypesMode includeMode;
+    static final String KEEP_TYPES_KEY = "types";
+    static final String KEEP_TYPES_MODE_KEY = "mode";
+
+    enum KeepTypesMode {
+        INCLUDE, EXCLUDE;
+
+        @Override
+        public String toString() {
+            return this.name().toLowerCase(Locale.ROOT);
+        }
+
+        private static KeepTypesMode fromString(String modeString) {
+            String lc = modeString.toLowerCase(Locale.ROOT);
+            if (lc.equals("include")) {
+                return INCLUDE;
+            } else if (lc.equals("exclude")) {
+                return EXCLUDE;
+            } else {
+                throw new IllegalArgumentException("`keep_types` tokenfilter mode can only be [" + KeepTypesMode.INCLUDE + "] or ["
+                        + KeepTypesMode.EXCLUDE + "] but was [" + modeString + "].");
+            }
+        }
+    };
 
     KeepTypesFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
@@ -52,12 +78,12 @@ public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
         if ((arrayKeepTypes == null)) {
             throw new IllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
         }
-
+        this.includeMode = KeepTypesMode.fromString(settings.get(KEEP_TYPES_MODE_KEY, "include"));
         this.keepTypes = new HashSet<>(arrayKeepTypes);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new TypeTokenFilter(tokenStream, keepTypes, true);
+        return new TypeTokenFilter(tokenStream, keepTypes, includeMode == KeepTypesMode.INCLUDE);
     }
 }

+ 40 - 8
modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/KeepTypesFilterFactoryTests.java

@@ -34,19 +34,51 @@ import java.io.StringReader;
 import static org.hamcrest.Matchers.instanceOf;
 
 public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
-    public void testKeepTypes() throws IOException {
-        Settings settings = Settings.builder()
-                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-                .put("index.analysis.filter.keep_numbers.type", "keep_types")
-                .putList("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
-                .build();
+
+    private static final String BASE_SETTING = "index.analysis.filter.keep_numbers";
+
+    public void testKeepTypesInclude() throws IOException {
+        Settings.Builder settingsBuilder = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put(BASE_SETTING + ".type", "keep_types")
+                .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" });
+        // either use default mode or set "include" mode explicitly
+        if (random().nextBoolean()) {
+            settingsBuilder.put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY,
+                    KeepTypesFilterFactory.KeepTypesMode.INCLUDE);
+        }
+        Settings settings = settingsBuilder.build();
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
+        assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
+        String source = "Hello 123 world";
+        String[] expected = new String[] { "123" };
+        Tokenizer tokenizer = new StandardTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 2 });
+    }
+
+    public void testKeepTypesExclude() throws IOException {
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put(BASE_SETTING + ".type", "keep_types")
+                .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
+                .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, KeepTypesFilterFactory.KeepTypesMode.EXCLUDE).build();
         ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
         assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
         String source = "Hello 123 world";
-        String[] expected = new String[]{"123"};
+        String[] expected = new String[] { "Hello", "world" };
         Tokenizer tokenizer = new StandardTokenizer();
         tokenizer.setReader(new StringReader(source));
-        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2});
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[] { 1, 2 });
+    }
+
+    public void testKeepTypesException() throws IOException {
+        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                .put(BASE_SETTING + ".type", "keep_types")
+                .putList(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_KEY, new String[] { "<NUM>", "<SOMETHINGELSE>" })
+                .put(BASE_SETTING + "." + KeepTypesFilterFactory.KEEP_TYPES_MODE_KEY, "bad_parameter").build();
+        IllegalArgumentException ex = expectThrows(IllegalArgumentException.class,
+                () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin()));
+        assertEquals("`keep_types` tokenfilter mode can only be [include] or [exclude] but was [bad_parameter].", ex.getMessage());
     }
 }