Explorar o código

Analysis: Add keep_types for filtering by token type

Robert Muir hai 11 anos
pai
achega
5c7cefa292

+ 2 - 0
docs/reference/analysis/tokenfilters.asciidoc

@@ -79,6 +79,8 @@ include::tokenfilters/delimited-payload-tokenfilter.asciidoc[]
 
 include::tokenfilters/keep-words-tokenfilter.asciidoc[]
 
+include::tokenfilters/keep-types-tokenfilter.asciidoc[]
+
 include::tokenfilters/classic-tokenfilter.asciidoc[]
 
 include::tokenfilters/apostrophe-tokenfilter.asciidoc[]

+ 39 - 0
docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc

@@ -0,0 +1,39 @@
+[[analysis-keep-types-tokenfilter]]
+=== Keep Types Token Filter
+
+coming[1.4.0]
+
+A token filter of type `keep_types` that only keeps tokens with a token type 
+contained in a predefined set.
+
+
+[float]
+=== Options
+[horizontal]
+types:: a list of types to keep
+
+
+[float]
+=== Settings example
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_analyzer" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "lowercase", "extract_numbers"]
+                }
+            },
+            "filter" : {
+                "extract_numbers" : {
+                    "type" : "keep_types",
+                    "types" : [ "<NUM>" ]
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------

+ 1 - 0
src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java

@@ -485,6 +485,7 @@ public class AnalysisModule extends AbstractModule {
             tokenFiltersBindings.processTokenFilter("synonym", SynonymTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("elision", ElisionTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("keep", KeepWordFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("keep_types", KeepTypesFilterFactory.class);
 
             tokenFiltersBindings.processTokenFilter("pattern_capture", PatternCaptureGroupTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("pattern_replace", PatternReplaceTokenFilterFactory.class);

+ 69 - 0
src/main/java/org/elasticsearch/index/analysis/KeepTypesFilterFactory.java

@@ -0,0 +1,69 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.TypeTokenFilter;
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * A {@link TokenFilterFactory} for {@link TypeTokenFilter}. This filter only
+ * keeps tokens whose type is contained in the set configured via the
+ * {@value #KEEP_TYPES_KEY} setting.
+ * <p/>
+ * Configuration options:
+ * <p/>
+ * <ul>
+ * <li>{@value #KEEP_TYPES_KEY} the array of token types to keep. This
+ * setting is required: the constructor throws an
+ * {@link ElasticsearchIllegalArgumentException} when it is not configured.</li>
+ * </ul>
+ */
+@AnalysisSettingsRequired
+public class KeepTypesFilterFactory extends AbstractTokenFilterFactory {
+    private final Set<String> keepTypes; // token types handed to TypeTokenFilter in create()
+    private static final String KEEP_TYPES_KEY = "types"; // name of the filter setting that lists the types
+
+    @Inject
+    public KeepTypesFilterFactory(Index index, @IndexSettings Settings indexSettings,
+                                 Environment env, @Assisted String name, @Assisted Settings settings) { // env is injected but not read in this constructor
+        super(index, indexSettings, name, settings);
+
+        final String[] arrayKeepTypes = settings.getAsArray(KEEP_TYPES_KEY, null); // null default so a missing setting can be detected below
+        if ((arrayKeepTypes == null)) {
+            throw new ElasticsearchIllegalArgumentException("keep_types requires `" + KEEP_TYPES_KEY + "` to be configured");
+        }
+
+        this.keepTypes = new HashSet<>(Arrays.asList(arrayKeepTypes)); // copy into a set; duplicates in the configured array collapse
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new TypeTokenFilter(version, tokenStream, keepTypes, true); // final 'true': the listed types are kept rather than removed
+    }
+}

+ 1 - 2
src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java

@@ -147,6 +147,7 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase {
         put("trim",                      TrimTokenFilterFactory.class);
         put("truncate",                  TruncateTokenFilterFactory.class);
         put("turkishlowercase",          LowerCaseTokenFilterFactory.class);
+        put("type",                      KeepTypesFilterFactory.class);
         put("uppercase",                 UpperCaseTokenFilterFactory.class);
         put("worddelimiter",             WordDelimiterTokenFilterFactory.class);
                 
@@ -168,8 +169,6 @@ public class AnalysisFactoryTests extends ElasticsearchTestCase {
         put("removeduplicates",          Void.class);
         // ???
         put("tokenoffsetpayload",        Void.class);
-        // like a stop filter but by token-type
-        put("type",                      Void.class);
         // puts the type into the payload
         put("typeaspayload",             Void.class);
     }};

+ 50 - 0
src/test/java/org/elasticsearch/index/analysis/KeepTypesFilterFactoryTests.java

@@ -0,0 +1,50 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.test.ElasticsearchTokenStreamTestCase;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import static org.hamcrest.Matchers.instanceOf;
+
+public class KeepTypesFilterFactoryTests extends ElasticsearchTokenStreamTestCase {
+
+    @Test
+    public void testKeepTypes() throws IOException { // checks that a keep_types filter configured with <NUM> passes only the numeric token
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .put("index.analysis.filter.keep_numbers.type", "keep_types")
+                .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"}) // no token in the source below is expected to carry the second type
+                .build();
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("keep_numbers");
+        assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class)); // the "keep_types" name must resolve to this factory
+        String source = "Hello 123 world";
+        String[] expected = new String[]{"123"}; // "Hello" and "world" are expected to be dropped
+        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(source));
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2}); // NOTE(review): {2} is presumably the position increment of "123" after the removed "Hello" - confirm against the test base class
+    }
+}