1
0
Эх сурвалжийг харах

Expose FlattenGraphTokenFilter (#22643)

FlattenGraphTokenFilter is necessary for using graph-based token streams (e.g. the new SynonymGraphFilter) during indexing.
Michael McCandless 8 жил өмнө
parent
commit
ebd38e2a6a

+ 38 - 0
core/src/main/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactory.java

@@ -0,0 +1,38 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.FlattenGraphFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+public class FlattenGraphTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    public FlattenGraphTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new FlattenGraphFilter(tokenStream);
+    }
+}

+ 73 - 0
core/src/test/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactoryTests.java

@@ -0,0 +1,73 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+public class FlattenGraphTokenFilterFactoryTests extends ESTokenStreamTestCase {
+
+    public void testBasic() throws IOException {
+
+        Index index = new Index("test", "_na_");
+        String name = "ngr";
+        Settings indexSettings = newAnalysisSettingsBuilder().build();
+        IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
+        Settings settings = newAnalysisSettingsBuilder().build();
+
+        // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
+        TokenStream in = new CannedTokenStream(0, 12, new Token[] {
+                    token("wtf", 1, 5, 0, 3),
+                    token("what", 0, 1, 0, 3),
+                    token("wow", 0, 3, 0, 3),
+                    token("the", 1, 1, 0, 3),
+                    token("fudge", 1, 3, 0, 3),
+                    token("that's", 1, 1, 0, 3),
+                    token("funny", 1, 1, 0, 3),
+                    token("happened", 1, 1, 4, 12)
+                });
+
+        TokenStream tokens = new FlattenGraphTokenFilterFactory(indexProperties, null, name, settings).create(in);
+
+        // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
+        assertTokenStreamContents(tokens,
+                new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
+                new int[] {0, 0, 0, 0, 0, 0, 0, 4},
+                new int[] {3, 3, 3, 3, 3, 3, 3, 12},
+                new int[] {1, 0, 0, 1, 0, 1, 0, 1},
+                new int[] {3, 1, 1, 1, 1, 1, 1, 1},
+                12);
+    }
+
+    private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
+        final Token t = new Token(term, startOffset, endOffset);
+        t.setPositionIncrement(posInc);
+        t.setPositionLength(posLength);
+        return t;
+    }
+}

+ 2 - 2
test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java

@@ -42,6 +42,7 @@ import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
 import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
+import org.elasticsearch.index.analysis.FlattenGraphTokenFilterFactory;
 import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
@@ -248,6 +249,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
         .put("type",                      KeepTypesFilterFactory.class)
         .put("uppercase",                 UpperCaseTokenFilterFactory.class)
         .put("worddelimiter",             WordDelimiterTokenFilterFactory.class)
+        .put("flattengraph",              FlattenGraphTokenFilterFactory.class)
 
         // TODO: these tokenfilters are not yet exposed: useful?
 
@@ -277,8 +279,6 @@ public class AnalysisFactoryTestCase extends ESTestCase {
         .put("fingerprint",               Void.class)
         // for tee-sinks
         .put("daterecognizer",            Void.class)
-        // to flatten graphs created by the synonym graph filter
-        .put("flattengraph",              Void.class)
 
         .immutableMap();