Browse Source

AsciiFoldingFilter's multi-term component should never preserve the original token. (#21982)

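This ports the fix for https://issues.apache.org/jira/browse/LUCENE-7536 to
Elasticsearch's ASCIIFoldingTokenFilterFactory. When preserveOriginal is enabled,
getMultiTermComponent() now returns a separate factory that folds without
preserving the original token, so the multi-term component emits a single token.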
Adrien Grand 9 years ago
parent
commit
26cbda41ea

+ 15 - 1
core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java

@@ -47,6 +47,20 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory i
 
     @Override
     public Object getMultiTermComponent() {
-        return this;
+        if (preserveOriginal == false) {
+            return this;
+        } else {
+            // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
+            return new TokenFilterFactory() {
+                @Override
+                public String name() {
+                    return ASCIIFoldingTokenFilterFactory.this.name();
+                }
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    return new ASCIIFoldingFilter(tokenStream, false);
+                }
+            };
+        }
     }
 }

+ 7 - 0
core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java

@@ -55,5 +55,12 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
+
+        // but the multi-term aware component still emits a single token
+        tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
+        tokenizer = new WhitespaceTokenizer();
+        tokenizer.setReader(new StringReader(source));
+        expected = new String[]{"Anspruche"};
+        assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
     }
 }