Browse Source

Move AsciiFolding earlier in FingerprintAnalyzer filter chain

Rearranges the FingerprintAnalyzer filter chain so that AsciiFolding runs earlier: after lowercasing but before stop-word removal, for maximum deduplication power.

Closes #18266
Zachary Tong 9 years ago
parent
commit
5ee5cc25cc

+ 1 - 1
core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java

@@ -48,9 +48,9 @@ public final class FingerprintAnalyzer extends Analyzer {
         final Tokenizer tokenizer = new StandardTokenizer();
         TokenStream stream = tokenizer;
         stream = new LowerCaseFilter(stream);
+        stream = new ASCIIFoldingFilter(stream, preserveOriginal);
         stream = new StopFilter(stream, stopWords);
         stream = new FingerprintFilter(stream, maxOutputSize, separator);
-        stream = new ASCIIFoldingFilter(stream, preserveOriginal);
         return new TokenStreamComponents(tokenizer, stream);
     }
 }

+ 5 - 1
core/src/test/java/org/elasticsearch/index/analysis/FingerprintAnalyzerTests.java

@@ -43,12 +43,15 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
         Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
         assertAnalyzesTo(a, "gödel escher bach",
             new String[]{"bach escher godel"});
+
+        assertAnalyzesTo(a, "gödel godel escher bach",
+            new String[]{"bach escher godel"});
     }
 
     public void testPreserveOriginal() throws Exception {
         Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
         assertAnalyzesTo(a, "gödel escher bach",
-            new String[]{"bach escher godel", "bach escher gödel"});
+            new String[]{"bach escher godel gödel"});
     }
 
     public void testLimit() throws Exception {
@@ -65,4 +68,5 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
         assertAnalyzesTo(a, "b c a",
             new String[]{"a_b_c"});
     }
+
 }

+ 5 - 6
docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc

@@ -17,11 +17,11 @@ It consists of:
 Tokenizer::
 * <<analysis-standard-tokenizer,Standard Tokenizer>>
 
-Token Filters::
-* <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
-* <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
-* <<analysis-fingerprint-tokenfilter>>
-* <<analysis-asciifolding-tokenfilter>>
+Token Filters (in order)::
+1. <<analysis-lowercase-tokenfilter,Lower Case Token Filter>>
+2. <<analysis-asciifolding-tokenfilter>>
+3. <<analysis-stop-tokenfilter,Stop Token Filter>> (disabled by default)
+4. <<analysis-fingerprint-tokenfilter>>
 
 [float]
 === Example output
@@ -68,7 +68,6 @@ The `fingerprint` analyzer accepts the following parameters:
 
     A pre-defined stop words list like `_english_` or an array  containing a
     list of stop words.  Defaults to `_none_`.
-
 `stopwords_path`::
 
     The path to a file containing stop words.