Procházet zdrojové kódy

Remove the preserve_original option from the FingerprintAnalyzer (#18471)

The preserve_original option of the ASCIIFoldingFilter doesn't
play well with the FingerprintFilter: both the folded and the
original form of a token end up in the output, producing
fingerprints like:

    "and consistent godel gödel is said sentence this yes"

The goal of the OpenRefine algorithm is to produce a small normalized
ASCII fingerprint. There's no need to expose preserve_original.
Clinton Gormley před 9 roky
rodič
revize
dc33a83231

+ 2 - 4
core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java

@@ -33,13 +33,11 @@ import org.apache.lucene.analysis.util.CharArraySet;
 public final class FingerprintAnalyzer extends Analyzer {
     private final char separator;
     private final int maxOutputSize;
-    private final boolean preserveOriginal;
     private final CharArraySet stopWords;
 
-    public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) {
+    public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize) {
         this.separator = separator;
         this.maxOutputSize = maxOutputSize;
-        this.preserveOriginal = preserveOriginal;
         this.stopWords = stopWords;
     }
 
@@ -48,7 +46,7 @@ public final class FingerprintAnalyzer extends Analyzer {
         final Tokenizer tokenizer = new StandardTokenizer();
         TokenStream stream = tokenizer;
         stream = new LowerCaseFilter(stream);
-        stream = new ASCIIFoldingFilter(stream, preserveOriginal);
+        stream = new ASCIIFoldingFilter(stream, false);
         stream = new StopFilter(stream, stopWords);
         stream = new FingerprintFilter(stream, maxOutputSize, separator);
         return new TokenStreamComponents(tokenizer, stream);

+ 1 - 4
core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java

@@ -34,10 +34,8 @@ import org.elasticsearch.index.IndexSettings;
 public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
 
     public static ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
-    public static ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;
 
     public static int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
-    public static boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
     public static CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
 
     private final FingerprintAnalyzer analyzer;
@@ -47,10 +45,9 @@ public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<A
 
         char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
         int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),DEFAULT_MAX_OUTPUT_SIZE);
-        boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
         CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);
 
-        this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal);
+        this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize);
     }
 
     @Override

+ 4 - 16
core/src/test/java/org/elasticsearch/index/analysis/FingerprintAnalyzerTests.java

@@ -26,13 +26,13 @@ import org.elasticsearch.test.ESTokenStreamTestCase;
 public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
 
     public void testFingerprint() throws Exception {
-        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
         assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
             new String[]{"bar baz foo"});
     }
 
     public void testReusableTokenStream() throws Exception {
-        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
         assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
             new String[]{"bar baz foo"});
         assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
@@ -40,7 +40,7 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
     }
 
     public void testAsciifolding() throws Exception {
-        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255);
         assertAnalyzesTo(a, "gödel escher bach",
             new String[]{"bach escher godel"});
 
@@ -48,14 +48,8 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
             new String[]{"bach escher godel"});
     }
 
-    public void testPreserveOriginal() throws Exception {
-        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
-        assertAnalyzesTo(a, "gödel escher bach",
-            new String[]{"bach escher godel gödel"});
-    }
-
     public void testLimit() throws Exception {
-        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false);
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3);
         assertAnalyzesTo(a, "e d c b a",
             new String[]{});
 
@@ -63,10 +57,4 @@ public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
             new String[]{"a b"});
     }
 
-    public void testSeparator() throws Exception {
-        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
-        assertAnalyzesTo(a, "b c a",
-            new String[]{"a_b_c"});
-    }
-
 }