
Add `fingerprint` token filter and `fingerprint` analyzer

Adds a `fingerprint` token filter which uses Lucene's FingerprintFilter,
and a `fingerprint` analyzer that combines the Fingerprint filter with
lowercasing, stop word removal and asciifolding.

Closes #13325
Zachary Tong · 9 years ago · commit 80288ad60c

+ 5 - 1
core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java

@@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
@@ -29,11 +30,14 @@ import org.elasticsearch.index.IndexSettings;
  * Factory for ASCIIFoldingFilter.
  */
 public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
+    public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
+    public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
+
     private final boolean preserveOriginal;
 
     public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
-        preserveOriginal = settings.getAsBoolean("preserve_original", false);
+        preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
     }
 
     @Override
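
To see what `preserve_original` controls in isolation, here is a minimal sketch that drives Lucene's ASCIIFoldingFilter directly (the class name and sample text are illustrative, not part of the change):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("gödel"));
        // preserveOriginal = true emits the folded token and then the
        // original token at the same position
        TokenStream stream = new ASCIIFoldingFilter(tokenizer, true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term);   // prints "godel", then "gödel"
        }
        stream.end();
        stream.close();
    }
}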

+ 2 - 0
core/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java

@@ -258,6 +258,7 @@ public final class AnalysisRegistry implements Closeable {
         tokenFilters.put("apostrophe", ApostropheFilterFactory::new);
         tokenFilters.put("classic", ClassicFilterFactory::new);
         tokenFilters.put("decimal_digit", DecimalDigitFilterFactory::new);
+        tokenFilters.put("fingerprint", FingerprintTokenFilterFactory::new);
     }
 
     private void registerBuiltInAnalyzer(Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider>> analyzers) {
@@ -304,6 +305,7 @@ public final class AnalysisRegistry implements Closeable {
         analyzers.put("swedish", SwedishAnalyzerProvider::new);
         analyzers.put("turkish", TurkishAnalyzerProvider::new);
         analyzers.put("thai", ThaiAnalyzerProvider::new);
+        analyzers.put("fingerprint", FingerprintAnalyzerProvider::new);
     }
 
     private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexSettings settings, Map<String, Settings> settingsMap, Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance) throws IOException {
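
The constructor references registered above are shorthand for `AnalysisModule.AnalysisProvider` lambdas. A rough sketch of what `FingerprintTokenFilterFactory::new` expands to (the `AnalysisModule` import path is an assumption for this era of the codebase):

package org.elasticsearch.index.analysis;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule; // path assumed

public class FingerprintProviderSketch {
    // A provider builds a new factory from the index settings, node
    // environment, filter name, and filter-specific settings.
    static final AnalysisModule.AnalysisProvider<TokenFilterFactory> FINGERPRINT =
        (IndexSettings indexSettings, Environment env, String name, Settings settings) ->
            new FingerprintTokenFilterFactory(indexSettings, env, name, settings);
}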

+ 56 - 0
core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzer.java

@@ -0,0 +1,56 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+/** OpenRefine Fingerprinting, which uses a Standard tokenizer and lowercase + stop + fingerprint + asciifolding filters */
+public final class FingerprintAnalyzer extends Analyzer {
+    private final char separator;
+    private final int maxOutputSize;
+    private final boolean preserveOriginal;
+    private final CharArraySet stopWords;
+
+    public FingerprintAnalyzer(CharArraySet stopWords, char separator, int maxOutputSize, boolean preserveOriginal) {
+        this.separator = separator;
+        this.maxOutputSize = maxOutputSize;
+        this.preserveOriginal = preserveOriginal;
+        this.stopWords = stopWords;
+    }
+
+    @Override
+    protected TokenStreamComponents createComponents(String s) {
+        final Tokenizer tokenizer = new StandardTokenizer();
+        TokenStream stream = tokenizer;
+        stream = new LowerCaseFilter(stream);
+        stream = new StopFilter(stream, stopWords);
+        stream = new FingerprintFilter(stream, maxOutputSize, separator);
+        stream = new ASCIIFoldingFilter(stream, preserveOriginal);
+        return new TokenStreamComponents(tokenizer, stream);
+    }
+}
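
A minimal sketch of the analyzer end to end, reusing the input from the tests below (the class name is illustrative):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.index.analysis.FingerprintAnalyzer;

public class FingerprintAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        FingerprintAnalyzer analyzer =
            new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
        try (TokenStream stream =
                 analyzer.tokenStream("field", "foo bar@baz Baz $ foo foo FOO. FoO")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Lowercased, sorted, deduplicated, concatenated:
                System.out.println(term);   // prints "bar baz foo"
            }
            stream.end();
        }
    }
}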

+ 60 - 0
core/src/main/java/org/elasticsearch/index/analysis/FingerprintAnalyzerProvider.java

@@ -0,0 +1,60 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+
+/**
+ * Builds an OpenRefine Fingerprint analyzer.  Uses the default settings from the various components
+ * (Standard Tokenizer and lowercase + stop + fingerprint + ascii-folding filters)
+ */
+public class FingerprintAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
+
+    public static final ParseField MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.MAX_OUTPUT_SIZE;
+    public static final ParseField PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.PRESERVE_ORIGINAL;
+
+    public static final int DEFAULT_MAX_OUTPUT_SIZE = FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE;
+    public static final boolean DEFAULT_PRESERVE_ORIGINAL = ASCIIFoldingTokenFilterFactory.DEFAULT_PRESERVE_ORIGINAL;
+    public static final CharArraySet DEFAULT_STOP_WORDS = CharArraySet.EMPTY_SET;
+
+    private final FingerprintAnalyzer analyzer;
+
+    public FingerprintAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+
+        char separator = FingerprintTokenFilterFactory.parseSeparator(settings);
+        int maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(), DEFAULT_MAX_OUTPUT_SIZE);
+        boolean preserveOriginal = settings.getAsBoolean(PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL);
+        CharArraySet stopWords = Analysis.parseStopWords(env, settings, DEFAULT_STOP_WORDS);
+
+        this.analyzer = new FingerprintAnalyzer(stopWords, separator, maxOutputSize, preserveOriginal);
+    }
+
+    @Override
+    public FingerprintAnalyzer get() {
+        return analyzer;
+    }
+}

+ 69 - 0
core/src/main/java/org/elasticsearch/index/analysis/FingerprintTokenFilterFactory.java

@@ -0,0 +1,69 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+
+/**
+ * Factory for the `fingerprint` token filter; wraps Lucene's {@link FingerprintFilter}.
+ */
+public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    private final char separator;
+    private final int maxOutputSize;
+
+    public static final ParseField SEPARATOR = new ParseField("separator");
+    public static final ParseField MAX_OUTPUT_SIZE = new ParseField("max_output_size");
+
+    public static final char DEFAULT_SEPARATOR = ' ';
+    public static final int DEFAULT_MAX_OUTPUT_SIZE = 255;
+
+    public FingerprintTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        this.separator = parseSeparator(settings);
+        this.maxOutputSize = settings.getAsInt(MAX_OUTPUT_SIZE.getPreferredName(),
+            FingerprintTokenFilterFactory.DEFAULT_MAX_OUTPUT_SIZE);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        TokenStream result = tokenStream;
+        result = new FingerprintFilter(result, maxOutputSize, separator);
+        return result;
+    }
+
+    public static char parseSeparator(Settings settings) throws IllegalArgumentException {
+        String customSeparator = settings.get(SEPARATOR.getPreferredName());
+        if (customSeparator == null) {
+            return FingerprintTokenFilterFactory.DEFAULT_SEPARATOR;
+        } else if (customSeparator.length() == 1) {
+            return customSeparator.charAt(0);
+        }
+
+        throw new IllegalArgumentException("Setting [separator] must be a single, non-null character. ["
+            + customSeparator + "] was provided.");
+    }
+}
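
The same filter can be exercised directly against Lucene to see the effect of a custom separator; a short sketch with illustrative inputs:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FingerprintFilterDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the quick quick brown fox"));
        // Sort, deduplicate, and join with '_', i.e. what the factory builds
        // when `separator` is "_" and `max_output_size` is left at 255.
        TokenStream stream = new FingerprintFilter(tokenizer, 255, '_');
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term);   // prints "brown_fox_quick_the"
        }
        stream.end();
        stream.close();
    }
}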

+ 68 - 0
core/src/test/java/org/elasticsearch/index/analysis/FingerprintAnalyzerTests.java

@@ -0,0 +1,68 @@
+package org.elasticsearch.index.analysis;
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {
+
+    public void testFingerprint() throws Exception {
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
+        assertAnalyzesTo(a, "foo bar@baz Baz $ foo foo FOO. FoO",
+            new String[]{"bar baz foo"});
+    }
+
+    public void testReusableTokenStream() throws Exception {
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
+        assertAnalyzesTo(a, "foo bar baz Baz foo foo FOO. FoO",
+            new String[]{"bar baz foo"});
+        assertAnalyzesTo(a, "xyz XYZ abc 123.2 abc",
+            new String[]{"123.2 abc xyz"});
+    }
+
+    public void testAsciifolding() throws Exception {
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, false);
+        assertAnalyzesTo(a, "gödel escher bach",
+            new String[]{"bach escher godel"});
+    }
+
+    public void testPreserveOriginal() throws Exception {
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 255, true);
+        assertAnalyzesTo(a, "gödel escher bach",
+            new String[]{"bach escher godel", "bach escher gödel"});
+    }
+
+    public void testLimit() throws Exception {
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, ' ', 3, false);
+        assertAnalyzesTo(a, "e d c b a",
+            new String[]{});
+
+        assertAnalyzesTo(a, "b a",
+            new String[]{"a b"});
+    }
+
+    public void testSeparator() throws Exception {
+        Analyzer a = new FingerprintAnalyzer(CharArraySet.EMPTY_SET, '_', 255, true);
+        assertAnalyzesTo(a, "b c a",
+            new String[]{"a_b_c"});
+    }
+}

+ 2 - 0
docs/reference/analysis/analyzers.asciidoc

@@ -79,5 +79,7 @@ include::analyzers/lang-analyzer.asciidoc[]
 
 include::analyzers/snowball-analyzer.asciidoc[]
 
+include::analyzers/fingerprint-analyzer.asciidoc[]
+
 include::analyzers/custom-analyzer.asciidoc[]
 

+ 41 - 0
docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc

@@ -0,0 +1,41 @@
+[[analysis-fingerprint-analyzer]]
+=== Fingerprint Analyzer
+
+The `fingerprint` analyzer implements a
+https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth#fingerprint[fingerprinting algorithm]
+which is used by the OpenRefine project to assist in clustering.
+
+The `fingerprint` analyzer is composed of a <<analysis-standard-tokenizer>> and four
+token filters (in this order): <<analysis-lowercase-tokenfilter>>, <<analysis-stop-tokenfilter>>,
+<<analysis-fingerprint-tokenfilter>> and <<analysis-asciifolding-tokenfilter>>.
+
+Input text is lowercased and, if a stopword list is configured, stop words are removed.
+The remaining tokens are sorted, deduplicated and concatenated into a single token,
+and extended characters are then normalized to their ASCII form. For example, the sentence:
+
+____
+"Yes yes, Gödel said this sentence is consistent and."
+____
+
+will be transformed into the token: `"and consistent godel is said sentence this yes"`
+
+
+Notice how the words are all lowercased, the umlaut in "gödel" has been normalized to "godel",
+punctuation has been removed, and "yes" has been de-duplicated.
+
+The `fingerprint` analyzer has these configurable settings:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Setting |Description
+|`separator` | The character that separates the tokens after concatenation.
+Defaults to a space.
+|`max_output_size` | The maximum token size to emit. Defaults to `255`. See <<analysis-fingerprint-tokenfilter-max-size>>.
+|`preserve_original`| If true, emits both the original and folded version of
+ tokens that contain extended characters.  Defaults to `false`.
+|`stopwords` | A list of stop words to use. Defaults to an empty list (`_none_`).
+|`stopwords_path` | A path (either relative to the `config` location, or absolute) to a stopwords
+                        file. Each stop word should be on its own line (separated
+                        by a line break). The file must be UTF-8 encoded.
+|=======================================================================
+
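
The settings in the table map one-to-one onto the `FingerprintAnalyzer` constructor. A sketch of the equivalent programmatic construction, with illustrative values (separator `_`, stop words `and`, `is`, `this`):

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.index.analysis.FingerprintAnalyzer;

public class ConfiguredFingerprintDemo {
    public static void main(String[] args) throws IOException {
        CharArraySet stopWords = new CharArraySet(Arrays.asList("and", "is", "this"), true);
        // stopwords, separator, max_output_size, preserve_original
        FingerprintAnalyzer analyzer = new FingerprintAnalyzer(stopWords, '_', 255, false);
        try (TokenStream stream = analyzer.tokenStream("field",
                "Yes yes, Gödel said this sentence is consistent and.")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term);   // prints "consistent_godel_said_sentence_yes"
            }
            stream.end();
        }
    }
}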

+ 2 - 0
docs/reference/analysis/tokenfilters.asciidoc

@@ -86,3 +86,5 @@ include::tokenfilters/classic-tokenfilter.asciidoc[]
 include::tokenfilters/apostrophe-tokenfilter.asciidoc[]
 
 include::tokenfilters/decimal-digit-tokenfilter.asciidoc[]
+
+include::tokenfilters/fingerprint-tokenfilter.asciidoc[]

+ 28 - 0
docs/reference/analysis/tokenfilters/fingerprint-tokenfilter.asciidoc

@@ -0,0 +1,28 @@
+[[analysis-fingerprint-tokenfilter]]
+=== Fingerprint Token Filter
+
+The `fingerprint` token filter emits a single token which is useful for fingerprinting
+a body of text, or for providing a token that can be clustered on.  It does this by
+sorting the tokens, deduplicating them, and then concatenating them back into a single token.
+
+For example, the tokens `["the", "quick", "quick", "brown", "fox", "was", "very", "brown"]` will be
+transformed into a single token: `"brown fox quick the very was"`.  Notice how the tokens were sorted
+alphabetically, and there is only one `"quick"`.
+
+The following are settings that can be set for a `fingerprint` token
+filter type:
+
+[cols="<,<",options="header",]
+|======================================================
+|Setting |Description
+|`separator` |The character used to join the sorted tokens. Defaults to a space.
+|`max_output_size` |The maximum token size to emit. Defaults to `255`.
+|======================================================
+
+[[analysis-fingerprint-tokenfilter-max-size]]
+==== Maximum token size
+
+Because a field may have many unique tokens, it is important to set a cutoff so that fields do not grow
+too large.  The `max_output_size` setting controls this behavior.  If the concatenated fingerprint
+grows larger than `max_output_size`, the token filter will not emit a token (i.e. the
+field will be empty).
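
A short sketch of the cutoff behavior, mirroring the `testLimit` case in the test file above (class and method names are illustrative):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class MaxOutputSizeDemo {
    static void fingerprint(String text) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader(text));
        // max_output_size of 3: fingerprints longer than 3 chars are dropped
        TokenStream stream = new FingerprintFilter(tokenizer, 3, ' ');
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println("emitted: " + term);
        }
        stream.end();
        stream.close();
    }

    public static void main(String[] args) throws IOException {
        fingerprint("e d c b a");   // "a b c d e" exceeds the cutoff: no token
        fingerprint("b a");         // "a b" fits: emitted
    }
}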