Browse Source

Adds boolean similarity to Elasticsearch (#23637)

This commit adds the boolean similarity scoring from Lucene to
Elasticsearch.  The boolean similarity provides a means to specify that
a field should not be scored with typical full-text ranking algorithms,
but rather just whether the query terms match the document or not.
Boolean similarity scores a query term equal to its query boost only.
Boolean similarity is available as a default similarity option, so
a field can be given boolean similarity by declaring the following in
its mapping:
    "similarity": "boolean"

Closes #6731
Ali Beyad 8 years ago
parent
commit
8359dd05c9

+ 48 - 0
core/src/main/java/org/elasticsearch/index/similarity/BooleanSimilarityProvider.java

@@ -0,0 +1,48 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.similarity;
+
+import org.apache.lucene.search.similarities.BooleanSimilarity;
+import org.elasticsearch.common.settings.Settings;
+
+/**
+ * {@link SimilarityProvider} for the {@link BooleanSimilarity},
+ * which is a simple similarity that gives terms a score equal
+ * to their query boost only.  This is useful in situations where
+ * a field does not need to be scored by a full-text ranking
+ * algorithm, but rather all that matters is whether the query
+ * terms matched or not.
+ */
+public class BooleanSimilarityProvider extends AbstractSimilarityProvider {
+
+    /** The single similarity instance handed out by {@link #get()}. */
+    private final BooleanSimilarity booleanSimilarity;
+
+    /**
+     * Creates a provider registered under the given name.
+     *
+     * @param name          the name forwarded to the superclass constructor
+     * @param settings      not read by this provider
+     * @param indexSettings not read by this provider
+     */
+    public BooleanSimilarityProvider(String name, Settings settings, Settings indexSettings) {
+        super(name);
+        this.booleanSimilarity = new BooleanSimilarity();
+    }
+
+    /**
+     * Returns the {@link BooleanSimilarity} created when this provider was constructed.
+     *
+     * @return the same instance on every call
+     */
+    @Override
+    public BooleanSimilarity get() {
+        return this.booleanSimilarity;
+    }
+}

+ 1 - 0
core/src/main/java/org/elasticsearch/index/similarity/SimilarityService.java

@@ -47,6 +47,7 @@ public final class SimilarityService extends AbstractIndexComponent {
         Map<String, TriFunction<String, Settings, Settings, SimilarityProvider>> buildIn = new HashMap<>();
         defaults.put("classic", ClassicSimilarityProvider::new);
         defaults.put("BM25", BM25SimilarityProvider::new);
+        defaults.put("boolean", BooleanSimilarityProvider::new);
         buildIn.put("classic", ClassicSimilarityProvider::new);
         buildIn.put("BM25", BM25SimilarityProvider::new);
         buildIn.put("DFR", DFRSimilarityProvider::new);

+ 17 - 0
core/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java

@@ -19,6 +19,7 @@
 
 package org.elasticsearch.index.similarity;
 
+import org.apache.lucene.search.similarities.BooleanSimilarity;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.DFISimilarity;
 import org.apache.lucene.search.similarities.AfterEffectL;
@@ -64,6 +65,7 @@ public class SimilarityTests extends ESSingleNodeTestCase {
         SimilarityService similarityService = createIndex("foo").similarityService();
         assertThat(similarityService.getSimilarity("classic").get(), instanceOf(ClassicSimilarity.class));
         assertThat(similarityService.getSimilarity("BM25").get(), instanceOf(BM25Similarity.class));
+        assertThat(similarityService.getSimilarity("boolean").get(), instanceOf(BooleanSimilarity.class));
         assertThat(similarityService.getSimilarity("default"), equalTo(null));
     }
 
@@ -109,6 +111,21 @@ public class SimilarityTests extends ESSingleNodeTestCase {
         assertThat(similarity.getDiscountOverlaps(), equalTo(false));
     }
 
+    /**
+     * Checks that a text field whose mapping declares {@code "similarity": "boolean"}
+     * is resolved to a {@link BooleanSimilarityProvider}.
+     */
+    public void testResolveSimilaritiesFromMapping_boolean() throws IOException {
+        // Index created with empty settings: no custom similarity is configured here.
+        IndexService fooIndex = createIndex("foo", Settings.EMPTY);
+
+        // Mapping for type "type" with one text field that opts into the boolean similarity.
+        String mappingJson = XContentFactory.jsonBuilder()
+            .startObject()
+                .startObject("type")
+                    .startObject("properties")
+                        .startObject("field1").field("type", "text").field("similarity", "boolean").endObject()
+                    .endObject()
+                .endObject()
+            .endObject()
+            .string();
+
+        DocumentMapper parsedMapper =
+            fooIndex.mapperService().documentMapperParser().parse("type", new CompressedXContent(mappingJson));
+
+        SimilarityProvider resolved = parsedMapper.mappers().getMapper("field1").fieldType().similarity();
+        assertThat(resolved, instanceOf(BooleanSimilarityProvider.class));
+    }
+
     public void testResolveSimilaritiesFromMapping_DFR() throws IOException {
         String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
             .startObject("properties")

+ 11 - 1
docs/reference/mapping/params/similarity.asciidoc

@@ -3,7 +3,7 @@
 
 Elasticsearch allows you to configure a scoring algorithm or _similarity_ per
 field. The `similarity` setting provides a simple way of choosing a similarity
-algorithm other than the default TF/IDF, such as `BM25`.
+algorithm other than the default `BM25`, such as `TF/IDF`.
 
 Similarities are mostly useful for <<text,`text`>> fields, but can also apply
 to other field types.
@@ -25,6 +25,11 @@ configuration are:
         Lucene. See {defguide}/practical-scoring-function.html[Lucene’s Practical Scoring Function]
         for more information.
 
+`boolean`::
+        A simple boolean similarity, which is used when full-text ranking is not needed
+        and the score should only be based on whether the query terms match or not.
+        Boolean similarity gives terms a score equal to their query boost.
+
 
 The `similarity` can be set on the field level when a field is first created,
 as follows:
@@ -42,6 +47,10 @@ PUT my_index
         "classic_field": {
           "type": "text",
           "similarity": "classic" <2>
+        },
+        "boolean_sim_field": {
+          "type": "text",
+          "similarity": "boolean" <3>
         }
       }
     }
@@ -51,3 +60,4 @@ PUT my_index
 // CONSOLE
 <1> The `default_field` uses the `BM25` similarity.
 <2> The `classic_field` uses the `classic` similarity (i.e. TF/IDF).
+<3> The `boolean_sim_field` uses the `boolean` similarity.