Browse Source

Merge pull request #12294 from jpountz/fix/multi_match_boost

`multi_match` query applies boosts too many times.
Adrien Grand 10 years ago
parent
commit
00093a21dc

+ 26 - 7
core/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java

@@ -33,6 +33,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.InPlaceMergeSorter;
+import org.apache.lucene.util.ToStringUtils;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -62,13 +63,17 @@ import java.util.List;
 public abstract class BlendedTermQuery extends Query {
 
     private final Term[] terms;
+    private final float[] boosts;
 
-
-    public BlendedTermQuery(Term[] terms) {
+    public BlendedTermQuery(Term[] terms, float[] boosts) {
         if (terms == null || terms.length == 0) {
             throw new IllegalArgumentException("terms must not be null or empty");
         }
+        if (boosts != null && boosts.length != terms.length) {
+            throw new IllegalArgumentException("boosts must have the same size as terms");
+        }
         this.terms = terms;
+        this.boosts = boosts;
     }
 
     @Override
@@ -231,8 +236,22 @@ public abstract class BlendedTermQuery extends Query {
 
     @Override
     public String toString(String field) {
-        return "blended(terms: " + Arrays.toString(terms) + ")";
-
+        StringBuilder builder = new StringBuilder("blended(terms:[");
+        for (int i = 0; i < terms.length; ++i) {
+            builder.append(terms[i]);
+            float boost = 1f;
+            if (boosts != null) {
+                boost = boosts[i];
+            }
+            builder.append(ToStringUtils.boost(boost));
+            builder.append(", ");
+        }
+        if (terms.length > 0) {
+            builder.setLength(builder.length() - 2);
+        }
+        builder.append("])");
+        builder.append(ToStringUtils.boost(getBoost()));
+        return builder.toString();
     }
 
     private volatile Term[] equalTerms = null;
@@ -277,7 +296,7 @@ public abstract class BlendedTermQuery extends Query {
     }
 
     public static BlendedTermQuery booleanBlendedQuery(Term[] terms, final float[] boosts, final boolean disableCoord) {
-        return new BlendedTermQuery(terms) {
+        return new BlendedTermQuery(terms, boosts) {
             @Override
             protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) {
                 BooleanQuery query = new BooleanQuery(disableCoord);
@@ -294,7 +313,7 @@ public abstract class BlendedTermQuery extends Query {
     }
 
     public static BlendedTermQuery commonTermsBlendedQuery(Term[] terms, final float[] boosts, final boolean disableCoord, final float maxTermFrequency) {
-        return new BlendedTermQuery(terms) {
+        return new BlendedTermQuery(terms, boosts) {
             @Override
             protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) {
                 BooleanQuery query = new BooleanQuery(true);
@@ -334,7 +353,7 @@ public abstract class BlendedTermQuery extends Query {
     }
 
     public static BlendedTermQuery dismaxBlendedQuery(Term[] terms, final float[] boosts, final float tieBreakerMultiplier) {
-        return new BlendedTermQuery(terms) {
+        return new BlendedTermQuery(terms, boosts) {
             @Override
             protected Query topLevelQuery(Term[] terms, TermContext[] ctx, int[] docFreqs, int maxDoc) {
                 DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier);

+ 3 - 3
core/src/main/java/org/elasticsearch/index/search/MultiMatchQuery.java

@@ -85,7 +85,7 @@ public class MultiMatchQuery extends MatchQuery {
                 throw new IllegalStateException("No such type: " + type);
         }
         final List<? extends Query> queries = queryBuilder.buildGroupedQueries(type, fieldNames, value, minimumShouldMatch);
-        return queryBuilder.conbineGrouped(queries);
+        return queryBuilder.combineGrouped(queries);
     }
 
     private QueryBuilder queryBuilder;
@@ -119,7 +119,7 @@ public class MultiMatchQuery extends MatchQuery {
             return parseAndApply(type, field, value, minimumShouldMatch, boostValue);
         }
 
-        public Query conbineGrouped(List<? extends Query> groupQuery) {
+        public Query combineGrouped(List<? extends Query> groupQuery) {
             if (groupQuery == null || groupQuery.isEmpty()) {
                 return null;
             }
@@ -196,7 +196,7 @@ public class MultiMatchQuery extends MatchQuery {
                     blendedFields = null;
                 }
                 final FieldAndFieldType fieldAndFieldType = group.get(0);
-                Query q = parseGroup(type.matchQueryType(), fieldAndFieldType.field, fieldAndFieldType.boost, value, minimumShouldMatch);
+                Query q = parseGroup(type.matchQueryType(), fieldAndFieldType.field, 1f, value, minimumShouldMatch);
                 if (q != null) {
                     queries.add(q);
                 }

+ 20 - 0
core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java

@@ -54,6 +54,7 @@ import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.index.IndexService;
+import org.elasticsearch.index.engine.Engine;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.mapper.ParsedDocument;
 import org.elasticsearch.index.mapper.core.NumberFieldMapper;
@@ -83,6 +84,7 @@ import static org.hamcrest.Matchers.*;
 public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase {
 
     private IndexQueryParserService queryParser;
+    private IndexService indexService;
 
     @Before
     public void setup() throws IOException {
@@ -99,6 +101,7 @@ public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase {
         assertNotNull(doc.dynamicMappingsUpdate());
         client().admin().indices().preparePutMapping("test").setType("person").setSource(doc.dynamicMappingsUpdate().toString()).get();
 
+        this.indexService = indexService;
         queryParser = indexService.queryParserService();
     }
 
@@ -2269,6 +2272,23 @@ public class SimpleIndexQueryParserTests extends ESSingleNodeTestCase {
         assertThat(parsedQuery, instanceOf(BooleanQuery.class));
     }
 
+    public void testCrossFieldMultiMatchQuery() throws IOException {
+        IndexQueryParserService queryParser = queryParser();
+        Query parsedQuery = queryParser.parse(multiMatchQuery("banon", "name.first^2", "name.last^3", "foobar").type(MultiMatchQueryBuilder.Type.CROSS_FIELDS)).query();
+        try (Engine.Searcher searcher = indexService.shardSafe(0).acquireSearcher("test")) {
+            Query rewrittenQuery = searcher.searcher().rewrite(parsedQuery);
+
+            BooleanQuery expected = new BooleanQuery();
+            expected.add(new TermQuery(new Term("foobar", "banon")), Occur.SHOULD);
+            TermQuery tq1 = new TermQuery(new Term("name.first", "banon"));
+            tq1.setBoost(2);
+            TermQuery tq2 = new TermQuery(new Term("name.last", "banon"));
+            tq2.setBoost(3);
+            expected.add(new DisjunctionMaxQuery(Arrays.<Query>asList(tq1, tq2), 0f), Occur.SHOULD);
+            assertEquals(expected, rewrittenQuery);
+        }
+    }
+
     @Test
     public void testSimpleQueryString() throws Exception {
         IndexQueryParserService queryParser = queryParser();

+ 11 - 4
docs/reference/query-dsl/multi-match-query.asciidoc

@@ -302,10 +302,17 @@ document to match.  (Compare this to
 
 That solves one of the two problems. The problem of differing term frequencies
 is solved by _blending_ the term frequencies for all fields in order to even
-out the differences.  In other words, `first_name:smith` will be treated as
-though it has the same weight as `last_name:smith`. (Actually,
-`last_name:smith` is given a tiny advantage over `first_name:smith`, just to
-make the order of results more stable.)
+out the differences.
+
+In practice, `first_name:smith` will be treated as though it has the same
+frequencies as `last_name:smith`, plus one. This will make matches on
+`first_name` and `last_name` have comparable scores, with a tiny advantage
+for `last_name` since it is the most likely field that contains `smith`.
+
+Note that `cross_fields` is usually only useful on short string fields
+that all have a `boost` of `1`. Otherwise boosts, term freqs and length
+normalization contribute to the score in such a way that the blending of term
+statistics is not meaningful anymore.
 
 If you run the above query through the <<search-validate>>, it returns this
 explanation: