Browse Source

Avoid negative scores with cross_fields type (#89016)

The cross_fields scoring type can produce negative scores when some documents
are missing fields. When blending term document frequencies, we take the maximum
document frequency across all fields. If one field appears in fewer documents
than another, the blended document frequency can exceed that field's document
count, which makes its IDF negative. This is because IDF is calculated as
`Math.log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5))`, which is negative
whenever `docFreq > docCount`.

This change adjusts the docFreq for each field to `Math.min(docCount, docFreq)`
so that the IDF can never become negative. It makes sense that the term document
frequency should never exceed the number of documents containing the field.
Julie Tibshirani 3 years ago
parent
commit
3c1b070329

+ 6 - 0
docs/changelog/89016.yaml

@@ -0,0 +1,6 @@
+pr: 89016
+summary: Avoid negative scores with `cross_fields` type
+area: Ranking
+type: bug
+issues:
+ - 44700

+ 6 - 5
docs/reference/query-dsl/multi-match-query.asciidoc

@@ -388,11 +388,12 @@ explanation:
 Also, accepts `analyzer`, `boost`, `operator`, `minimum_should_match`,
 `lenient` and `zero_terms_query`.
 
-WARNING: The `cross_fields` type blends field statistics in a way that does
-not always produce well-formed scores (for example scores can become
-negative). As an alternative, you can consider the
-<<query-dsl-combined-fields-query,`combined_fields`>> query, which is also
-term-centric but combines field statistics in a more robust way.
+WARNING: The `cross_fields` type blends field statistics in a complex way that
+can be hard to interpret. The score combination can even be incorrect, in
+particular when some documents contain some of the search fields, but not all
+of them. You should consider the
+<<query-dsl-combined-fields-query,`combined_fields`>> query as an alternative,
+which is also term-centric but combines field statistics in a more robust way.
 
 [[cross-field-analysis]]
 ===== `cross_field` and analysis

+ 4 - 1
server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java

@@ -148,7 +148,10 @@ public abstract class BlendedTermQuery extends Query {
             if (prev > current) {
                 actualDf++;
             }
-            contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
+
+            int docCount = reader.getDocCount(terms[i].field());
+            int newDocFreq = Math.min(actualDf, docCount);
+            contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq);
             prev = current;
             sumTTF += ctx.totalTermFreq();
         }

+ 33 - 0
server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java

@@ -248,6 +248,39 @@ public class BlendedTermQueryTests extends ESTestCase {
         dir.close();
     }
 
+    public void testMissingFields() throws IOException {
+        Directory dir = newDirectory();
+        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
+        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+        ft.freeze();
+
+        for (int i = 0; i < 10; i++) {
+            Document d = new Document();
+            d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
+            d.add(new Field("dense", "foo", ft));
+            // Add a sparse field with high totalTermFreq but low docCount
+            if (i % 5 == 0) {
+                d.add(new Field("sparse", "foo", ft));
+                d.add(new Field("sparse", "one two three four five size", ft));
+            }
+            w.addDocument(d);
+        }
+        w.commit();
+
+        DirectoryReader reader = DirectoryReader.open(w);
+        IndexSearcher searcher = setSimilarity(newSearcher(reader));
+
+        String[] fields = new String[] { "dense", "sparse" };
+        Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f);
+        TopDocs search = searcher.search(query, 10);
+        ScoreDoc[] scoreDocs = search.scoreDocs;
+        assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue());
+
+        reader.close();
+        w.close();
+        dir.close();
+    }
+
     public void testEqualsAndHash() {
         String[] fields = new String[1 + random().nextInt(10)];
         for (int i = 0; i < fields.length; i++) {