Browse Source

percolator: Do not take duplicate query extractions into account for minimum_should_match attribute

If a percolator query contains duplicate query clauses somewhere in the query tree then
when these clauses are extracted then they should not affect the msm.

This can lead a percolator query that should be a valid match not become a candidate match,
because at query time, the msm that is being used by the CoveringQuery would never match with
the msm used at index time.

Closes #28315
Martijn van Groningen 7 years ago
parent
commit
204f4022c2

+ 22 - 2
modules/percolator/src/main/java/org/elasticsearch/percolator/QueryAnalyzer.java

@@ -380,7 +380,21 @@ final class QueryAnalyzer {
                                     msm += 1;
                                 }
                             } else {
-                                msm += result.minimumShouldMatch;
+                                // In case that there are duplicate query extractions we need to be careful with incrementing msm,
+                                // because that could lead to valid matches not becoming candidate matches:
+                                // query: (field:val1 AND field:val2) AND (field:val2 AND field:val3)
+                                // doc:   field: val1 val2 val3
+                                // So lets be protective and decrease the msm:
+                                int resultMsm = result.minimumShouldMatch;
+                                for (QueryExtraction queryExtraction : result.extractions) {
+                                    if (extractions.contains(queryExtraction)) {
+                                        // To protect against negative msm:
+                                        // (sub results could consist out of disjunction and conjunction and
+                                        // then we do not know which extraction contributed to msm)
+                                        resultMsm = Math.max(0, resultMsm - 1);
+                                    }
+                                }
+                                msm += resultMsm;
                             }
                             verified &= result.verified;
                             matchAllDocs &= result.matchAllDocs;
@@ -519,10 +533,16 @@ final class QueryAnalyzer {
             if (subResult.matchAllDocs) {
                 numMatchAllClauses++;
             }
+            int resultMsm = subResult.minimumShouldMatch;
+            for (QueryExtraction extraction : subResult.extractions) {
+                if (terms.contains(extraction)) {
+                    resultMsm = Math.max(1, resultMsm - 1);
+                }
+            }
+            msmPerClause[i] = resultMsm;
             terms.addAll(subResult.extractions);
 
             QueryExtraction[] t = subResult.extractions.toArray(new QueryExtraction[1]);
-            msmPerClause[i] = subResult.minimumShouldMatch;
             if (subResult.extractions.size() == 1 && t[0].range != null) {
                 rangeFieldNames[i] = t[0].range.fieldName;
             }

+ 50 - 0
modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java

@@ -72,6 +72,7 @@ import org.apache.lucene.store.RAMDirectory;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.CheckedFunction;
 import org.elasticsearch.common.bytes.BytesArray;
+import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.compress.CompressedXContent;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentFactory;
@@ -622,6 +623,55 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
         }
     }
 
+    public void testDuplicatedClauses() throws Exception {
+        List<ParseContext.Document> docs = new ArrayList<>();
+
+        BooleanQuery.Builder builder = new BooleanQuery.Builder();
+        BooleanQuery.Builder builder1 = new BooleanQuery.Builder();
+        builder1.add(new TermQuery(new Term("field", "value1")), BooleanClause.Occur.MUST);
+        builder1.add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST);
+        builder.add(builder1.build(), BooleanClause.Occur.MUST);
+        BooleanQuery.Builder builder2 = new BooleanQuery.Builder();
+        builder2.add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST);
+        builder2.add(new TermQuery(new Term("field", "value3")), BooleanClause.Occur.MUST);
+        builder.add(builder2.build(), BooleanClause.Occur.MUST);
+        addQuery(builder.build(), docs);
+
+        builder = new BooleanQuery.Builder()
+                .setMinimumNumberShouldMatch(2);
+        builder1 = new BooleanQuery.Builder();
+        builder1.add(new TermQuery(new Term("field", "value1")), BooleanClause.Occur.MUST);
+        builder1.add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST);
+        builder.add(builder1.build(), BooleanClause.Occur.SHOULD);
+        builder2 = new BooleanQuery.Builder();
+        builder2.add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST);
+        builder2.add(new TermQuery(new Term("field", "value3")), BooleanClause.Occur.MUST);
+        builder.add(builder2.build(), BooleanClause.Occur.SHOULD);
+        BooleanQuery.Builder builder3 = new BooleanQuery.Builder();
+        builder3.add(new TermQuery(new Term("field", "value3")), BooleanClause.Occur.MUST);
+        builder3.add(new TermQuery(new Term("field", "value4")), BooleanClause.Occur.MUST);
+        builder.add(builder3.build(), BooleanClause.Occur.SHOULD);
+        addQuery(builder.build(), docs);
+
+        indexWriter.addDocuments(docs);
+        indexWriter.close();
+        directoryReader = DirectoryReader.open(directory);
+        IndexSearcher shardSearcher = newSearcher(directoryReader);
+        shardSearcher.setQueryCache(null);
+
+        Version v = Version.CURRENT;
+        List<BytesReference> sources = Collections.singletonList(new BytesArray("{}"));
+
+        MemoryIndex memoryIndex = new MemoryIndex();
+        memoryIndex.addField("field", "value1 value2 value3", new WhitespaceAnalyzer());
+        IndexSearcher percolateSearcher = memoryIndex.createSearcher();
+        PercolateQuery query = (PercolateQuery) fieldType.percolateQuery("_name", queryStore, sources, percolateSearcher, v);
+        TopDocs topDocs = shardSearcher.search(query, 10, new Sort(SortField.FIELD_DOC), true, true);
+        assertEquals(2L, topDocs.totalHits);
+        assertEquals(0, topDocs.scoreDocs[0].doc);
+        assertEquals(1, topDocs.scoreDocs[1].doc);
+    }
+
     private void duelRun(PercolateQuery.QueryStore queryStore, MemoryIndex memoryIndex, IndexSearcher shardSearcher) throws IOException {
         boolean requireScore = randomBoolean();
         IndexSearcher percolateSearcher = memoryIndex.createSearcher();

+ 75 - 0
modules/percolator/src/test/java/org/elasticsearch/percolator/PercolatorFieldMapperTests.java

@@ -100,8 +100,10 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
+import java.util.stream.Collectors;
 
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
@@ -850,6 +852,79 @@ public class PercolatorFieldMapperTests extends ESSingleNodeTestCase {
         }
     }
 
+    public void testDuplicatedClauses() throws Exception {
+        addQueryFieldMappings();
+
+        QueryBuilder qb = boolQuery()
+                .must(boolQuery().must(termQuery("field", "value1")).must(termQuery("field", "value2")))
+                .must(boolQuery().must(termQuery("field", "value2")).must(termQuery("field", "value3")));
+        ParsedDocument doc = mapperService.documentMapper("doc").parse(SourceToParse.source("test", "doc", "1",
+                XContentFactory.jsonBuilder().startObject()
+                        .field(fieldName, qb)
+                        .endObject().bytes(),
+                XContentType.JSON));
+
+        List<String> values = Arrays.stream(doc.rootDoc().getFields(fieldType.queryTermsField.name()))
+                .map(f -> f.binaryValue().utf8ToString())
+                .sorted()
+                .collect(Collectors.toList());
+        assertThat(values.size(), equalTo(3));
+        assertThat(values.get(0), equalTo("field\0value1"));
+        assertThat(values.get(1), equalTo("field\0value2"));
+        assertThat(values.get(2), equalTo("field\0value3"));
+        int msm = doc.rootDoc().getFields(fieldType.minimumShouldMatchField.name())[0].numericValue().intValue();
+        assertThat(msm, equalTo(3));
+
+        qb = boolQuery()
+                .must(boolQuery().must(termQuery("field", "value1")).must(termQuery("field", "value2")))
+                .must(boolQuery().must(termQuery("field", "value2")).must(termQuery("field", "value3")))
+                .must(boolQuery().must(termQuery("field", "value3")).must(termQuery("field", "value4")))
+                .must(boolQuery().should(termQuery("field", "value4")).should(termQuery("field", "value5")));
+        doc = mapperService.documentMapper("doc").parse(SourceToParse.source("test", "doc", "1",
+                XContentFactory.jsonBuilder().startObject()
+                        .field(fieldName, qb)
+                        .endObject().bytes(),
+                XContentType.JSON));
+
+        values = Arrays.stream(doc.rootDoc().getFields(fieldType.queryTermsField.name()))
+                .map(f -> f.binaryValue().utf8ToString())
+                .sorted()
+                .collect(Collectors.toList());
+        assertThat(values.size(), equalTo(5));
+        assertThat(values.get(0), equalTo("field\0value1"));
+        assertThat(values.get(1), equalTo("field\0value2"));
+        assertThat(values.get(2), equalTo("field\0value3"));
+        assertThat(values.get(3), equalTo("field\0value4"));
+        assertThat(values.get(4), equalTo("field\0value5"));
+        msm = doc.rootDoc().getFields(fieldType.minimumShouldMatchField.name())[0].numericValue().intValue();
+        assertThat(msm, equalTo(4));
+
+        qb = boolQuery()
+                .minimumShouldMatch(3)
+                .should(boolQuery().should(termQuery("field", "value1")).should(termQuery("field", "value2")))
+                .should(boolQuery().should(termQuery("field", "value2")).should(termQuery("field", "value3")))
+                .should(boolQuery().should(termQuery("field", "value3")).should(termQuery("field", "value4")))
+                .should(boolQuery().should(termQuery("field", "value4")).should(termQuery("field", "value5")));
+        doc = mapperService.documentMapper("doc").parse(SourceToParse.source("test", "doc", "1",
+                XContentFactory.jsonBuilder().startObject()
+                        .field(fieldName, qb)
+                        .endObject().bytes(),
+                XContentType.JSON));
+
+        values = Arrays.stream(doc.rootDoc().getFields(fieldType.queryTermsField.name()))
+                .map(f -> f.binaryValue().utf8ToString())
+                .sorted()
+                .collect(Collectors.toList());
+        assertThat(values.size(), equalTo(5));
+        assertThat(values.get(0), equalTo("field\0value1"));
+        assertThat(values.get(1), equalTo("field\0value2"));
+        assertThat(values.get(2), equalTo("field\0value3"));
+        assertThat(values.get(3), equalTo("field\0value4"));
+        assertThat(values.get(4), equalTo("field\0value5"));
+        msm = doc.rootDoc().getFields(fieldType.minimumShouldMatchField.name())[0].numericValue().intValue();
+        assertThat(msm, equalTo(3));
+    }
+
     private static byte[] subByteArray(byte[] source, int offset, int length) {
         return Arrays.copyOfRange(source, offset, offset + length);
     }

+ 60 - 0
modules/percolator/src/test/java/org/elasticsearch/percolator/QueryAnalyzerTests.java

@@ -1108,6 +1108,66 @@ public class QueryAnalyzerTests extends ESTestCase {
         assertEquals("_field1", new ArrayList<>(result.extractions).get(1).range.fieldName);
     }
 
+    public void testExtractQueryMetadata_duplicatedClauses() {
+        BooleanQuery.Builder builder = new BooleanQuery.Builder();
+        builder.add(
+                new BooleanQuery.Builder()
+                        .add(new TermQuery(new Term("field", "value1")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST)
+                        .build(),
+                BooleanClause.Occur.MUST
+        );
+        builder.add(
+                new BooleanQuery.Builder()
+                        .add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("field", "value3")), BooleanClause.Occur.MUST)
+                        .build(),
+                BooleanClause.Occur.MUST
+        );
+        builder.add(
+                new BooleanQuery.Builder()
+                        .add(new TermQuery(new Term("field", "value3")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("field", "value4")), BooleanClause.Occur.MUST)
+                        .build(),
+                BooleanClause.Occur.MUST
+        );
+        Result result = analyze(builder.build(), Version.CURRENT);
+        assertThat(result.verified, is(true));
+        assertThat(result.matchAllDocs, is(false));
+        assertThat(result.minimumShouldMatch, equalTo(4));
+        assertTermsEqual(result.extractions, new Term("field", "value1"), new Term("field", "value2"),
+                new Term("field", "value3"), new Term("field", "value4"));
+
+        builder = new BooleanQuery.Builder().setMinimumNumberShouldMatch(2);
+        builder.add(
+                new BooleanQuery.Builder()
+                        .add(new TermQuery(new Term("field", "value1")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST)
+                        .build(),
+                BooleanClause.Occur.SHOULD
+        );
+        builder.add(
+                new BooleanQuery.Builder()
+                        .add(new TermQuery(new Term("field", "value2")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("field", "value3")), BooleanClause.Occur.MUST)
+                        .build(),
+                BooleanClause.Occur.SHOULD
+        );
+        builder.add(
+                new BooleanQuery.Builder()
+                        .add(new TermQuery(new Term("field", "value3")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("field", "value4")), BooleanClause.Occur.MUST)
+                        .build(),
+                BooleanClause.Occur.SHOULD
+        );
+        result = analyze(builder.build(), Version.CURRENT);
+        assertThat(result.verified, is(true));
+        assertThat(result.matchAllDocs, is(false));
+        assertThat(result.minimumShouldMatch, equalTo(2));
+        assertTermsEqual(result.extractions, new Term("field", "value1"), new Term("field", "value2"),
+                new Term("field", "value3"), new Term("field", "value4"));
+    }
+
     private static void assertDimension(byte[] expected, Consumer<byte[]> consumer) {
         byte[] dest = new byte[expected.length];
         consumer.accept(dest);