Kaynağa Gözat

percolator: Avoid TooManyClauses exception if number of terms / ranges is exactly equal to 1024

The logic whether to use CoveringQuery was in two places which is why this bug snug in.
Martijn van Groningen 8 yıl önce
ebeveyn
işleme
4ab638b71d

+ 47 - 39
modules/percolator/src/main/java/org/elasticsearch/percolator/PercolatorFieldMapper.java

@@ -244,20 +244,9 @@ public class PercolatorFieldMapper extends FieldMapper {
         Query percolateQuery(String name, PercolateQuery.QueryStore queryStore, List<BytesReference> documents,
                              IndexSearcher searcher, Version indexVersion) throws IOException {
             IndexReader indexReader = searcher.getIndexReader();
-            Tuple<List<Query>, Boolean> t = createCandidateQueryClauses(indexReader);
-            BooleanQuery.Builder candidateQuery = new BooleanQuery.Builder();
-            if (t.v2() && indexVersion.onOrAfter(Version.V_6_1_0)) {
-                LongValuesSource valuesSource = LongValuesSource.fromIntField(minimumShouldMatchField.name());
-                candidateQuery.add(new CoveringQuery(t.v1(), valuesSource), BooleanClause.Occur.SHOULD);
-            } else {
-                for (Query query : t.v1()) {
-                    candidateQuery.add(query, BooleanClause.Occur.SHOULD);
-                }
-            }
-            // include extractionResultField:failed, because docs with this term have no extractedTermsField
-            // and otherwise we would fail to return these docs. Docs that failed query term extraction
-            // always need to be verified by MemoryIndex:
-            candidateQuery.add(new TermQuery(new Term(extractionResultField.name(), EXTRACTION_FAILED)), BooleanClause.Occur.SHOULD);
+            Tuple<BooleanQuery, Boolean> t = createCandidateQuery(indexReader, indexVersion);
+            Query candidateQuery = t.v1();
+            boolean canUseMinimumShouldMatchField = t.v2();
 
             Query verifiedMatchesQuery;
             // We can only skip the MemoryIndex verification when percolating a single non nested document. We cannot
@@ -265,15 +254,55 @@ public class PercolatorFieldMapper extends FieldMapper {
             // ranges are extracted from IndexReader backed by a RamDirectory holding multiple documents we do
             // not know to which document the terms belong too and for certain queries we incorrectly emit candidate
             // matches as actual match.
-            if (t.v2() && indexReader.maxDoc() == 1) {
+            if (canUseMinimumShouldMatchField && indexReader.maxDoc() == 1) {
                 verifiedMatchesQuery = new TermQuery(new Term(extractionResultField.name(), EXTRACTION_COMPLETE));
             } else {
                 verifiedMatchesQuery = new MatchNoDocsQuery("multiple or nested docs or CoveringQuery could not be used");
             }
-            return new PercolateQuery(name, queryStore, documents, candidateQuery.build(), searcher, verifiedMatchesQuery);
+            return new PercolateQuery(name, queryStore, documents, candidateQuery, searcher, verifiedMatchesQuery);
+        }
+
+        Tuple<BooleanQuery, Boolean> createCandidateQuery(IndexReader indexReader, Version indexVersion) throws IOException {
+            Tuple<List<BytesRef>, Map<String, List<byte[]>>> t = extractTermsAndRanges(indexReader);
+            List<BytesRef> extractedTerms = t.v1();
+            Map<String, List<byte[]>> encodedPointValuesByField = t.v2();
+            // `1 + ` is needed to take into account the EXTRACTION_FAILED should clause
+            boolean canUseMinimumShouldMatchField = 1 + extractedTerms.size() + encodedPointValuesByField.size() <=
+                BooleanQuery.getMaxClauseCount();
+
+            List<Query> subQueries = new ArrayList<>();
+            for (Map.Entry<String, List<byte[]>> entry : encodedPointValuesByField.entrySet()) {
+                String rangeFieldName = entry.getKey();
+                List<byte[]> encodedPointValues = entry.getValue();
+                byte[] min = encodedPointValues.get(0);
+                byte[] max = encodedPointValues.get(1);
+                Query query = BinaryRange.newIntersectsQuery(rangeField.name(), encodeRange(rangeFieldName, min, max));
+                subQueries.add(query);
+            }
+
+            BooleanQuery.Builder candidateQuery = new BooleanQuery.Builder();
+            if (canUseMinimumShouldMatchField && indexVersion.onOrAfter(Version.V_6_1_0)) {
+                LongValuesSource valuesSource = LongValuesSource.fromIntField(minimumShouldMatchField.name());
+                for (BytesRef extractedTerm : extractedTerms) {
+                    subQueries.add(new TermQuery(new Term(queryTermsField.name(), extractedTerm)));
+                }
+                candidateQuery.add(new CoveringQuery(subQueries, valuesSource), BooleanClause.Occur.SHOULD);
+            } else {
+                candidateQuery.add(new TermInSetQuery(queryTermsField.name(), extractedTerms), BooleanClause.Occur.SHOULD);
+                for (Query subQuery : subQueries) {
+                    candidateQuery.add(subQuery, BooleanClause.Occur.SHOULD);
+                }
+            }
+            // include extractionResultField:failed, because docs with this term have no extractedTermsField
+            // and otherwise we would fail to return these docs. Docs that failed query term extraction
+            // always need to be verified by MemoryIndex:
+            candidateQuery.add(new TermQuery(new Term(extractionResultField.name(), EXTRACTION_FAILED)), BooleanClause.Occur.SHOULD);
+            return new Tuple<>(candidateQuery.build(), canUseMinimumShouldMatchField);
         }
 
-        Tuple<List<Query>, Boolean> createCandidateQueryClauses(IndexReader indexReader) throws IOException {
+        // This was extracted the method above, because otherwise it is difficult to test what terms are included in
+        // the query in case a CoveringQuery is used (it does not have a getter to retrieve the clauses)
+        Tuple<List<BytesRef>, Map<String, List<byte[]>>> extractTermsAndRanges(IndexReader indexReader) throws IOException {
             List<BytesRef> extractedTerms = new ArrayList<>();
             Map<String, List<byte[]>> encodedPointValuesByField = new HashMap<>();
 
@@ -299,28 +328,7 @@ public class PercolatorFieldMapper extends FieldMapper {
                     encodedPointValuesByField.put(info.name, encodedPointValues);
                 }
             }
-
-            final boolean canUseMinimumShouldMatchField;
-            final List<Query> queries = new ArrayList<>();
-            if (extractedTerms.size() + encodedPointValuesByField.size() <= BooleanQuery.getMaxClauseCount()) {
-                canUseMinimumShouldMatchField = true;
-                for (BytesRef extractedTerm : extractedTerms) {
-                    queries.add(new TermQuery(new Term(queryTermsField.name(), extractedTerm)));
-                }
-            } else {
-                canUseMinimumShouldMatchField = false;
-                queries.add(new TermInSetQuery(queryTermsField.name(), extractedTerms));
-            }
-
-            for (Map.Entry<String, List<byte[]>> entry : encodedPointValuesByField.entrySet()) {
-                String rangeFieldName = entry.getKey();
-                List<byte[]> encodedPointValues = entry.getValue();
-                byte[] min = encodedPointValues.get(0);
-                byte[] max = encodedPointValues.get(1);
-                Query query = BinaryRange.newIntersectsQuery(rangeField.name(), encodeRange(rangeFieldName, min, max));
-                queries.add(query);
-            }
-            return new Tuple<>(queries, canUseMinimumShouldMatchField);
+            return new Tuple<>(extractedTerms, encodedPointValuesByField);
         }
 
     }

+ 6 - 4
modules/percolator/src/test/java/org/elasticsearch/percolator/CandidateQueryTests.java

@@ -45,6 +45,7 @@ import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.CoveringQuery;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.FilterScorer;
@@ -505,16 +506,17 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
             }
             try (IndexReader ir = DirectoryReader.open(directory)){
                 IndexSearcher percolateSearcher = new IndexSearcher(ir);
-                Query query =
+                PercolateQuery query = (PercolateQuery)
                     fieldType.percolateQuery("_name", queryStore, Collections.singletonList(new BytesArray("{}")), percolateSearcher, v);
+                BooleanQuery candidateQuery = (BooleanQuery) query.getCandidateMatchesQuery();
+                assertThat(candidateQuery.clauses().get(0).getQuery(), instanceOf(CoveringQuery.class));
                 TopDocs topDocs = shardSearcher.search(query, 10);
                 assertEquals(2L, topDocs.totalHits);
                 assertEquals(2, topDocs.scoreDocs.length);
                 assertEquals(0, topDocs.scoreDocs[0].doc);
                 assertEquals(2, topDocs.scoreDocs[1].doc);
 
-                query = new ConstantScoreQuery(query);
-                topDocs = shardSearcher.search(query, 10);
+                topDocs = shardSearcher.search(new ConstantScoreQuery(query), 10);
                 assertEquals(2L, topDocs.totalHits);
                 assertEquals(2, topDocs.scoreDocs.length);
                 assertEquals(0, topDocs.scoreDocs[0].doc);
@@ -526,7 +528,7 @@ public class CandidateQueryTests extends ESSingleNodeTestCase {
         try (RAMDirectory directory = new RAMDirectory()) {
             try (IndexWriter iw = new IndexWriter(directory, newIndexWriterConfig())) {
                 Document document = new Document();
-                for (int i = 0; i < 1025; i++) {
+                for (int i = 0; i < 1024; i++) {
                     int fieldNumber = 2 + i;
                     document.add(new StringField("field", "value" + fieldNumber, Field.Store.NO));
                 }

+ 2 - 1
modules/percolator/src/test/java/org/elasticsearch/percolator/PercolateQueryBuilderTests.java

@@ -53,6 +53,7 @@ import org.elasticsearch.test.AbstractQueryTestCase;
 import org.hamcrest.Matchers;
 
 import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Base64;
@@ -340,7 +341,7 @@ public class PercolateQueryBuilderTests extends AbstractQueryTestCase<PercolateQ
             xContent.map(source);
             return xContent.bytes();
         } catch (IOException e) {
-            throw new RuntimeException(e);
+            throw new UncheckedIOException(e);
         }
     }
 

+ 102 - 46
modules/percolator/src/test/java/org/elasticsearch/percolator/PercolatorFieldMapperTests.java

@@ -32,6 +32,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.CoveringQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermInSetQuery;
@@ -39,6 +40,7 @@ import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.join.ScoreMode;
 import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.Version;
 import org.elasticsearch.action.support.PlainActionFuture;
 import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.collect.Tuple;
@@ -114,7 +116,6 @@ import static org.elasticsearch.percolator.PercolatorFieldMapper.EXTRACTION_PART
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
-import static org.hamcrest.Matchers.is;
 
 public class PercolatorFieldMapperTests extends ESSingleNodeTestCase {
 
@@ -304,7 +305,7 @@ public class PercolatorFieldMapperTests extends ESSingleNodeTestCase {
         assertThat(document.getField(fieldType.extractionResultField.name()).stringValue(), equalTo(EXTRACTION_PARTIAL));
     }
 
-    public void testCreateCandidateQuery() throws Exception {
+    public void testExtractTermsAndRanges() throws Exception {
         addQueryFieldMappings();
 
         MemoryIndex memoryIndex = new MemoryIndex(false);
@@ -316,60 +317,87 @@ public class PercolatorFieldMapperTests extends ESSingleNodeTestCase {
 
         IndexReader indexReader = memoryIndex.createSearcher().getIndexReader();
 
-        Tuple<List<Query>, Boolean> t = fieldType.createCandidateQueryClauses(indexReader);
-        assertTrue(t.v2());
-        List<Query> clauses = t.v1();
-        clauses.sort(Comparator.comparing(Query::toString));
-        assertEquals(15, clauses.size());
-        assertEquals(fieldType.queryTermsField.name() + ":_field3\u0000me", clauses.get(0).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":_field3\u0000unhide", clauses.get(1).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000brown", clauses.get(2).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000dog", clauses.get(3).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000fox", clauses.get(4).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000jumps", clauses.get(5).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000lazy", clauses.get(6).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000over", clauses.get(7).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000quick", clauses.get(8).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field1\u0000the", clauses.get(9).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field2\u0000more", clauses.get(10).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field2\u0000some", clauses.get(11).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field2\u0000text", clauses.get(12).toString());
-        assertEquals(fieldType.queryTermsField.name() + ":field4\u0000123", clauses.get(13).toString());
-        assertThat(clauses.get(14).toString(), containsString(fieldName + ".range_field:<ranges:"));
+        Tuple<List<BytesRef>, Map<String, List<byte[]>>> t = fieldType.extractTermsAndRanges(indexReader);
+        assertEquals(1, t.v2().size());
+        Map<String, List<byte[]>> rangesMap = t.v2();
+        assertEquals(1, rangesMap.size());
+
+        List<byte[]> range = rangesMap.get("number_field2");
+        assertNotNull(range);
+        assertEquals(10, LongPoint.decodeDimension(range.get(0), 0));
+        assertEquals(10, LongPoint.decodeDimension(range.get(1), 0));
+
+        List<BytesRef> terms = t.v1();
+        terms.sort(BytesRef::compareTo);
+        assertEquals(14, terms.size());
+        assertEquals("_field3\u0000me", terms.get(0).utf8ToString());
+        assertEquals("_field3\u0000unhide", terms.get(1).utf8ToString());
+        assertEquals("field1\u0000brown", terms.get(2).utf8ToString());
+        assertEquals("field1\u0000dog", terms.get(3).utf8ToString());
+        assertEquals("field1\u0000fox", terms.get(4).utf8ToString());
+        assertEquals("field1\u0000jumps", terms.get(5).utf8ToString());
+        assertEquals("field1\u0000lazy", terms.get(6).utf8ToString());
+        assertEquals("field1\u0000over", terms.get(7).utf8ToString());
+        assertEquals("field1\u0000quick", terms.get(8).utf8ToString());
+        assertEquals("field1\u0000the", terms.get(9).utf8ToString());
+        assertEquals("field2\u0000more", terms.get(10).utf8ToString());
+        assertEquals("field2\u0000some", terms.get(11).utf8ToString());
+        assertEquals("field2\u0000text", terms.get(12).utf8ToString());
+        assertEquals("field4\u0000123", terms.get(13).utf8ToString());
     }
 
 
-    public void testCreateCandidateQuery_largeDocument() throws Exception {
+    public void testCreateCandidateQuery() throws Exception {
         addQueryFieldMappings();
 
         MemoryIndex memoryIndex = new MemoryIndex(false);
         StringBuilder text = new StringBuilder();
-        for (int i = 0; i < 1023; i++) {
+        for (int i = 0; i < 1022; i++) {
             text.append(i).append(' ');
         }
         memoryIndex.addField("field1", text.toString(), new WhitespaceAnalyzer());
         memoryIndex.addField(new LongPoint("field2", 10L), new WhitespaceAnalyzer());
         IndexReader indexReader = memoryIndex.createSearcher().getIndexReader();
 
-        Tuple<List<Query>, Boolean> t = fieldType.createCandidateQueryClauses(indexReader);
+        Tuple<BooleanQuery, Boolean> t = fieldType.createCandidateQuery(indexReader, Version.CURRENT);
         assertTrue(t.v2());
-        List<Query> clauses = t.v1();
-        assertEquals(1024, clauses.size());
-        assertThat(clauses.get(1023).toString(), containsString(fieldName + ".range_field:<ranges:"));
+        assertEquals(2, t.v1().clauses().size());
+        assertThat(t.v1().clauses().get(0).getQuery(), instanceOf(CoveringQuery.class));
+        assertThat(t.v1().clauses().get(1).getQuery(), instanceOf(TermQuery.class));
 
         // Now push it over the edge, so that it falls back using TermInSetQuery
         memoryIndex.addField("field2", "value", new WhitespaceAnalyzer());
         indexReader = memoryIndex.createSearcher().getIndexReader();
-        t = fieldType.createCandidateQueryClauses(indexReader);
+        t = fieldType.createCandidateQuery(indexReader, Version.CURRENT);
         assertFalse(t.v2());
-        clauses = t.v1();
-        assertEquals(2, clauses.size());
-        TermInSetQuery termInSetQuery = (TermInSetQuery) clauses.get(0);
-        assertEquals(1024, termInSetQuery.getTermData().size());
-        assertThat(clauses.get(1).toString(), containsString(fieldName + ".range_field:<ranges:"));
+        assertEquals(3, t.v1().clauses().size());
+        TermInSetQuery terms = (TermInSetQuery) t.v1().clauses().get(0).getQuery();
+        assertEquals(1023, terms.getTermData().size());
+        assertThat(t.v1().clauses().get(1).getQuery().toString(), containsString(fieldName + ".range_field:<ranges:"));
+        assertThat(t.v1().clauses().get(2).getQuery().toString(), containsString(fieldName + ".extraction_result:failed"));
+    }
+
+    public void testCreateCandidateQuery_oldIndex() throws Exception {
+        addQueryFieldMappings();
+
+        MemoryIndex memoryIndex = new MemoryIndex(false);
+        memoryIndex.addField("field1", "value1", new WhitespaceAnalyzer());
+        IndexReader indexReader = memoryIndex.createSearcher().getIndexReader();
+
+        Tuple<BooleanQuery, Boolean> t = fieldType.createCandidateQuery(indexReader, Version.CURRENT);
+        assertTrue(t.v2());
+        assertEquals(2, t.v1().clauses().size());
+        assertThat(t.v1().clauses().get(0).getQuery(), instanceOf(CoveringQuery.class));
+        assertThat(t.v1().clauses().get(1).getQuery(), instanceOf(TermQuery.class));
+
+        t = fieldType.createCandidateQuery(indexReader, Version.V_6_0_0);
+        assertTrue(t.v2());
+        assertEquals(2, t.v1().clauses().size());
+        assertThat(t.v1().clauses().get(0).getQuery(), instanceOf(TermInSetQuery.class));
+        assertThat(t.v1().clauses().get(1).getQuery(), instanceOf(TermQuery.class));
     }
 
-    public void testCreateCandidateQuery_numberFields() throws Exception {
+    public void testExtractTermsAndRanges_numberFields() throws Exception {
         addQueryFieldMappings();
 
         MemoryIndex memoryIndex = new MemoryIndex(false);
@@ -385,17 +413,45 @@ public class PercolatorFieldMapperTests extends ESSingleNodeTestCase {
 
         IndexReader indexReader = memoryIndex.createSearcher().getIndexReader();
 
-        Tuple<List<Query>, Boolean> t = fieldType.createCandidateQueryClauses(indexReader);
-        assertThat(t.v2(), is(true));
-        List<Query> clauses = t.v1();
-        assertEquals(7, clauses.size());
-        assertThat(clauses.get(0).toString(), containsString(fieldName + ".range_field:<ranges:[["));
-        assertThat(clauses.get(1).toString(), containsString(fieldName + ".range_field:<ranges:[["));
-        assertThat(clauses.get(2).toString(), containsString(fieldName + ".range_field:<ranges:[["));
-        assertThat(clauses.get(3).toString(), containsString(fieldName + ".range_field:<ranges:[["));
-        assertThat(clauses.get(4).toString(), containsString(fieldName + ".range_field:<ranges:[["));
-        assertThat(clauses.get(5).toString(), containsString(fieldName + ".range_field:<ranges:[["));
-        assertThat(clauses.get(6).toString(), containsString(fieldName + ".range_field:<ranges:[["));
+        Tuple<List<BytesRef>, Map<String, List<byte[]>>> t = fieldType.extractTermsAndRanges(indexReader);
+        assertEquals(0, t.v1().size());
+        Map<String, List<byte[]>> rangesMap = t.v2();
+        assertEquals(7, rangesMap.size());
+
+        List<byte[]> range = rangesMap.get("number_field1");
+        assertNotNull(range);
+        assertEquals(10, IntPoint.decodeDimension(range.get(0), 0));
+        assertEquals(10, IntPoint.decodeDimension(range.get(1), 0));
+
+        range = rangesMap.get("number_field2");
+        assertNotNull(range);
+        assertEquals(20L, LongPoint.decodeDimension(range.get(0), 0));
+        assertEquals(20L, LongPoint.decodeDimension(range.get(1), 0));
+
+        range = rangesMap.get("number_field3");
+        assertNotNull(range);
+        assertEquals(30L, LongPoint.decodeDimension(range.get(0), 0));
+        assertEquals(30L, LongPoint.decodeDimension(range.get(1), 0));
+
+        range = rangesMap.get("number_field4");
+        assertNotNull(range);
+        assertEquals(30F, HalfFloatPoint.decodeDimension(range.get(0), 0), 0F);
+        assertEquals(30F, HalfFloatPoint.decodeDimension(range.get(1), 0), 0F);
+
+        range = rangesMap.get("number_field5");
+        assertNotNull(range);
+        assertEquals(40F, FloatPoint.decodeDimension(range.get(0), 0), 0F);
+        assertEquals(40F, FloatPoint.decodeDimension(range.get(1), 0), 0F);
+
+        range = rangesMap.get("number_field6");
+        assertNotNull(range);
+        assertEquals(50D, DoublePoint.decodeDimension(range.get(0), 0), 0D);
+        assertEquals(50D, DoublePoint.decodeDimension(range.get(1), 0), 0D);
+
+        range = rangesMap.get("number_field7");
+        assertNotNull(range);
+        assertEquals(InetAddresses.forString("192.168.1.12"), InetAddressPoint.decode(range.get(0)));
+        assertEquals(InetAddresses.forString("192.168.1.24"), InetAddressPoint.decode(range.get(1)));
     }
 
     public void testPercolatorFieldMapper() throws Exception {