Browse Source

Wildcard field regex query fix (#78839)

Fix for wildcard field query optimisation that rewrites to a match all for regexes like .*

A bug was found in this complex rewrite logic so we have simplified the detection of .* type regexes by examining the Automaton for the regex rather than our parsed form of it which is expressed as a Lucene BooleanQuery. The old logic relied on a recursive "simplify" function on the BooleanQuery which has now been removed. We now rely on Lucene's query rewrite logic to simplify expressions at query time and consequently some of the tests had to change to do some of this rewriting before running test comparisons.

Closes #78391
markharwood 4 years ago
parent
commit
216f135412

+ 0 - 51
x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/MatchAllButRequireVerificationQuery.java

@@ -1,51 +0,0 @@
-/*
- * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
- * or more contributor license agreements. Licensed under the Elastic License
- * 2.0; you may not use this file except in compliance with the Elastic License
- * 2.0.
- */
-
-package org.elasticsearch.xpack.wildcard.mapper;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.MatchAllDocsQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.QueryVisitor;
-
-import java.io.IOException;
-
-/**
- * A query that matches all documents. The class is more of a marker
- * that we encountered something that will need verification.
- * (A MatchAllDocs query is used to indicate we can match all
- * _without_ verification)
- */
-public final class MatchAllButRequireVerificationQuery extends Query {
-
-  @Override
-    public Query rewrite(IndexReader reader) throws IOException {
-        return new MatchAllDocsQuery();
-    }
-
-  @Override
-  public String toString(String field) {
-    return "*:* (tbc)";
-  }
-
-  @Override
-  public boolean equals(Object o) {
-    return sameClassAs(o);
-  }
-
-  @Override
-  public int hashCode() {
-    return classHash();
-  }
-
-  @Override
-  public void visit(QueryVisitor visitor) {
-    visitor.visitLeaf(this);
-  }
-
-
-}

+ 29 - 88
x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java

@@ -37,6 +37,8 @@ import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.automaton.RegExp.Kind;
 import org.elasticsearch.ElasticsearchParseException;
@@ -348,26 +350,25 @@ public class WildcardFieldMapper extends FieldMapper {
             if (value.length() == 0) {
                 return new MatchNoDocsQuery();
             }
+            
+            //Check for simple "match all expressions e.g. .*
+            RegExp regExp = new RegExp(value, syntaxFlags, matchFlags);
+            Automaton a = regExp.toAutomaton();
+            a = Operations.determinize(a, maxDeterminizedStates);
+            a = MinimizationOperations.minimize(a, maxDeterminizedStates);
+            if (Operations.isTotal(a)) { // Will match all
+                return existsQuery(context);
+            }
 
             RegExp ngramRegex = new RegExp(addLineEndChars(value), syntaxFlags, matchFlags);
+            
 
             Query approxBooleanQuery = toApproximationQuery(ngramRegex);
             Query approxNgramQuery = rewriteBoolToNgramQuery(approxBooleanQuery);
 
-            // MatchAll is a special case meaning the regex is known to match everything .* and
-            // there is no need for verification.
-            if (approxNgramQuery instanceof MatchAllDocsQuery) {
-                return existsQuery(context);
-            }
             RegExp regex = new RegExp(value, syntaxFlags, matchFlags);
             Automaton automaton = regex.toAutomaton(maxDeterminizedStates);
 
-            // MatchAllButRequireVerificationQuery is a special case meaning the regex is reduced to a single
-            // clause which we can't accelerate at all and needs verification. Example would be ".."
-            if (approxNgramQuery instanceof MatchAllButRequireVerificationQuery) {
-                return new BinaryDvConfirmedAutomatonQuery(new MatchAllDocsQuery(), name(), value, automaton);
-            }
-
             // We can accelerate execution with the ngram query
             return new BinaryDvConfirmedAutomatonQuery(approxNgramQuery, name(), value, automaton);
         }
@@ -414,12 +415,12 @@ public class WildcardFieldMapper extends FieldMapper {
                             // plain TermQuery objects together. Boolean queries are interpreted as a black box and not
                             // concatenated.
                             BooleanQuery.Builder wrapper = new BooleanQuery.Builder();
-                            wrapper.add(result, Occur.MUST);
+                            wrapper.add(result, Occur.FILTER);
                             result = wrapper.build();
                         }
                     } else {
                         // Expressions like (a){0,3} match empty string or up to 3 a's.
-                        result = new MatchAllButRequireVerificationQuery();
+                        result = new MatchAllDocsQuery();
                     }
                     break;
                 case REGEXP_ANYSTRING:
@@ -436,7 +437,7 @@ public class WildcardFieldMapper extends FieldMapper {
                 case REGEXP_INTERVAL:
                 case REGEXP_EMPTY:
                 case REGEXP_AUTOMATON:
-                    result = new MatchAllButRequireVerificationQuery();
+                    result = new MatchAllDocsQuery();
                     break;
             }
             assert result != null; // All regex types are understood and translated to a query.
@@ -456,21 +457,21 @@ public class WildcardFieldMapper extends FieldMapper {
                     sequence.append(tq.getTerm().text());
                 } else {
                     if (sequence.length() > 0) {
-                        bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
+                        bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER);
                         sequence = new StringBuilder();
                     }
-                    bAnd.add(query, Occur.MUST);
+                    bAnd.add(query, Occur.FILTER);
                 }
             }
             if (sequence.length() > 0) {
-                bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.MUST);
+                bAnd.add(new TermQuery(new Term("", sequence.toString())), Occur.FILTER);
             }
             BooleanQuery combined = bAnd.build();
             if (combined.clauses().size() > 0) {
                 return combined;
             }
             // There's something in the regex we couldn't represent as a query - resort to a match all with verification
-            return new MatchAllButRequireVerificationQuery();
+            return new MatchAllDocsQuery();
 
         }
 
@@ -498,7 +499,7 @@ public class WildcardFieldMapper extends FieldMapper {
                 }
             }
             // There's something in the regex we couldn't represent as a query - resort to a match all with verification
-            return new MatchAllButRequireVerificationQuery();
+            return new MatchAllDocsQuery();
         }
 
         private static void findLeaves(RegExp exp, Kind kind, List<Query> queries) {
@@ -529,7 +530,7 @@ public class WildcardFieldMapper extends FieldMapper {
                 for (BooleanClause clause : bq) {
                     Query q = rewriteBoolToNgramQuery(clause.getQuery());
                     if (q != null) {
-                        if (clause.getOccur().equals(Occur.MUST)) {
+                        if (clause.getOccur().equals(Occur.FILTER)) {
                             // Can't drop "should" clauses because it can elevate a sibling optional item
                             // to mandatory (shoulds with 1 clause) causing false negatives
                             // Dropping MUSTs increase false positives which are OK because are verified anyway.
@@ -541,7 +542,7 @@ public class WildcardFieldMapper extends FieldMapper {
                         rewritten.add(q, clause.getOccur());
                     }
                 }
-                return simplify(rewritten.build());
+                return rewritten.build();
             }
             if (approxQuery instanceof TermQuery) {
                 TermQuery tq = (TermQuery) approxQuery;
@@ -549,7 +550,7 @@ public class WildcardFieldMapper extends FieldMapper {
                 //Remove simple terms that are only string beginnings or ends.
                 String s = tq.getTerm().text();
                 if (s.equals(WildcardFieldMapper.TOKEN_START_STRING) || s.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
-                    return new MatchAllButRequireVerificationQuery();
+                    return new MatchAllDocsQuery();
                 }
 
                 // Break term into tokens
@@ -557,75 +558,15 @@ public class WildcardFieldMapper extends FieldMapper {
                 getNgramTokens(tokens, s);
                 BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
                 for (String string : tokens) {
-                    addClause(string, rewritten, Occur.MUST);
+                    addClause(string, rewritten, Occur.FILTER);
                 }
-                return simplify(rewritten.build());
+                return rewritten.build();
             }
-            if (isMatchAll(approxQuery)) {
+            if (approxQuery instanceof MatchAllDocsQuery) {
                 return approxQuery;
             }
             throw new IllegalStateException("Invalid query type found parsing regex query:" + approxQuery);
         }
-
-        static Query simplify(Query input) {
-            if (input instanceof BooleanQuery == false) {
-                return input;
-            }
-            BooleanQuery result = (BooleanQuery) input;
-            if (result.clauses().size() == 0) {
-                // A ".*" clause can produce zero clauses in which case we return MatchAll
-                return new MatchAllDocsQuery();
-            }
-            if (result.clauses().size() == 1) {
-                return simplify(result.clauses().get(0).getQuery());
-            }
-
-            // We may have a mix of MatchAll and concrete queries - assess if we can simplify
-            int matchAllCount = 0;
-            int verifyCount = 0;
-            boolean allConcretesAreOptional = true;
-            for (BooleanClause booleanClause : result.clauses()) {
-                Query q = booleanClause.getQuery();
-                if (q instanceof MatchAllDocsQuery) {
-                    matchAllCount++;
-                } else if (q instanceof MatchAllButRequireVerificationQuery) {
-                    verifyCount++;
-                } else {
-                    // Concrete query
-                    if (booleanClause.getOccur() != Occur.SHOULD) {
-                        allConcretesAreOptional = false;
-                    }
-                }
-            }
-
-            if ((allConcretesAreOptional && matchAllCount > 0)) {
-                // Any match all expression takes precedence over all optional concrete queries.
-                return new MatchAllDocsQuery();
-            }
-
-            if ((allConcretesAreOptional && verifyCount > 0)) {
-                // Any match all expression that needs verification takes precedence over all optional concrete queries.
-                return new MatchAllButRequireVerificationQuery();
-            }
-
-            // We have some mandatory concrete queries - strip out the superfluous match all expressions
-            if (allConcretesAreOptional == false && matchAllCount + verifyCount > 0) {
-                BooleanQuery.Builder rewritten = new BooleanQuery.Builder();
-                for (BooleanClause booleanClause : result.clauses()) {
-                    if (isMatchAll(booleanClause.getQuery()) == false) {
-                        rewritten.add(booleanClause);
-                    }
-                }
-                return simplify(rewritten.build());
-            }
-            return result;
-        }
-
-
-        static boolean isMatchAll(Query q) {
-            return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
-        }
-
         protected void getNgramTokens(Set<String> tokens, String fragment) {
             if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
                 // If a regex is a form of match-all e.g. ".*" we only produce the token start/end markers as search
@@ -666,7 +607,7 @@ public class WildcardFieldMapper extends FieldMapper {
             if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
                 // there's something concrete to be searched but it's too short
                 // Require verification.
-                bqBuilder.add(new BooleanClause(new MatchAllButRequireVerificationQuery(), occur));
+                bqBuilder.add(new BooleanClause(new MatchAllDocsQuery(), occur));
                 return;
             }
             if (tokenSize == NGRAM_SIZE) {
@@ -723,11 +664,11 @@ public class WildcardFieldMapper extends FieldMapper {
 
                         if (tokenSize == NGRAM_SIZE) {
                             TermQuery tq = new TermQuery(new Term(name(), token));
-                            bqBuilder.add(new BooleanClause(tq, Occur.MUST));
+                            bqBuilder.add(new BooleanClause(tq, Occur.FILTER));
                         } else {
                             PrefixQuery wq = new PrefixQuery(new Term(name(), token));
                             wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
-                            bqBuilder.add(new BooleanClause(wq, Occur.MUST));
+                            bqBuilder.add(new BooleanClause(wq, Occur.FILTER));
                         }
                     }
                     BooleanQuery bq = bqBuilder.build();

+ 96 - 11
x-pack/plugin/wildcard/src/test/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapperTests.java

@@ -19,8 +19,10 @@ import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -34,6 +36,7 @@ import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.store.BaseDirectoryWrapper;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Automaton;
@@ -97,6 +100,8 @@ public class WildcardFieldMapperTests extends MapperTestCase {
     static WildcardFieldMapper wildcardFieldType;
     static WildcardFieldMapper wildcardFieldType79;
     static KeywordFieldMapper keywordFieldType;
+    private DirectoryReader rewriteReader;
+    private BaseDirectoryWrapper rewriteDir;
 
     @Override
     protected Collection<? extends Plugin> getPlugins() {
@@ -120,8 +125,38 @@ public class WildcardFieldMapperTests extends MapperTestCase {
 
         org.elasticsearch.index.mapper.KeywordFieldMapper.Builder kwBuilder = new KeywordFieldMapper.Builder(KEYWORD_FIELD_NAME);
         keywordFieldType = kwBuilder.build(MapperBuilderContext.ROOT);
+        
+        rewriteDir = newDirectory();
+        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER_7_10);
+        RandomIndexWriter iw = new RandomIndexWriter(random(), rewriteDir, iwc);
+
+        // Create a string that is too large and will not be indexed
+        String docContent = "a";
+        Document doc = new Document();
+        LuceneDocument parseDoc = new LuceneDocument();
+        addFields(parseDoc, doc, docContent);
+        indexDoc(parseDoc, doc, iw);
+
+        iw.forceMerge(1);
+        rewriteReader = iw.getReader();
+        iw.close();
+
         super.setUp();
     }
+    
+    
+
+    @Override
+    public void tearDown() throws Exception {
+        try {
+            rewriteReader.close();
+            rewriteDir.close();
+        } catch (Exception ignoreCloseFailure) {
+            // allow any superclass tear down logic to continue
+        }
+        // TODO Auto-generated method stub
+        super.tearDown();
+    }
 
     public void testTooBigKeywordField() throws IOException {
         Directory dir = newDirectory();
@@ -484,10 +519,10 @@ public class WildcardFieldMapperTests extends MapperTestCase {
 
     public void testRegexAcceleration() throws IOException, ParseException {
         // All these expressions should rewrite to a match all with no verification step required at all
-        String superfastRegexes[]= { ".*",  "...*..", "(foo|bar|.*)", "@"};
+        String superfastRegexes[]= { ".*", "(foo|bar|.*)", "@"};
         for (String regex : superfastRegexes) {
             Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT);
-            assertTrue(wildcardFieldQuery instanceof DocValuesFieldExistsQuery);
+            assertTrue(regex + "should have been accelerated", wildcardFieldQuery instanceof DocValuesFieldExistsQuery);
         }
         String matchNoDocsRegexes[]= { ""};
         for (String regex : matchNoDocsRegexes) {
@@ -518,12 +553,14 @@ public class WildcardFieldMapperTests extends MapperTestCase {
 
         // All these expressions should rewrite to just the verification query (there's no ngram acceleration)
         // TODO we can possibly improve on some of these
-        String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb"};
+        String matchAllButVerifyTests[]= { "..", "(a)?","(a|b){0,3}", "((foo)?|(foo|bar)?)", "@&~(abc.+)", "aaa.+&.+bbb", "a*",  "...*.."};
         for (String regex : matchAllButVerifyTests) {
             Query wildcardFieldQuery = wildcardFieldType.fieldType().regexpQuery(regex, RegExp.ALL, 0, 20000, null, MOCK_CONTEXT);
             BinaryDvConfirmedAutomatonQuery q = (BinaryDvConfirmedAutomatonQuery)wildcardFieldQuery;
+            Query approximationQuery = unwrapAnyBoost(q.getApproximationQuery());
+            approximationQuery = getSimplifiedApproximationQuery(q.getApproximationQuery());
             assertTrue(regex +" was not a pure verify query " +formatQuery(wildcardFieldQuery),
-                q.getApproximationQuery() instanceof MatchAllDocsQuery);
+                approximationQuery instanceof MatchAllDocsQuery);
         }
 
 
@@ -532,8 +569,8 @@ public class WildcardFieldMapperTests extends MapperTestCase {
         String suboptimalTests[][] = {
             // TODO short wildcards like a* OR b* aren't great so we just drop them.
             // Ideally we would attach to successors to create (acd OR bcd)
-            { "[ab]cd",  "+cc_ +c__"}
-            };
+            { "[ab]cd",  "+(+cc_ +c__) +*:*"}
+        };
         for (String[] test : suboptimalTests) {
             String regex = test[0];
             String expectedAccelerationQueryString = test[1].replaceAll("_", ""+WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
@@ -569,7 +606,7 @@ public class WildcardFieldMapperTests extends MapperTestCase {
             { "foo*bar", "+_eo +eoo +aaq +aq_ +q__" },
             { "foo?bar", "+_eo +eoo +aaq +aq_ +q__" },
             { "?foo*bar?", "+eoo +aaq" },
-            { "*c", "+c__" } };
+            { "*c", "c__" } };
         for (String[] test : tests) {
             String pattern = test[0];
             String expectedAccelerationQueryString = test[1].replaceAll("_", "" + WildcardFieldMapper.TOKEN_START_OR_END_CHAR);
@@ -714,7 +751,7 @@ public class WildcardFieldMapperTests extends MapperTestCase {
         };
         for (FuzzyTest test : tests) {
             Query wildcardFieldQuery = test.getFuzzyQuery();
-            testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, test.getExpectedApproxQuery());
+            testExpectedAccelerationQuery(test.pattern, wildcardFieldQuery, getSimplifiedApproximationQuery(test.getExpectedApproxQuery()));
         }
     }
 
@@ -765,7 +802,8 @@ public class WildcardFieldMapperTests extends MapperTestCase {
         }
     }
 
-    void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
+    void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException,
+        IOException {
 
         QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
         Query expectedAccelerationQuery = qsp.parse(expectedAccelerationQueryString);
@@ -780,16 +818,63 @@ public class WildcardFieldMapperTests extends MapperTestCase {
             return q;
         }
     }
+    private Query unwrapAnyBoost(Query q) {
+        if (q instanceof BoostQuery) {
+            BoostQuery csq = (BoostQuery) q;
+            return csq.getQuery();
+        } else {
+            return q;
+        }
+    }    
+    
 
-    void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException {
+    void testExpectedAccelerationQuery(String regex, Query combinedQuery, Query expectedAccelerationQuery) throws ParseException,
+        IOException {
         BinaryDvConfirmedAutomatonQuery cq = (BinaryDvConfirmedAutomatonQuery) unwrapAnyConstantScore(combinedQuery);
         Query approximationQuery = cq.getApproximationQuery();
-
+        approximationQuery = getSimplifiedApproximationQuery(approximationQuery);
         String message = "regex: "+ regex +"\nactual query: " + formatQuery(approximationQuery) +
             "\nexpected query: " + formatQuery(expectedAccelerationQuery) + "\n";
         assertEquals(message, expectedAccelerationQuery, approximationQuery);
     }
 
+    // For comparison purposes rewrite and unwrap various superfluous parts to get to raw logic
+    protected Query getSimplifiedApproximationQuery(Query approximationQuery) throws IOException {
+        int numRewrites = 0;
+        int maxNumRewrites = 100;
+        for (; numRewrites < maxNumRewrites; numRewrites++) {
+            Query newApprox = approximationQuery.rewrite(rewriteReader);
+            if (newApprox == approximationQuery) {
+                break;
+            }
+            approximationQuery = newApprox;
+            
+        }
+        assertTrue(numRewrites < maxNumRewrites);
+        approximationQuery = rewriteFiltersToMustsForComparisonPurposes(approximationQuery);
+        return approximationQuery;
+    }
+
+    private Query rewriteFiltersToMustsForComparisonPurposes(Query q) {
+        q = unwrapAnyBoost(q);
+        q = unwrapAnyConstantScore(q);
+        if (q instanceof BooleanQuery) {
+            BooleanQuery.Builder result = new BooleanQuery.Builder();
+            BooleanQuery bq = (BooleanQuery)q;
+            for (BooleanClause cq : bq.clauses()){
+                Query rewritten = rewriteFiltersToMustsForComparisonPurposes(cq.getQuery());
+                if(cq.getOccur() == Occur.FILTER) {
+                    result.add(rewritten, Occur.MUST);
+                } else {
+                    result.add(rewritten, cq.getOccur());
+                }
+            }
+            return result.build();
+        }
+
+        return q;
+    }
+
     private String getRandomFuzzyPattern(HashSet<String> values, int edits, int prefixLength) {
         assert edits >=0 && edits <=2;
         // Pick one of the indexed document values to focus our queries on.