Browse Source

[ES|QL] Simplify patterns for subfields (#111118)

* simplify pattern for subfields
Fang Xing 1 year ago
parent
commit
c1dcc6e5c8

+ 5 - 0
docs/changelog/111118.yaml

@@ -0,0 +1,5 @@
+pr: 111118
+summary: "[ES|QL] Simplify patterns for subfields"
+area: ES|QL
+type: bug
+issues: []

+ 7 - 1
server/src/main/java/org/elasticsearch/action/fieldcaps/TransportFieldCapabilitiesAction.java

@@ -9,6 +9,7 @@
 package org.elasticsearch.action.fieldcaps;
 
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.ActionRunnable;
@@ -557,7 +558,12 @@ public class TransportFieldCapabilitiesAction extends HandledTransportAction<Fie
                     .stream()
                     .collect(Collectors.groupingBy(ShardId::getIndexName));
                 final FieldCapabilitiesFetcher fetcher = new FieldCapabilitiesFetcher(indicesService, request.includeEmptyFields());
-                final Predicate<String> fieldNameFilter = Regex.simpleMatcher(request.fields());
+                Predicate<String> fieldNameFilter;
+                try {
+                    fieldNameFilter = Regex.simpleMatcher(request.fields());
+                } catch (TooComplexToDeterminizeException e) {
+                    throw new IllegalArgumentException("The field names are too complex to process. " + e.getMessage());
+                }
                 for (List<ShardId> shardIds : groupedShardIds.values()) {
                     final Map<ShardId, Exception> failures = new HashMap<>();
                     final Set<ShardId> unmatched = new HashSet<>();

+ 22 - 2
server/src/main/java/org/elasticsearch/common/regex/Regex.java

@@ -44,10 +44,20 @@ public class Regex {
         return str.equals("*");
     }
 
+    /**
+     * Returns true if the str ends with "*".
+     */
     public static boolean isSuffixMatchPattern(String str) {
         return str.length() > 1 && str.indexOf('*') == str.length() - 1;
     }
 
+    /**
+     * Returns true if the str ends with ".*".
+     */
+    public static boolean isSuffixWildcard(String str) {
+        return isSuffixMatchPattern(str) && str.endsWith(".*");
+    }
+
     /** Return an {@link Automaton} that matches the given pattern. */
     public static Automaton simpleMatchToAutomaton(String pattern) {
         List<Automaton> automata = new ArrayList<>();
@@ -71,9 +81,12 @@ public class Regex {
 
         List<BytesRef> simpleStrings = new ArrayList<>();
         List<Automaton> automata = new ArrayList<>();
+        List<BytesRef> prefixes = new ArrayList<>();
         for (String pattern : patterns) {
             // Strings longer than 1000 characters aren't supported by makeStringUnion
-            if (isSimpleMatchPattern(pattern) || pattern.length() >= 1000) {
+            if (isSuffixWildcard(pattern) && pattern.length() < 1000) {
+                prefixes.add(new BytesRef(pattern.substring(0, pattern.length() - 1)));
+            } else if (isSimpleMatchPattern(pattern) || pattern.length() >= 1000) {
                 automata.add(simpleMatchToAutomaton(pattern));
             } else {
                 simpleStrings.add(new BytesRef(pattern));
@@ -87,11 +100,18 @@ public class Regex {
             } else {
                 simpleStringsAutomaton = Automata.makeString(simpleStrings.get(0).utf8ToString());
             }
-            if (automata.isEmpty()) {
+            if (automata.isEmpty() && prefixes.isEmpty()) {
                 return simpleStringsAutomaton;
             }
             automata.add(simpleStringsAutomaton);
         }
+        if (false == prefixes.isEmpty()) {
+            List<Automaton> prefixAutomaton = new ArrayList<>();
+            Collections.sort(prefixes);
+            prefixAutomaton.add(Automata.makeStringUnion(prefixes));
+            prefixAutomaton.add(Automata.makeAnyString());
+            automata.add(Operations.concatenate(prefixAutomaton));
+        }
         return Operations.union(automata);
     }
 

+ 13 - 0
server/src/test/java/org/elasticsearch/common/regex/RegexTests.java

@@ -14,6 +14,7 @@ import org.elasticsearch.test.ESTestCase;
 import java.io.IOException;
 import java.util.Locale;
 import java.util.Random;
+import java.util.function.Predicate;
 import java.util.regex.Pattern;
 
 import static org.elasticsearch.test.LambdaMatchers.falseWith;
@@ -236,4 +237,16 @@ public class RegexTests extends ESTestCase {
         assertThat(Regex.simpleMatcher("a*c", "x*z"), falseWith("abd"));
         assertThat(Regex.simpleMatcher("a*c", "x*z"), falseWith("xyy"));
     }
+
+    public void testThousandsAndLongPattern() throws IOException {
+        String[] patterns = new String[10000];
+        for (int i = 0; i < patterns.length / 2; i++) {
+            patterns[i * 2] = randomAlphaOfLength(10);
+            patterns[i * 2 + 1] = patterns[i * 2] + ".*";
+        }
+        Predicate<String> predicate = Regex.simpleMatcher(patterns);
+        for (int i = 0; i < patterns.length / 2; i++) {
+            assertTrue(predicate.test(patterns[i]));
+        }
+    }
 }

+ 26 - 0
x-pack/plugin/esql/qa/server/src/main/java/org/elasticsearch/xpack/esql/qa/rest/RestEsqlTestCase.java

@@ -567,6 +567,32 @@ public abstract class RestEsqlTestCase extends ESRestTestCase {
         assertThat(EntityUtils.toString(re.getResponse().getEntity()), containsString("n1=[5, 6, 7] is not supported as a parameter"));
     }
 
+    public void testComplexFieldNames() throws IOException {
+        bulkLoadTestData(1);
+        // catch verification exception, field names not found
+        int fieldNumber = 5000;
+        String q1 = fromIndex() + queryWithComplexFieldNames(fieldNumber);
+        ResponseException e = expectThrows(ResponseException.class, () -> runEsql(requestObjectBuilder().query(q1)));
+        assertEquals(400, e.getResponse().getStatusLine().getStatusCode());
+        assertThat(e.getMessage(), containsString("verification_exception"));
+
+        // catch automaton's TooComplexToDeterminizeException
+        fieldNumber = 6000;
+        final String q2 = fromIndex() + queryWithComplexFieldNames(fieldNumber);
+        e = expectThrows(ResponseException.class, () -> runEsql(requestObjectBuilder().query(q2)));
+        assertEquals(400, e.getResponse().getStatusLine().getStatusCode());
+        assertThat(e.getMessage(), containsString("The field names are too complex to process"));
+    }
+
+    private static String queryWithComplexFieldNames(int field) {
+        StringBuilder query = new StringBuilder();
+        query.append(" | keep ").append(randomAlphaOfLength(10)).append(1);
+        for (int i = 2; i <= field; i++) {
+            query.append(", ").append(randomAlphaOfLength(10)).append(i);
+        }
+        return query.toString();
+    }
+
     private static String expectedTextBody(String format, int count, @Nullable Character csvDelimiter) {
         StringBuilder sb = new StringBuilder();
         switch (format) {