Explorar el Código

Avoid unnecessary determinization in index pattern conflict checks (#128362)

Starting with Lucene 10, `CharacterRunAutomaton` is no longer determinized automatically.
In Elasticsearch 9, we adapted to this by eagerly determinizing automatons (via `Regex#simpleMatchToAutomaton`).
However, this introduced a regression: operations like index template conflict checks, which only require intersection testing, now pay the cost of determinization—an expensive step that wasn’t needed before. In some cases, especially when many wildcard patterns are involved, determinization can even fail due to state explosion.

This change removes the unnecessary determinization, restoring the pre-9.0 behavior and allowing valid index templates with many patterns to be registered again.
Jim Ferenczi hace 4 meses
padre
commit
83126135fa

+ 5 - 0
docs/changelog/128362.yaml

@@ -0,0 +1,5 @@
+pr: 128362
+summary: Avoid unnecessary determinization in index pattern conflict checks
+area: Indices APIs
+type: bug
+issues: []

+ 5 - 2
server/src/main/java/org/elasticsearch/cluster/metadata/MetadataIndexTemplateService.java

@@ -946,12 +946,15 @@ public class MetadataIndexTemplateService {
         final String candidateName,
         final List<String> indexPatterns
     ) {
-        Automaton v2automaton = Regex.simpleMatchToAutomaton(indexPatterns.toArray(Strings.EMPTY_ARRAY));
+        // No need to determinize the automaton, as it is only used to check for intersection with another automaton.
+        // Determinization is avoided because it can fail or become very costly due to state explosion.
+        Automaton v2automaton = Regex.simpleMatchToNonDeterminizedAutomaton(indexPatterns.toArray(Strings.EMPTY_ARRAY));
         Map<String, List<String>> overlappingTemplates = new HashMap<>();
         for (Map.Entry<String, IndexTemplateMetadata> cursor : project.templates().entrySet()) {
             String name = cursor.getKey();
             IndexTemplateMetadata template = cursor.getValue();
-            Automaton v1automaton = Regex.simpleMatchToAutomaton(template.patterns().toArray(Strings.EMPTY_ARRAY));
+            // No need to determinize the automaton, as it is only used to check for intersection with another automaton.
+            Automaton v1automaton = Regex.simpleMatchToNonDeterminizedAutomaton(template.patterns().toArray(Strings.EMPTY_ARRAY));
             if (Operations.isEmpty(Operations.intersection(v2automaton, v1automaton)) == false) {
                 logger.debug(
                     "composable template {} and legacy template {} would overlap: {} <=> {}",

+ 32 - 7
server/src/main/java/org/elasticsearch/common/regex/Regex.java

@@ -59,8 +59,15 @@ public class Regex {
         return isSuffixMatchPattern(str) && str.endsWith(".*");
     }
 
-    /** Return an {@link Automaton} that matches the given pattern. */
-    public static Automaton simpleMatchToAutomaton(String pattern) {
+    /**
+     * Return a non-determinized {@link Automaton} that matches the given pattern.
+     * WARNING: Use this method only when the resulting {@link Automaton} is used in contexts
+     * that do not require determinism (e.g., checking the intersection of automatons).
+     *
+     * For pattern matching with {@link CharacterRunAutomaton}, a deterministic automaton is required.
+     * In that case, use {@link Regex#simpleMatchToAutomaton} instead.
+     */
+    public static Automaton simpleMatchToNonDeterminizedAutomaton(String pattern) {
         List<Automaton> automata = new ArrayList<>();
         int previous = 0;
         for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) {
@@ -69,13 +76,24 @@ public class Regex {
             previous = i + 1;
         }
         automata.add(Automata.makeString(pattern.substring(previous)));
-        return Operations.determinize(Operations.concatenate(automata), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+        return Operations.concatenate(automata);
+    }
+
+    /** Return a deterministic {@link Automaton} that matches the given pattern. */
+    public static Automaton simpleMatchToAutomaton(String pattern) {
+        return Operations.determinize(simpleMatchToNonDeterminizedAutomaton(pattern), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
     }
 
     /**
-     * Return an Automaton that matches the union of the provided patterns.
+     * Return a non-determinized {@link Automaton} that matches the union of the given patterns.
+     *
+     * WARNING: Use this method only when the resulting {@link Automaton} is used in contexts
+     * that do not require determinism (e.g., checking the intersection of automatons).
+     *
+     * For pattern matching with {@link CharacterRunAutomaton}, a deterministic automaton is required.
+     * In that case, use {@link Regex#simpleMatchToAutomaton} instead.
      */
-    public static Automaton simpleMatchToAutomaton(String... patterns) {
+    public static Automaton simpleMatchToNonDeterminizedAutomaton(String... patterns) {
         if (patterns.length < 1) {
             throw new IllegalArgumentException("There must be at least one pattern, zero given");
         }
@@ -88,7 +106,7 @@ public class Regex {
             if (isSuffixWildcard(pattern) && pattern.length() < 1000) {
                 prefixes.add(new BytesRef(pattern.substring(0, pattern.length() - 1)));
             } else if (isSimpleMatchPattern(pattern) || pattern.length() >= 1000) {
-                automata.add(simpleMatchToAutomaton(pattern));
+                automata.add(simpleMatchToNonDeterminizedAutomaton(pattern));
             } else {
                 simpleStrings.add(new BytesRef(pattern));
             }
@@ -113,7 +131,14 @@ public class Regex {
             prefixAutomaton.add(Automata.makeAnyString());
             automata.add(Operations.concatenate(prefixAutomaton));
         }
-        return Operations.determinize(Operations.union(automata), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
+        return Operations.union(automata);
+    }
+
+    /**
+     * Return a deterministic {@link Automaton} that matches the union of the provided patterns.
+     */
+    public static Automaton simpleMatchToAutomaton(String... patterns) {
+        return Operations.determinize(simpleMatchToNonDeterminizedAutomaton(patterns), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
     }
 
     /**

+ 18 - 0
server/src/test/java/org/elasticsearch/common/regex/RegexTests.java

@@ -10,9 +10,12 @@ package org.elasticsearch.common.regex;
 
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
 import org.elasticsearch.test.ESTestCase;
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Locale;
 import java.util.Random;
 import java.util.function.Predicate;
@@ -20,6 +23,7 @@ import java.util.regex.Pattern;
 
 import static org.elasticsearch.test.LambdaMatchers.falseWith;
 import static org.elasticsearch.test.LambdaMatchers.trueWith;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 
 public class RegexTests extends ESTestCase {
@@ -250,4 +254,18 @@ public class RegexTests extends ESTestCase {
             assertTrue(predicate.test(patterns[i]));
         }
     }
+
+    public void testIntersectNonDeterminizedAutomaton() {
+        // patterns too complex to determinize within the default limit
+        String[] patterns = randomArray(20, 100, size -> new String[size], () -> "*" + randomAlphanumericOfLength(10) + "*");
+        Automaton a = Regex.simpleMatchToNonDeterminizedAutomaton(patterns);
+        assertFalse(a.isDeterministic());
+        Automaton b = Regex.simpleMatchToNonDeterminizedAutomaton(Arrays.copyOfRange(patterns, patterns.length / 2, patterns.length));
+        assertFalse(b.isDeterministic());
+        assertFalse(Operations.isEmpty(Operations.intersection(a, b)));
+        IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> assertMatchesAll(a, "my_test"));
+        // the run automaton expects a deterministic automaton
+        assertThat(exc.getMessage(), containsString("deterministic"));
+        expectThrows(TooComplexToDeterminizeException.class, () -> Regex.simpleMatchToAutomaton(patterns));
+    }
 }