Browse Source

Make Glob non-recursive (#132798) (#133109)

This changes the implementation of `Glob` (used by `FilterPath`)
to use a non-recursive algorithm for improved efficiency and
stability
Tim Vernum 1 month ago
parent
commit
0f0c597942

+ 69 - 19
libs/core/src/main/java/org/elasticsearch/core/Glob.java

@@ -29,34 +29,84 @@ public class Glob {
         if (pattern == null || str == null) {
             return false;
         }
-        int firstIndex = pattern.indexOf('*');
-        if (firstIndex == -1) {
+
+        int patternIndex = pattern.indexOf('*');
+        if (patternIndex == -1) {
+            // Nothing to glob
             return pattern.equals(str);
         }
-        if (firstIndex == 0) {
+
+        if (patternIndex == 0) {
+            // If the pattern is a literal '*' then it matches any input
             if (pattern.length() == 1) {
                 return true;
             }
-            int nextIndex = pattern.indexOf('*', firstIndex + 1);
-            if (nextIndex == -1) {
-                return str.endsWith(pattern.substring(1));
-            } else if (nextIndex == 1) {
-                // Double wildcard "**" - skipping the first "*"
-                return globMatch(pattern.substring(1), str);
+        } else {
+            if (str.regionMatches(0, pattern, 0, patternIndex) == false) {
+                // If the pattern starts with a literal (i.e. not '*') then the input string must also start with that
+                return false;
             }
-            String part = pattern.substring(1, nextIndex);
-            int partIndex = str.indexOf(part);
-            while (partIndex != -1) {
-                if (globMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
-                    return true;
+            if (patternIndex == pattern.length() - 1) {
+                // The pattern is "something*", so if the starting region matches, then the whole pattern matches
+                return true;
+            }
+        }
+
+        int strIndex = patternIndex;
+        while (strIndex < str.length()) {
+            assert pattern.charAt(patternIndex) == '*' : "Expected * at index " + patternIndex + " of [" + pattern + "]";
+
+            // skip over the '*'
+            patternIndex++;
+
+            if (patternIndex == pattern.length()) {
+                // The pattern ends in '*' (that is, "something*" or "*some*thing*", etc)
+                // Since we already matched everything up to the '*' we know the string matches (whatever is left over must match '*')
+                // so we're automatically done
+                return true;
+            }
+
+            // Look for the next '*'
+            int nextStar = pattern.indexOf('*', patternIndex);
+            while (nextStar == patternIndex) {
+                // Two (or more) stars in sequence, just skip the subsequent ones
+                patternIndex++;
+                nextStar = pattern.indexOf('*', patternIndex);
+            }
+            if (nextStar == -1) {
+                // We've come to the last '*' in a pattern (.e.g the 2nd one in "*some*thing")
+                // In this case we match if the input string ends in "thing" (but constrained by the current position)
+                final int len = pattern.length() - patternIndex;
+                final int strSuffixStart = str.length() - len;
+                if (strSuffixStart < strIndex) {
+                    // The suffix would start before the current position. That means it's not a match
+                    // e.g. "abc" is not a match for "ab*bc" even though "abc" does end with "bc"
+                    return false;
+                }
+                return str.regionMatches(strSuffixStart, pattern, patternIndex, len);
+            } else {
+                // There is another star, with a literal in between the current position and that '*'
+                // That is, we have "*literal*"
+                // We want the first '*' to consume everything up until the first occurrence of "literal" in the input string
+                int match = str.indexOf(pattern.substring(patternIndex, nextStar), strIndex);
+                if (match == -1) {
+                    // If "literal" isn't there, then the match fails.
+                    return false;
                 }
-                partIndex = str.indexOf(part, partIndex + 1);
+                // Move both index (pointer) values to the end of the literal
+                strIndex = match + (nextStar - patternIndex);
+                patternIndex = nextStar;
             }
-            return false;
         }
-        return (str.length() >= firstIndex
-            && pattern.substring(0, firstIndex).equals(str.substring(0, firstIndex))
-            && globMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
+
+        // We might have trailing '*'s in the pattern after completing a literal match at the end of the input string
+        // e.g. a glob of "el*ic*" matching "elastic" - we need to consume that last '*' without it matching anything
+        while (patternIndex < pattern.length() && pattern.charAt(patternIndex) == '*') {
+            patternIndex++;
+        }
+
+        // The match is successful only if we have consumed the entire pattern.
+        return patternIndex == pattern.length();
     }
 
 }

+ 199 - 0
libs/core/src/test/java/org/elasticsearch/core/GlobTests.java

@@ -0,0 +1,199 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.core;
+
+import org.elasticsearch.test.ESTestCase;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
+
+public class GlobTests extends ESTestCase {
+
+    public void testMatchNull() {
+        assertThat(Glob.globMatch(null, null), is(false));
+        assertThat(Glob.globMatch(randomAlphaOfLengthBetween(1, 10), null), is(false));
+        assertThat(Glob.globMatch(null, randomAlphaOfLengthBetween(1, 10)), is(false));
+    }
+
+    public void testMatchLiteral() {
+        assertMatch("", "");
+        var str = randomAlphaOfLengthBetween(1, 12);
+        assertMatch(str, str);
+
+        str = randomAlphanumericOfLength(randomIntBetween(1, 12));
+        assertMatch(str, str);
+
+        str = randomAsciiString(randomIntBetween(1, 24), ch -> ch >= ' ' && ch <= '~' && ch != '*');
+        assertMatch(str, str);
+    }
+
+    public void testSingleAsterisk() {
+        assertMatch("*", "");
+        assertMatch("*", randomAlphaOfLengthBetween(1, 12));
+        assertMatch("*", randomAlphanumericOfLength(randomIntBetween(1, 12)));
+        assertMatch("*", randomAsciiString(randomIntBetween(1, 24), ch -> ch >= ' ' && ch <= '~'));
+        assertMatch("*", "*".repeat(randomIntBetween(1, 5)));
+    }
+
+    public void testMultipleConsecutiveAsterisk() {
+        var pattern = "*".repeat(randomIntBetween(2, 5));
+
+        assertMatch(pattern, "");
+        assertMatch(pattern, randomAlphaOfLengthBetween(1, 12));
+        assertMatch(pattern, randomAlphanumericOfLength(randomIntBetween(1, 12)));
+        assertMatch(pattern, randomAsciiString(randomIntBetween(1, 24)));
+        assertMatch(pattern, "*".repeat(randomIntBetween(1, 5)));
+    }
+
+    public void testPrefixMatch() {
+        assertMatch("123*", "123");
+        assertMatch("123*", "123abc");
+        assertMatch("123*", "123123123");
+        assertNonMatch("123*", "12");
+        assertNonMatch("123*", "124");
+        assertNonMatch("123*", "23");
+        assertNonMatch("123*", "23x");
+        assertNonMatch("123*", "x23");
+        assertNonMatch("123*", "12*");
+        assertNonMatch("123*", "12-3");
+        assertNonMatch("123*", "1.2.3");
+        assertNonMatch("123*", "abc123");
+        assertNonMatch("123*", "abc123def");
+
+        var prefix = randomAsciiString(randomIntBetween(2, 12));
+        var pattern = prefix + "*";
+        assertMatch(pattern, prefix);
+        assertMatch(pattern, prefix + randomAsciiString(randomIntBetween(1, 30)));
+        assertNonMatch(
+            pattern,
+            randomValueOtherThanMany(s -> s.charAt(0) == prefix.charAt(0), () -> randomAsciiString(randomIntBetween(1, 30))) + prefix
+        );
+        assertNonMatch(pattern, prefix.substring(0, prefix.length() - 1));
+        assertNonMatch(pattern, prefix.substring(1));
+    }
+
+    public void testSuffixMatch() {
+        assertMatch("*123", "123");
+        assertMatch("*123", "abc123");
+        assertMatch("*123", "123123123");
+        assertNonMatch("*123", "12");
+        assertNonMatch("*123", "x12");
+        assertNonMatch("*123", "23");
+        assertNonMatch("*123", "x23");
+        assertNonMatch("*123", "12*");
+        assertNonMatch("*123", "1.2.3");
+        assertNonMatch("*123", "123abc");
+        assertNonMatch("*123", "abc123def");
+
+        var suffix = randomAsciiString(randomIntBetween(2, 12));
+        var pattern = "*" + suffix;
+        assertMatch(pattern, suffix);
+        assertMatch(pattern, randomAsciiString(randomIntBetween(1, 30)) + suffix);
+        assertNonMatch(pattern, suffix + "#" + randomValueOtherThan(suffix, () -> randomAsciiString(randomIntBetween(1, 30))));
+        assertNonMatch(pattern, suffix.substring(0, suffix.length() - 1));
+        assertNonMatch(pattern, suffix.substring(1));
+    }
+
+    public void testInfixStringMatch() {
+        assertMatch("*123*", "abc123def");
+        assertMatch("*123*", "abc123");
+        assertMatch("*123*", "123def");
+        assertMatch("*123*", "123");
+        assertMatch("*123*", "123123123");
+        assertMatch("*123*", "1.12.123.1234");
+        assertNonMatch("*123*", "12");
+        assertNonMatch("*123*", "23");
+        assertNonMatch("*123*", "x23");
+        assertNonMatch("*123*", "12*");
+        assertNonMatch("*123*", "1.2.3");
+
+        var infix = randomAsciiString(randomIntBetween(2, 12));
+        var pattern = "*" + infix + "*";
+        assertMatch(pattern, infix);
+        assertMatch(pattern, randomAsciiString(randomIntBetween(1, 30)) + infix + randomAsciiString(randomIntBetween(1, 30)));
+        assertMatch(pattern, randomAsciiString(randomIntBetween(1, 30)) + infix);
+        assertMatch(pattern, infix + randomAsciiString(randomIntBetween(1, 30)));
+        assertNonMatch(pattern, infix.substring(0, infix.length() - 1));
+        assertNonMatch(pattern, infix.substring(1));
+    }
+
+    public void testInfixAsteriskMatch() {
+        assertMatch("abc*xyz", "abcxyz");
+        assertMatch("abc*xyz", "abc#xyz");
+        assertMatch("abc*xyz", "abc*xyz");
+        assertMatch("abc*xyz", "abcdefghijklmnopqrstuvwxyz");
+        assertNonMatch("abc*xyz", "ABC.xyz");
+        assertNonMatch("abc*xyz", "RabcSxyzT");
+        assertNonMatch("abc*xyz", "RabcSxyz");
+        assertNonMatch("abc*xyz", "abcSxyzT");
+
+        assertMatch("123*321", "123321");
+        assertMatch("123*321", "12345678987654321");
+        assertNonMatch("123*321", "12321");
+
+        var prefix = randomAsciiString(randomIntBetween(2, 12));
+        var suffix = randomAsciiString(randomIntBetween(2, 12));
+        var pattern = prefix + "*" + suffix;
+        assertMatch(pattern, prefix + suffix);
+        assertMatch(pattern, prefix + randomAsciiString(randomIntBetween(1, 30)) + suffix);
+        assertNonMatch(pattern, prefix.substring(0, prefix.length() - 1) + suffix);
+        assertNonMatch(pattern, prefix + suffix.substring(1));
+    }
+
+    public void testLiteralSubstringMatching() {
+        assertMatch("start*middle*end", "startmiddleend");
+        assertMatch("start*middle*end", "start.middle.end");
+        assertMatch("start*middle*end", "start.middlX.middle.end");
+        assertMatch("start*middle*end", "start.middlmiddle.end");
+        assertMatch("start*middle*end", "start.middle.eend");
+        assertMatch("start*middle*end", "start.middle.enend");
+        assertMatch("start*middle*end", "start.middle.endend");
+
+        assertNonMatch("start*middle*end", "startmiddlend");
+        assertNonMatch("start*middle*end", "start.end");
+        assertNonMatch("start*middle*end", "start+MIDDLE+end");
+        assertNonMatch("start*middle*end", "start+mid+dle+end");
+        assertNonMatch("start*middle*end", "start+mid+middle+en");
+    }
+
+    private static void assertMatch(String pattern, String str) {
+        assertThat("Expect [" + str + "] to match '" + pattern + "'", Glob.globMatch(pattern, str), is(true));
+    }
+
+    private static void assertNonMatch(String pattern, String str) {
+        assertThat("Expect [" + str + "] to not match '" + pattern + "'", Glob.globMatch(pattern, str), is(false));
+    }
+
+    @FunctionalInterface
+    interface CharPredicate {
+        boolean test(char c);
+    }
+
+    private String randomAsciiString(int length) {
+        return randomAsciiString(length, ch -> ch >= ' ' && ch <= '~');
+    }
+
+    private String randomAsciiString(int length, CharPredicate validCharacters) {
+        StringBuilder str = new StringBuilder(length);
+        nextChar: for (int i = 0; i < length; i++) {
+            for (int attempts = 0; attempts < 200; attempts++) {
+                char ch = (char) randomIntBetween(0x1, 0x7f);
+                if (validCharacters.test(ch)) {
+                    str.append(ch);
+                    continue nextChar;
+                }
+            }
+            throw new IllegalStateException("Cannot find valid character for string");
+        }
+        assertThat(str.length(), equalTo(length));
+        return str.toString();
+    }
+
+}