Browse Source

[ML] address two edge cases for categorization.GrokPatternCreator#findBestGrokMatchFromExamples (#51168)

There are two edge cases that can be ran into when example input is matched in a weird way.

1. Recursion depth could continue many many times, resulting in a HUGE runtime cost. I put a limit of 10 recursions (could be adjusted I suppose). 
2. If there are no "fixed regex bits", exploring the grok space would result in a fence-post error during runtime (with assertions turned off)
Benjamin Trent 5 years ago
parent
commit
79eb5aa4f8

+ 51 - 14
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java

@@ -6,6 +6,7 @@
 package org.elasticsearch.xpack.ml.job.categorization;
 
 import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
 import org.elasticsearch.grok.Grok;
 
 import java.util.ArrayList;
@@ -25,8 +26,11 @@ import java.util.regex.Pattern;
  */
 public final class GrokPatternCreator {
 
+    private static final Logger logger = LogManager.getLogger(GrokPatternCreator.class);
+
     private static final String PREFACE = "preface";
     private static final String EPILOGUE = "epilogue";
+    private static final int MAX_RECURSE_DEPTH = 10;
 
     /**
      * The first match in this list will be chosen, so it needs to be ordered
@@ -89,6 +93,11 @@ public final class GrokPatternCreator {
         // E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ]
         String[] fixedRegexBits = regex.split("\\.[*+]\\??");
 
+        // If there are no fixed regex bits, that implies there would only be a single capture group
+        // And that would mean a pretty uninteresting grok pattern
+        if (fixedRegexBits.length == 0) {
+            return regex;
+        }
         // Create a pattern that will capture the bits in between the fixed parts of the regex
         //
         // E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*)
@@ -112,8 +121,7 @@ public final class GrokPatternCreator {
                 // We should never get here.  If we do it implies a bug in the original categorization,
                 // as it's produced a regex that doesn't match the examples.
                 assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
-                LogManager.getLogger(GrokPatternCreator.class).error("[{}] Pattern [{}] did not match example [{}]", jobId,
-                        exampleProcessor.pattern(), example);
+                logger.error("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example);
             }
         }
 
@@ -125,19 +133,19 @@ public final class GrokPatternCreator {
             // Remember (from the first comment in this method) that the first element in this array is
             // always the empty string
             overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]);
-            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
+            appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0,
                     inBetweenBitNum == fixedRegexBits.length - 1, groupsMatchesFromExamples.get(inBetweenBitNum));
         }
         return overallGrokPatternBuilder.toString();
     }
 
-    /**
-     * Given a collection of strings, work out which (if any) of the grok patterns we're allowed
-     * to use matches it best.  Then append the appropriate grok language to represent that finding
-     * onto the supplied string builder.
-     */
-    static void appendBestGrokMatchForStrings(Map<String, Integer> fieldNameCountStore, StringBuilder overallGrokPatternBuilder,
-                                              boolean isFirst, boolean isLast, Collection<String> mustMatchStrings) {
+    private static void appendBestGrokMatchForStrings(String jobId,
+                                                      Map<String, Integer> fieldNameCountStore,
+                                                      StringBuilder overallGrokPatternBuilder,
+                                                      boolean isFirst,
+                                                      boolean isLast,
+                                                      Collection<String> mustMatchStrings,
+                                                      int numRecurse) {
 
         GrokPatternCandidate bestCandidate = null;
         if (mustMatchStrings.isEmpty() == false) {
@@ -149,7 +157,10 @@ public final class GrokPatternCreator {
             }
         }
 
-        if (bestCandidate == null) {
+        if (bestCandidate == null || numRecurse >= MAX_RECURSE_DEPTH) {
+            if (bestCandidate != null) {
+                logger.warn("[{}] exited grok discovery early, reached max depth [{}]", jobId, MAX_RECURSE_DEPTH);
+            }
             if (isLast) {
                 overallGrokPatternBuilder.append(".*");
             } else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) {
@@ -161,13 +172,39 @@ public final class GrokPatternCreator {
             Collection<String> prefaces = new ArrayList<>();
             Collection<String> epilogues = new ArrayList<>();
             populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues);
-            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces);
+            appendBestGrokMatchForStrings(jobId,
+                fieldNameCountStore,
+                overallGrokPatternBuilder,
+                isFirst,
+                false,
+                prefaces,
+                numRecurse + 1);
             overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':')
-                    .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
-            appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues);
+                .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}');
+            appendBestGrokMatchForStrings(jobId,
+                fieldNameCountStore,
+                overallGrokPatternBuilder,
+                false, isLast,
+                epilogues,
+                numRecurse + 1);
         }
     }
 
+
+    /**
+     * Given a collection of strings, work out which (if any) of the grok patterns we're allowed
+     * to use matches it best.  Then append the appropriate grok language to represent that finding
+     * onto the supplied string builder.
+     */
+    static void appendBestGrokMatchForStrings(String jobId,
+                                              Map<String, Integer> fieldNameCountStore,
+                                              StringBuilder overallGrokPatternBuilder,
+                                              boolean isFirst,
+                                              boolean isLast,
+                                              Collection<String> mustMatchStrings) {
+        appendBestGrokMatchForStrings(jobId, fieldNameCountStore, overallGrokPatternBuilder, isFirst, isLast, mustMatchStrings, 0);
+    }
+
     /**
      * Given a collection of strings, and a grok pattern that matches some part of them all,
      * return collections of the bits that come before (prefaces) and after (epilogues) the

+ 91 - 10
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java

@@ -14,6 +14,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.Map;
 
+import static org.hamcrest.CoreMatchers.equalTo;
 import static org.hamcrest.Matchers.containsInAnyOrder;
 
 public class GrokPatternCreatorTests extends ESTestCase {
@@ -71,7 +72,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
     }
@@ -87,7 +89,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
     }
@@ -105,7 +108,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".+?", overallGrokPatternBuilder.toString());
     }
@@ -120,7 +124,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".+?%{NUMBER:field}.+?", overallGrokPatternBuilder.toString());
     }
@@ -134,7 +139,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         // It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers
         assertEquals(".+?", overallGrokPatternBuilder.toString());
@@ -150,7 +156,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".*?%{BASE16NUM:field}.*?", overallGrokPatternBuilder.toString());
     }
@@ -163,7 +170,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         // We don't want the .1. in the middle to get detected as a hex number
         assertEquals(".+?", overallGrokPatternBuilder.toString());
@@ -178,7 +186,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".*?%{EMAILADDRESS:email}.*?", overallGrokPatternBuilder.toString());
     }
@@ -192,7 +201,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".*?%{URI:uri}.*?", overallGrokPatternBuilder.toString());
     }
@@ -206,7 +216,8 @@ public class GrokPatternCreatorTests extends ESTestCase {
         Map<String, Integer> fieldNameCountStore = new HashMap<>();
         StringBuilder overallGrokPatternBuilder = new StringBuilder();
 
-        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
+        GrokPatternCreator.appendBestGrokMatchForStrings("foo", fieldNameCountStore, overallGrokPatternBuilder, false,
+            false, mustMatchStrings);
 
         assertEquals(".+?%{PATH:path}.*?", overallGrokPatternBuilder.toString());
     }
@@ -263,4 +274,74 @@ public class GrokPatternCreatorTests extends ESTestCase {
                 "%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*",
                 GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
     }
+
+    public void testFindBestGrokMatchFromExamplesGivenAdversarialInputRecurseDepth() {
+        String regex = ".*?combo.+?rpc\\.statd.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.*";
+        // Two timestamps: one local, one UTC
+        Collection<String> examples = Arrays.asList(
+            "combo rpc.statd[1605]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
+                "\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...",
+            "combo rpc.statd[1608]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
+                "\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...",
+            "combo rpc.statd[1635]: gethostbyname error for ^X^X^Z^Z%8x%8x%8x%8x%8x%8x%8x%8x%8x%62716x%hn%51859x%hn" +
+                "\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220\\220" +
+                "\\220\\220\\220\\220\\220\\220\\220\\220\\220\\22...");
+        assertEquals(
+            ".*?combo.+?rpc\\.statd.+?%{NUMBER:field}.+?gethostbyname.+?error.+?for.+?X.+?X.+?Z.+?Z.+?hn.+?hn.+?%{NUMBER:field2}" +
+                ".+?%{NUMBER:field3}.+?%{NUMBER:field4}.+?%{NUMBER:field5}.+?%{NUMBER:field6}.+?%{NUMBER:field7}.+?%{NUMBER:field8}" +
+                ".+?%{NUMBER:field9}.+?%{NUMBER:field10}.+?%{NUMBER:field11}.*",
+            GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples));
+    }
+
+    public void testFindBestGrokMatchFromExamplesGivenMatchAllRegex() {
+        String regex = ".*";
+        // Two timestamps: one local, one UTC
+        Collection<String> examples = Arrays.asList(
+            "Killing job [count_tweets]",
+            "Killing job [tweets_by_location]",
+            "[count_tweets] Killing job",
+            "[tweets_by_location] Killing job");
+        assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex));
+    }
 }