Browse Source

[ML] Adjust assertion in Grok pattern creation code (#65421)

It turns out that there _is_ a situation where the
regex for a category definition will not match all
the examples: where one or more examples have been
truncated.

Previously we had an assertion that this would never
happen.  The assertion failure only affected development
builds, because assertions are disabled in production.
However, it makes sense to adjust the assertion and
comment to reflect reality.
David Roberts 4 years ago
parent
commit
f6e0d74e24

+ 6 - 4
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java

@@ -118,10 +118,12 @@ public final class GrokPatternCreator {
                     groupsMatchesFromExamples.get(groupNum - 1).add(matcher.group(groupNum));
                 }
             } else {
-                // We should never get here.  If we do it implies a bug in the original categorization,
-                // as it's produced a regex that doesn't match the examples.
-                assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example;
-                logger.error("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example);
+                // If we get here it implies the original categorization has produced a
+                // regex that doesn't match one of the examples.  This can happen when
+                // the message was very long, and the example was truncated.  In this
+                // case we will have appended an ellipsis to indicate truncation.
+                assert example.endsWith("...") : exampleProcessor.pattern() + " did not match non-truncated example " + example;
+                logger.warn("[{}] Pattern [{}] did not match example [{}]", jobId, exampleProcessor.pattern(), example);
             }
         }
 

+ 39 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java

@@ -11,6 +11,7 @@ import org.elasticsearch.test.ESTestCase;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -344,4 +345,42 @@ public class GrokPatternCreatorTests extends ESTestCase {
             "[tweets_by_location] Killing job");
         assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex));
     }
+
+    public void testFindBestGrokMatchFromExamplesGivenTruncated() {
+        String regex = ".*?BST.+?dave.+?bank3.+?CONTEXT.+?SQL.+?statement.+?SELECT.+?time_series_ids_tmp\\.evidence_id" +
+            ".+?time_series_ids_tmp\\.time_series_id.+?is_delta.+?GREATEST.+?usual_interval.+?FROM.+?time_series_ids_tmp.+?" +
+            "WHERE.+?found_peak_value.+?FALSE.+?ORDER.+?BY.+?time_series_ids_tmp\\.magnitude.+?DESC.+?" +
+            "time_series_ids_tmp\\.scaling_factor.+?DESC.+?time_series_ids_tmp\\.significance.+?DESC.+?" +
+            "time_series_ids_tmp\\.evidence_id.+?DESC.+?LIMIT.+?PL.+?pgSQL.+?function.+?probable_cause_list_common.+?" +
+            "integer.+?integer.+?integer.+?line.+?at.+?SQL.+?statement.+?SQL.+?statement.+?SELECT.+?" +
+            "probable_cause_list_common.+?evidenceIdIn.+?linkGroupId.+?timeSpanSeconds.+?PL.+?pgSQL.+?function.+?" +
+            "probable_cause_list.+?integer.+?integer.+?line.+?at.+?PERFORM.*";
+        Collection<String> examples = Collections.singletonList("2013-05-16 12:13:45 BST:192.168.61.59(51438):dave:@bank3:[19084]: " +
+            "CONTEXT:  SQL statement \"SELECT\n" +
+            "                    time_series_ids_tmp.evidence_id,\n" +
+            "                    time_series_ids_tmp.time_series_id,\n" +
+            "                    is_delta,\n" +
+            "                    GREATEST(usual_interval, 1)\n" +
+            "                FROM\n" +
+            "                    time_series_ids_tmp\n" +
+            "                WHERE\n" +
+            "                    found_peak_value = FALSE\n" +
+            "                ORDER BY\n" +
+            "                    \n" +
+            "                    \n" +
+            "                    \n" +
+            "                    time_series_ids_tmp.magnitude DESC,\n" +
+            "                    time_series_ids_tmp.scaling_factor DESC,\n" +
+            "                    time_series_ids_tmp.significance DESC,\n" +
+            "                    time_series_ids_tmp.evidence_id DESC\n" +
+            "                LIMIT\n" +
+            "                    1\"\n" +
+            "        PL/pgSQL function probable_cause_list_common(integer,integer,integer) line 255 at SQL statement\n" +
+            "        SQL statement \"SELECT probable_cause_list_common(evidenceIdIn, linkGroupId, timeSpanSeconds)\"\n" +
+            "        PL/pgSQL function probable_cause_list...");
+        // Our algorithm for converting examples to Grok patterns that pick out useful fields doesn't work in
+        // this case because the regex doesn't match the example (because the example has been truncated and
+        // the regex contains pieces that would match parts of the original message beyond the truncation point)
+        assertThat(GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples), equalTo(regex));
+    }
 }