7 ani în urmă · 84eaac79d7
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java
@@ -25,14 +25,15 @@ import java.util.regex.Pattern;
 
				  */
			
 
				 public final class GrokPatternCreator {
			
 
				 
			
 
				-    private static String PREFACE = "preface";
			
 
				-    private static String EPILOGUE = "epilogue";
			
 
				+    private static final String PREFACE = "preface";
			
 
				+    private static final String EPILOGUE = "epilogue";
			
 
				 
			
 
				     /**
			
 
				      * The first match in this list will be chosen, so it needs to be ordered
			
 
				      * such that more generic patterns come after more specific patterns.
			
 
				      */
			
 
				     private static final List<GrokPatternCandidate> ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList(
			
 
				+            new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
			
 
				             new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"),
			
 
				             new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"),
			
 
				             new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"),
			
@@ -41,7 +42,6 @@ public final class GrokPatternCreator {
 
				             new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"),
			
 
				             new GrokPatternCandidate("HTTPDATE", "timestamp"),
			
 
				             new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"),
			
 
				-            new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"),
			
 
				             new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"),
			
 
				             new GrokPatternCandidate("DATE", "date"),
			
 
				             new GrokPatternCandidate("TIME", "time"),
			
@@ -56,12 +56,10 @@ public final class GrokPatternCreator {
 
				             new GrokPatternCandidate("IP", "ipaddress"),
			
 
				             // This already includes pre/post break conditions
			
 
				             new GrokPatternCandidate("QUOTEDSTRING", "field", "", ""),
			
 
				-            // Can't use \b as the break before, because it doesn't work for negative numbers (the
			
 
				-            // minus sign is not a "word" character)
			
 
				-            new GrokPatternCandidate("NUMBER", "field", "(?<!\\w)"),
			
 
				-            // Disallow +, - and . before hex numbers, otherwise this pattern will pick up base 10
			
 
				-            // numbers that NUMBER rejected due to preceeding characters
			
 
				-            new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])")
			
 
				+            // Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick
			
 
				+            // up numeric suffixes too eagerly
			
 
				+            new GrokPatternCandidate("NUMBER", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\d)"),
			
 
				+            new GrokPatternCandidate("BASE16NUM", "field", "(?<![\\w.+-])", "(?![\\w+-]|\\.\\w)")
			
 
				             // TODO: also unfortunately can't have USERNAME in the list as it matches too broadly
			
 
				             // Fixing these problems with overly broad matches would require some extra intelligence
			
 
				             // to be added to remove inappropriate matches.  One idea would be to use a dictionary,
			
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java
@@ -76,6 +76,40 @@ public class GrokPatternCreatorTests extends ESTestCase {
 
				         assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
			
 
				     }
			
 
				 
			
 
				+    public void testAppendBestGrokMatchForStringsGivenTomcatDatestamps() {
			
 
				+
			
 
				+        // The first part of the Tomcat datestamp can match as an ISO8601
			
 
				+        // timestamp if the ordering of candidate patterns is wrong
			
 
				+        Collection<String> mustMatchStrings = Arrays.asList("2018-09-03 17:03:28,269 +0100 | ERROR | ",
			
 
				+                "2018-09-03 17:04:27,279 +0100 | DEBUG | ",
			
 
				+                "2018-09-03 17:05:26,289 +0100 | ERROR | ");
			
 
				+
			
 
				+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
			
 
				+        StringBuilder overallGrokPatternBuilder = new StringBuilder();
			
 
				+
			
 
				+        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
			
 
				+
			
 
				+        assertEquals(".*?%{TOMCAT_DATESTAMP:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString());
			
 
				+    }
			
 
				+
			
 
				+    public void testAppendBestGrokMatchForStringsGivenTrappyFloatCandidates() {
			
 
				+
			
 
				+        // If we're not careful then we might detect the first part of these strings as a
			
 
				+        // number, e.g. 1.2 in the first example, but this is inappropriate given the
			
 
				+        // trailing dot and digit
			
 
				+        Collection<String> mustMatchStrings = Arrays.asList("1.2.3",
			
 
				+                "-2.3.4",
			
 
				+                "4.5.6.7",
			
 
				+                "-9.8.7.6.5");
			
 
				+
			
 
				+        Map<String, Integer> fieldNameCountStore = new HashMap<>();
			
 
				+        StringBuilder overallGrokPatternBuilder = new StringBuilder();
			
 
				+
			
 
				+        GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings);
			
 
				+
			
 
				+        assertEquals(".+?", overallGrokPatternBuilder.toString());
			
 
				+    }
			
 
				+
			
 
				     public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() {
			
 
				 
			
 
				         Collection<String> mustMatchStrings = Arrays.asList("(-2)",