Browse Source

Extract capture config from grok patterns up front (#62706)

This extracts the configuration for extracting values from a groked
string when building the grok expression to do two things:
1. Create a method exposing that configuration on `Grok` itself which
   will be used grok `grok` flavored runtime fields.
2. Marginally speed up extracting grok values by skipping a little
   string manipulation.
Nik Everett 5 years ago
parent
commit
d4e014c4c2

+ 21 - 14
libs/grok/src/main/java/org/elasticsearch/grok/Grok.java

@@ -71,6 +71,7 @@ public final class Grok {
     private final boolean namedCaptures;
     private final Regex compiledExpression;
     private final MatcherWatchdog matcherWatchdog;
+    private final List<GrokCaptureConfig> captureConfig;
 
     public Grok(Map<String, String> patternBank, String grokPattern, Consumer<String> logCallBack) {
         this(patternBank, grokPattern, true, MatcherWatchdog.noop(), logCallBack);
@@ -100,6 +101,12 @@ public final class Grok {
         byte[] expressionBytes = expression.getBytes(StandardCharsets.UTF_8);
         this.compiledExpression = new Regex(expressionBytes, 0, expressionBytes.length, Option.DEFAULT, UTF8Encoding.INSTANCE,
             message -> logCallBack.accept(message));
+
+        List<GrokCaptureConfig> captureConfig = new ArrayList<>();
+        for (Iterator<NameEntry> entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
+            captureConfig.add(new GrokCaptureConfig(entry.next()));
+        }
+        this.captureConfig = List.copyOf(captureConfig);
     }
 
     /**
@@ -145,7 +152,7 @@ public final class Grok {
         }
     }
 
-    public String groupMatch(String name, Region region, String pattern) {
+    private String groupMatch(String name, Region region, String pattern) {
         int number = GROK_PATTERN_REGEX.nameToBackrefNumber(name.getBytes(StandardCharsets.UTF_8), 0,
             name.getBytes(StandardCharsets.UTF_8).length, region);
         int begin = region.beg[number];
@@ -161,7 +168,7 @@ public final class Grok {
      *
      * @return named regex expression
      */
-    public String toRegex(String grokPattern) {
+    protected String toRegex(String grokPattern) {
         StringBuilder res = new StringBuilder();
         for (int i = 0; i < MAX_TO_REGEX_ITERATIONS; i++) {
             byte[] grokPatternBytes = grokPattern.getBytes(StandardCharsets.UTF_8);
@@ -251,19 +258,12 @@ public final class Grok {
             // TODO: I think we should throw an error here?
             return null;
         } else if (compiledExpression.numberOfNames() > 0) {
-            Map<String, Object> fields = new HashMap<>();
+            Map<String, Object> fields = new HashMap<>(captureConfig.size());
             Region region = matcher.getEagerRegion();
-            for (Iterator<NameEntry> entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
-                NameEntry e = entry.next();
-                String groupName = new String(e.name, e.nameP, e.nameEnd - e.nameP, StandardCharsets.UTF_8);
-                for (int number : e.getBackRefs()) {
-                    if (region.beg[number] >= 0) {
-                        String matchValue = new String(textAsBytes, region.beg[number], region.end[number] - region.beg[number],
-                            StandardCharsets.UTF_8);
-                        GrokMatchGroup match = new GrokMatchGroup(groupName, matchValue);
-                        fields.put(match.getName(), match.getValue());
-                        break;
-                    }
+            for (GrokCaptureConfig config: captureConfig) {
+                Object v = config.extract(textAsBytes, region);
+                if (v != null) {
+                    fields.put(config.name(), v);
                 }
             }
             return fields;
@@ -272,6 +272,13 @@ public final class Grok {
         }
     }
 
+    /**
+     * The list of values that this {@linkplain Grok} can capture.
+     */
+    public List<GrokCaptureConfig> captureConfig() {
+        return captureConfig;
+    }
+
     /**
      * Load built-in patterns. 
      */

+ 67 - 0
libs/grok/src/main/java/org/elasticsearch/grok/GrokCaptureConfig.java

@@ -0,0 +1,67 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.grok;
+
+import org.joni.NameEntry;
+import org.joni.Region;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Configuration for a value that {@link Grok} can capture.
+ */
+public final class GrokCaptureConfig {
+    private final String name;
+    private final GrokCaptureType type;
+    private final int[] backRefs;
+
+    GrokCaptureConfig(NameEntry nameEntry) {
+        String groupName = new String(nameEntry.name, nameEntry.nameP, nameEntry.nameEnd - nameEntry.nameP, StandardCharsets.UTF_8);
+        String[] parts = groupName.split(":");
+        name = parts.length >= 2 ? parts[1] : parts[0];
+        type = parts.length == 3 ? GrokCaptureType.fromString(parts[2]) : GrokCaptureType.STRING;
+        this.backRefs = nameEntry.getBackRefs();
+    }
+
+    /**
+     * The name defined for the field in the pattern.
+     */
+    public String name() {
+        return name;
+    }
+
+    /**
+     * The type defined for the field in the pattern.
+     */
+    public GrokCaptureType type() {
+        return type;
+    }
+
+    Object extract(byte[] textAsBytes, Region region) {
+        for (int number : backRefs) {
+            if (region.beg[number] >= 0) {
+                String matchValue = new String(textAsBytes, region.beg[number], region.end[number] - region.beg[number],
+                    StandardCharsets.UTF_8);
+                return type.parse(matchValue);
+            }
+        }
+        return null;
+    }
+}

+ 90 - 0
libs/grok/src/main/java/org/elasticsearch/grok/GrokCaptureType.java

@@ -0,0 +1,90 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.grok;
+
+/**
+ * The type defined for the field in the pattern.
+ */
+public enum GrokCaptureType {
+    STRING {
+        @Override
+        protected Object parseValue(String str) {
+            return str;
+        }
+    },
+    INTEGER {
+        @Override
+        protected Object parseValue(String str) {
+            return Integer.parseInt(str);
+        }
+    },
+    LONG {
+        @Override
+        protected Object parseValue(String str) {
+            return Long.parseLong(str);
+        }
+    },
+    DOUBLE {
+        @Override
+        protected Object parseValue(String str) {
+            return Double.parseDouble(str);
+        }
+    },
+    FLOAT {
+        @Override
+        protected Object parseValue(String str) {
+            return Float.parseFloat(str);
+        }
+    },
+    BOOLEAN {
+        @Override
+        protected Object parseValue(String str) {
+            return Boolean.parseBoolean(str);
+        }
+    };
+
+    final Object parse(String str) {
+        if (str == null) {
+            return null;
+        }
+        return parseValue(str);
+    }
+
+    protected abstract Object parseValue(String str);
+
+    static GrokCaptureType fromString(String str) {
+        switch (str) {
+            case "string":
+                return STRING;
+            case "int":
+                return INTEGER;
+            case "long":
+                return LONG;
+            case "double":
+                return DOUBLE;
+            case "float":
+                return FLOAT;
+            case "boolean":
+                return BOOLEAN;
+            default:
+                return STRING;
+        }
+    }
+}

+ 0 - 68
libs/grok/src/main/java/org/elasticsearch/grok/GrokMatchGroup.java

@@ -1,68 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.grok;
-
-final class GrokMatchGroup {
-    private static final String DEFAULT_TYPE = "string";
-    private final String patternName;
-    private final String fieldName;
-    private final String type;
-    private final String groupValue;
-
-    GrokMatchGroup(String groupName, String groupValue) {
-        String[] parts = groupName.split(":");
-        patternName = parts[0];
-        if (parts.length >= 2) {
-            fieldName = parts[1];
-        } else {
-            fieldName = null;
-        }
-
-        if (parts.length == 3) {
-            type = parts[2];
-        } else {
-            type = DEFAULT_TYPE;
-        }
-        this.groupValue = groupValue;
-    }
-
-    public String getName() {
-        return (fieldName == null) ? patternName : fieldName;
-    }
-
-    public Object getValue() {
-        if (groupValue == null) { return null; }
-
-        switch(type) {
-            case "int":
-                return Integer.parseInt(groupValue);
-            case "long":
-                return Long.parseLong(groupValue);
-            case "double":
-                return Double.parseDouble(groupValue);
-            case "float":
-                return Float.parseFloat(groupValue);
-            case "boolean":
-                return Boolean.parseBoolean(groupValue);
-            default:
-                return groupValue;
-        }
-    }
-}

+ 109 - 0
libs/grok/src/test/java/org/elasticsearch/grok/GrokTests.java

@@ -31,6 +31,12 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.BiConsumer;
 
+import static org.elasticsearch.grok.GrokCaptureType.BOOLEAN;
+import static org.elasticsearch.grok.GrokCaptureType.DOUBLE;
+import static org.elasticsearch.grok.GrokCaptureType.FLOAT;
+import static org.elasticsearch.grok.GrokCaptureType.INTEGER;
+import static org.elasticsearch.grok.GrokCaptureType.LONG;
+import static org.elasticsearch.grok.GrokCaptureType.STRING;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.is;
@@ -53,6 +59,19 @@ public class GrokTests extends ESTestCase {
     public void testSimpleSyslogLine() {
         String line = "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from camomile.cloud9.net[168.100.1.3]";
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{SYSLOGLINE}", logger::warn);
+        assertCaptureConfig(
+            grok,
+            Map.ofEntries(
+                Map.entry("facility", STRING),
+                Map.entry("logsource", STRING),
+                Map.entry("message", STRING),
+                Map.entry("pid", STRING),
+                Map.entry("priority", STRING),
+                Map.entry("program", STRING),
+                Map.entry("timestamp", STRING),
+                Map.entry("timestamp8601", STRING)
+            )
+        );
         Map<String, Object> matches = grok.captures(line);
         assertEquals("evita", matches.get("logsource"));
         assertEquals("Mar 16 00:01:25", matches.get("timestamp"));
@@ -65,6 +84,20 @@ public class GrokTests extends ESTestCase {
         String line = "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug 4123 - [id1 foo=\\\"bar\\\"][id2 baz=\\\"something\\\"] " +
                 "Hello, syslog.";
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{SYSLOG5424LINE}", logger::warn);
+        assertCaptureConfig(
+            grok,
+            Map.ofEntries(
+                Map.entry("syslog5424_app", STRING),
+                Map.entry("syslog5424_host", STRING),
+                Map.entry("syslog5424_msg", STRING),
+                Map.entry("syslog5424_msgid", STRING),
+                Map.entry("syslog5424_pri", STRING),
+                Map.entry("syslog5424_proc", STRING),
+                Map.entry("syslog5424_sd", STRING),
+                Map.entry("syslog5424_ts", STRING),
+                Map.entry("syslog5424_ver", STRING)
+            )
+        );
         Map<String, Object> matches = grok.captures(line);
         assertEquals("191", matches.get("syslog5424_pri"));
         assertEquals("1", matches.get("syslog5424_ver"));
@@ -80,12 +113,14 @@ public class GrokTests extends ESTestCase {
     public void testDatePattern() {
         String line = "fancy 12-12-12 12:12:12";
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "(?<timestamp>%{DATE_EU} %{TIME})", logger::warn);
+        assertCaptureConfig(grok, Map.of("timestamp", STRING));
         Map<String, Object> matches = grok.captures(line);
         assertEquals("12-12-12 12:12:12", matches.get("timestamp"));
     }
 
     public void testNilCoercedValues() {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "test (N/A|%{BASE10NUM:duration:float}ms)", logger::warn);
+        assertCaptureConfig(grok, Map.of("duration", FLOAT));
         Map<String, Object> matches = grok.captures("test 28.4ms");
         assertEquals(28.4f, matches.get("duration"));
         matches = grok.captures("test N/A");
@@ -94,6 +129,7 @@ public class GrokTests extends ESTestCase {
 
     public void testNilWithNoCoercion() {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "test (N/A|%{BASE10NUM:duration}ms)", logger::warn);
+        assertCaptureConfig(grok, Map.of("duration", STRING));
         Map<String, Object> matches = grok.captures("test 28.4ms");
         assertEquals("28.4", matches.get("duration"));
         matches = grok.captures("test N/A");
@@ -104,6 +140,17 @@ public class GrokTests extends ESTestCase {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "<%{POSINT:syslog_pri}>%{SPACE}%{SYSLOGTIMESTAMP:syslog_timestamp} " +
                 "%{SYSLOGHOST:syslog_hostname} %{PROG:syslog_program}(:?)(?:\\[%{GREEDYDATA:syslog_pid}\\])?(:?) " +
                 "%{GREEDYDATA:syslog_message}", logger::warn);
+        assertCaptureConfig(
+            grok,
+            Map.ofEntries(
+                Map.entry("syslog_hostname", STRING),
+                Map.entry("syslog_message", STRING),
+                Map.entry("syslog_pid", STRING),
+                Map.entry("syslog_pri", STRING),
+                Map.entry("syslog_program", STRING),
+                Map.entry("syslog_timestamp", STRING)
+            )
+        );
         Map<String, Object> matches = grok.captures("<22>Jan  4 07:50:46 mailmaster postfix/policy-spf[9454]: : " +
                 "SPF permerror (Junk encountered in record 'v=spf1 mx a:mail.domain.no ip4:192.168.0.4 �all'): Envelope-from: " +
                 "email@domain.no");
@@ -114,18 +161,21 @@ public class GrokTests extends ESTestCase {
 
     public void testNamedFieldsWithWholeTextMatch() {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{DATE_EU:stimestamp}", logger::warn);
+        assertCaptureConfig(grok, Map.of("stimestamp", STRING));
         Map<String, Object> matches = grok.captures("11/01/01");
         assertThat(matches.get("stimestamp"), equalTo("11/01/01"));
     }
 
     public void testWithOniguramaNamedCaptures() {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "(?<foo>\\w+)", logger::warn);
+        assertCaptureConfig(grok, Map.of("foo", STRING));
         Map<String, Object> matches = grok.captures("hello world");
         assertThat(matches.get("foo"), equalTo("hello"));
     }
 
     public void testISO8601() {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "^%{TIMESTAMP_ISO8601}$", logger::warn);
+        assertCaptureConfig(grok, Map.of());
         List<String> timeMessages = Arrays.asList(
                 "2001-01-01T00:00:00",
                 "1974-03-02T04:09:09",
@@ -150,6 +200,7 @@ public class GrokTests extends ESTestCase {
 
     public void testNotISO8601() {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "^%{TIMESTAMP_ISO8601}$", logger::warn);
+        assertCaptureConfig(grok, Map.of());
         List<String> timeMessages = Arrays.asList(
                 "2001-13-01T00:00:00", // invalid month
                 "2001-00-01T00:00:00", // invalid month
@@ -189,6 +240,7 @@ public class GrokTests extends ESTestCase {
         String text = "wowza !!!Tal!!! - Tal";
         String pattern = "%{EXCITED_NAME} - %{NAME}";
         Grok g = new Grok(bank, pattern, false, logger::warn);
+        assertCaptureConfig(g, Map.of("EXCITED_NAME_0", STRING, "NAME_21", STRING, "NAME_22", STRING));
 
         assertEquals("(?<EXCITED_NAME_0>!!!(?<NAME_21>Tal)!!!) - (?<NAME_22>Tal)", g.toRegex(pattern));
         assertEquals(true, g.match(text));
@@ -263,6 +315,7 @@ public class GrokTests extends ESTestCase {
     public void testBooleanCaptures() {
         String pattern = "%{WORD:name}=%{WORD:status:boolean}";
         Grok g = new Grok(Grok.BUILTIN_PATTERNS, pattern, logger::warn);
+        assertCaptureConfig(g, Map.of("name", STRING, "status", BOOLEAN));
 
         String text = "active=true";
         Map<String, Object> expected = new HashMap<>();
@@ -280,6 +333,7 @@ public class GrokTests extends ESTestCase {
 
         String pattern = "%{NUMBER:bytes:float} %{NUMBER:id:long} %{NUMBER:rating:double}";
         Grok g = new Grok(bank, pattern, logger::warn);
+        assertCaptureConfig(g, Map.of("bytes", FLOAT, "id", LONG, "rating", DOUBLE));
 
         String text = "12009.34 20000000000 4820.092";
         Map<String, Object> expected = new HashMap<>();
@@ -298,6 +352,7 @@ public class GrokTests extends ESTestCase {
 
         String pattern = "%{NUMBER:bytes:float} %{NUMBER:status} %{NUMBER}";
         Grok g = new Grok(bank, pattern, logger::warn);
+        assertCaptureConfig(g, Map.of("bytes", FLOAT, "status", STRING));
 
         String text = "12009.34 200 9032";
         Map<String, Object> expected = new HashMap<>();
@@ -308,11 +363,39 @@ public class GrokTests extends ESTestCase {
         assertEquals(expected, actual);
     }
 
+    public void testGarbageTypeNameBecomesString() {
+        Map<String, String> bank = new HashMap<>();
+        bank.put("BASE10NUM", "(?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\\.[0-9]+)?)|(?:\\.[0-9]+)))");
+        bank.put("NUMBER", "(?:%{BASE10NUM})");
+
+        String pattern = "%{NUMBER:f:not_a_valid_type}";
+        Grok g = new Grok(bank, pattern, logger::warn);
+        assertCaptureConfig(g, Map.of("f", STRING));
+        assertThat(g.captures("12009.34"), equalTo(Map.of("f", "12009.34")));
+    }
+
     public void testApacheLog() {
         String logLine = "31.184.238.164 - - [24/Jul/2014:05:35:37 +0530] \"GET /logs/access.log HTTP/1.0\" 200 69849 " +
                 "\"http://8rursodiol.enjin.com\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " +
                 "Chrome/30.0.1599.12785 YaBrowser/13.12.1599.12785 Safari/537.36\" \"www.dlwindianrailways.com\"";
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{COMBINEDAPACHELOG}", logger::warn);
+        assertCaptureConfig(
+            grok,
+            Map.ofEntries(
+                Map.entry("agent", STRING),
+                Map.entry("auth", STRING),
+                Map.entry("bytes", STRING),
+                Map.entry("clientip", STRING),
+                Map.entry("httpversion", STRING),
+                Map.entry("ident", STRING),
+                Map.entry("rawrequest", STRING),
+                Map.entry("referrer", STRING),
+                Map.entry("request", STRING),
+                Map.entry("response", STRING),
+                Map.entry("timestamp", STRING),
+                Map.entry("verb", STRING)
+            )
+        );
         Map<String, Object> matches = grok.captures(logLine);
 
         assertEquals("31.184.238.164", matches.get("clientip"));
@@ -373,6 +456,22 @@ public class GrokTests extends ESTestCase {
                 "HTTP/%{NUMBER:httpversion}\" %{NUMBER:response:int} (?:-|%{NUMBER:bytes:int}) %{QS:referrer} %{QS:agent}";
 
         Grok grok = new Grok(bank, pattern, logger::warn);
+        assertCaptureConfig(
+            grok,
+            Map.ofEntries(
+                Map.entry("agent", STRING),
+                Map.entry("auth", STRING),
+                Map.entry("bytes", INTEGER),
+                Map.entry("clientip", STRING),
+                Map.entry("httpversion", STRING),
+                Map.entry("ident", STRING),
+                Map.entry("referrer", STRING),
+                Map.entry("request", STRING),
+                Map.entry("response", INTEGER),
+                Map.entry("timestamp", STRING),
+                Map.entry("verb", STRING)
+            )
+        );
 
         Map<String, Object> expected = new HashMap<>();
         expected.put("clientip", "83.149.9.216");
@@ -404,6 +503,7 @@ public class GrokTests extends ESTestCase {
         Map<String, String> bank = new HashMap<>();
         bank.put("SINGLEDIGIT", "[0-9]");
         Grok grok = new Grok(bank, "%{SINGLEDIGIT:num}%{SINGLEDIGIT:num}", logger::warn);
+        assertCaptureConfig(grok, Map.of("num", STRING));
 
         Map<String, Object> expected = new HashMap<>();
         expected.put("num", "1");
@@ -500,4 +600,13 @@ public class GrokTests extends ESTestCase {
         Map<String, Object> matches = grok.captures(line);
         assertEquals(line, matches.get(fieldName));
     }
+
+    private void assertCaptureConfig(Grok grok, Map<String, GrokCaptureType> nameToType) {
+        Map<String, GrokCaptureType> fromGrok = new TreeMap<>();
+        for (GrokCaptureConfig config : grok.captureConfig()) {
+            Object old = fromGrok.put(config.name(), config.type());
+            assertThat("duplicates not allowed", old, nullValue());
+        }
+        assertThat(fromGrok, equalTo(new TreeMap<>(nameToType)));
+    }
 }