Browse Source

Grok: "native" results (#62843)

This adds the ability to fetch java primitives like `long` and `float`
from grok matches rather than their boxed versions. It also allows
customizing the which fields are extracted and how they are extracted.
By default we continue to fetch a `Map<String, Object>` but runtime
fields will be able to catch *just* the fields it is interested
in, and the values will be primitives.
Nik Everett 5 years ago
parent
commit
2e346f3fae

+ 33 - 0
libs/grok/src/main/java/org/elasticsearch/grok/FloatConsumer.java

@@ -0,0 +1,33 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.grok;
+
+import java.util.function.Consumer;
+
+/**
+ * Primitive {@link Consumer} for floats.
+ */
+@FunctionalInterface
+public interface FloatConsumer {
+    /**
+     * Consumes the {@code value}.
+     */
+    void accept(float value);
+}

+ 16 - 21
libs/grok/src/main/java/org/elasticsearch/grok/Grok.java

@@ -34,7 +34,6 @@ import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -238,21 +237,27 @@ public final class Grok {
      * Matches and returns any named captures.
      *
      * @param text the text to match and extract values from.
-     * @return a map containing field names and their respective coerced values that matched.
+     * @return a map containing field names and their respective coerced values that matched or null if the pattern didn't match
      */
     public Map<String, Object> captures(String text) {
         byte[] utf8Bytes = text.getBytes(StandardCharsets.UTF_8);
-        return captures(utf8Bytes, 0, utf8Bytes.length);
+        GrokCaptureExtracter.MapExtracter extracter = new GrokCaptureExtracter.MapExtracter(captureConfig);
+        if (match(utf8Bytes, 0, utf8Bytes.length, extracter)) {
+            return extracter.result();
+        }
+        return null;
     }
 
     /**
-     * Matches and returns any named captures.
+     * Matches and collects any named captures.
      * @param utf8Bytes array containing the text to match against encoded in utf-8
      * @param offset offset {@code utf8Bytes} of the start of the text
      * @param length length of the text to match
-     * @return a map containing field names and their respective coerced values that matched.
+     * @param extracter collector for captures. {@link GrokCaptureConfig#nativeExtracter} can build these.
+     * @return true if there was a match, false otherwise
+     * @throws RuntimeException if there was a timeout
      */
-    public Map<String, Object> captures(byte[] utf8Bytes, int offset, int length) {
+    public boolean match(byte[] utf8Bytes, int offset, int length, GrokCaptureExtracter extracter) {
         Matcher matcher = compiledExpression.matcher(utf8Bytes, offset, offset + length);
         int result;
         try {
@@ -264,22 +269,12 @@ public final class Grok {
         if (result == Matcher.INTERRUPTED) {
             throw new RuntimeException("grok pattern matching was interrupted after [" +
                 matcherWatchdog.maxExecutionTimeInMillis() + "] ms");
-        } else if (result == Matcher.FAILED) {
-            // TODO: I think we should throw an error here?
-            return null;
-        } else if (compiledExpression.numberOfNames() > 0) {
-            Map<String, Object> fields = new HashMap<>(captureConfig.size());
-            Region region = matcher.getEagerRegion();
-            for (GrokCaptureConfig config: captureConfig) {
-                Object v = config.extract(utf8Bytes, offset, region);
-                if (v != null) {
-                    fields.put(config.name(), v);
-                }
-            }
-            return fields;
-        } else {
-            return Collections.emptyMap();
         }
+        if (result == Matcher.FAILED) {
+            return false;
+        }
+        extracter.extract(utf8Bytes, offset, matcher.getEagerRegion());
+        return true;
     }
 
     /**

+ 101 - 10
libs/grok/src/main/java/org/elasticsearch/grok/GrokCaptureConfig.java

@@ -20,9 +20,13 @@
 package org.elasticsearch.grok;
 
 import org.joni.NameEntry;
-import org.joni.Region;
 
 import java.nio.charset.StandardCharsets;
+import java.util.function.Consumer;
+import java.util.function.DoubleConsumer;
+import java.util.function.Function;
+import java.util.function.IntConsumer;
+import java.util.function.LongConsumer;
 
 /**
  * Configuration for a value that {@link Grok} can capture.
@@ -50,18 +54,105 @@ public final class GrokCaptureConfig {
     /**
      * The type defined for the field in the pattern.
      */
-    public GrokCaptureType type() {
+    GrokCaptureType type() {
         return type;
     }
 
-    Object extract(byte[] utf8Bytes, int offset, Region region) {
-        for (int number : backRefs) {
-            if (region.beg[number] >= 0) {
-                String matchValue = new String(utf8Bytes, offset + region.beg[number], region.end[number] - region.beg[number],
-                    StandardCharsets.UTF_8);
-                return type.parse(matchValue);
+    /**
+     * Build a {@linkplain GrokCaptureExtracter} that will call {@code emit} when
+     * it extracts text, boxed if the "native" representation is primitive type.
+     * Extracters returned from this method are stateless and can be reused.
+     */
+    public GrokCaptureExtracter objectExtracter(Consumer<Object> emit) {
+        // We could probably write this code a little more concisely but this makes it clear where we are boxing
+        return nativeExtracter(new NativeExtracterMap<GrokCaptureExtracter>() {
+            @Override
+            public GrokCaptureExtracter forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(str -> emit.accept(str));
+            }
+
+            @Override
+            public GrokCaptureExtracter forInt(Function<IntConsumer, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(i -> emit.accept(Integer.valueOf(i)));
             }
-        }
-        return null;
+
+            @Override
+            public GrokCaptureExtracter forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(l -> emit.accept(Long.valueOf(l)));
+            }
+
+            @Override
+            public GrokCaptureExtracter forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(f -> emit.accept(Float.valueOf(f)));
+            }
+
+            @Override
+            public GrokCaptureExtracter forDouble(Function<DoubleConsumer, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(d -> emit.accept(Double.valueOf(d)));
+            }
+
+            @Override
+            public GrokCaptureExtracter forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(b -> emit.accept(b));
+            }
+        });
+    }
+
+    /**
+     * Build an extract that has access to the "native" type of the extracter
+     * match. This means that patterns like {@code %{NUMBER:bytes:float}} has
+     * access to an actual {@link float}. Extracters returned from this method
+     * should be stateless stateless and can be reused. Pathological implementations
+     * of the {@code map} parameter could violate this, but the caller should
+     * take care to stay sane.
+     * <p>
+     * While the goal is to produce a {@link GrokCaptureExtracter} that provides
+     * a primitive, the caller can produce whatever type-safe constructs it
+     * needs and return them from this method. Thus the {@code <T>} in the type
+     * signature.
+     *
+     * @param <T> The type of the result.
+     * @param map Collection of handlers for each native type. Only one method
+     *            will be called but well behaved implementers are stateless.
+     * @return whatever was returned by the handler.
+     */
+    public <T> T nativeExtracter(NativeExtracterMap<T> map) {
+        return type.nativeExtracter(backRefs, map);
+    }
+
+    /**
+     * Collection of handlers for each native type. Well behaved implementations
+     * are stateless and produce stateless results.
+     */
+    public interface NativeExtracterMap<T> {
+        /**
+         * Called when the native type is a {@link String}.
+         */
+        T forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter);
+
+        /**
+         * Called when the native type is an {@link int}.
+         */
+        T forInt(Function<IntConsumer, GrokCaptureExtracter> buildExtracter);
+
+        /**
+         * Called when the native type is an {@link long}.
+         */
+        T forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter);
+
+        /**
+         * Called when the native type is an {@link float}.
+         */
+        T forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter);
+
+        /**
+         * Called when the native type is an {@link double}.
+         */
+        T forDouble(Function<DoubleConsumer, GrokCaptureExtracter> buildExtracter);
+
+        /**
+         * Called when the native type is an {@link boolean}.
+         */
+        T forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter);
     }
 }

+ 64 - 0
libs/grok/src/main/java/org/elasticsearch/grok/GrokCaptureExtracter.java

@@ -0,0 +1,64 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.grok;
+
+import org.joni.Region;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static java.util.Collections.emptyMap;
+
+/**
+ * How to extract matches.
+ */
+public abstract class GrokCaptureExtracter {
+    /**
+     * Extract {@link Map} results. This implementation of {@link GrokCaptureExtracter}
+     * is mutable and should be discarded after collecting a single result.
+     */
+    static class MapExtracter extends GrokCaptureExtracter {
+        private final Map<String, Object> result;
+        private final List<GrokCaptureExtracter> fieldExtracters;
+
+        MapExtracter(List<GrokCaptureConfig> captureConfig) {
+            result = captureConfig.isEmpty() ? emptyMap() : new HashMap<>();
+            fieldExtracters = new ArrayList<>(captureConfig.size());
+            for (GrokCaptureConfig config : captureConfig) {
+                fieldExtracters.add(config.objectExtracter(v -> result.put(config.name(), v)));
+            }
+        }
+
+        @Override
+        void extract(byte[] utf8Bytes, int offset, Region region) {
+            for (GrokCaptureExtracter extracter : fieldExtracters) {
+                extracter.extract(utf8Bytes, offset, region);
+            }
+        }
+
+        Map<String, Object> result() {
+            return result;
+        }
+    }
+
+    abstract void extract(byte[] utf8Bytes, int offset, Region region);
+}

+ 40 - 25
libs/grok/src/main/java/org/elasticsearch/grok/GrokCaptureType.java

@@ -19,55 +19,54 @@
 
 package org.elasticsearch.grok;
 
+import org.elasticsearch.grok.GrokCaptureConfig.NativeExtracterMap;
+import org.joni.Region;
+
+import java.nio.charset.StandardCharsets;
+import java.util.function.Consumer;
+
 /**
  * The type defined for the field in the pattern.
  */
-public enum GrokCaptureType {
+enum GrokCaptureType {
     STRING {
         @Override
-        protected Object parseValue(String str) {
-            return str;
+        <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
+            return map.forString(emit -> rawExtracter(backRefs, emit));
         }
     },
     INTEGER {
         @Override
-        protected Object parseValue(String str) {
-            return Integer.parseInt(str);
+        <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
+            return map.forInt(emit -> rawExtracter(backRefs, str -> emit.accept(Integer.parseInt(str))));
         }
     },
     LONG {
         @Override
-        protected Object parseValue(String str) {
-            return Long.parseLong(str);
+        <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
+            return map.forLong(emit -> rawExtracter(backRefs, str -> emit.accept(Long.parseLong(str))));
         }
     },
-    DOUBLE {
+    FLOAT {
         @Override
-        protected Object parseValue(String str) {
-            return Double.parseDouble(str);
+        <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
+            return map.forFloat(emit -> rawExtracter(backRefs, str -> emit.accept(Float.parseFloat(str))));
         }
     },
-    FLOAT {
+    DOUBLE {
         @Override
-        protected Object parseValue(String str) {
-            return Float.parseFloat(str);
+        <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
+            return map.forDouble(emit -> rawExtracter(backRefs, str -> emit.accept(Double.parseDouble(str))));
         }
     },
     BOOLEAN {
         @Override
-        protected Object parseValue(String str) {
-            return Boolean.parseBoolean(str);
+        <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map) {
+            return map.forBoolean(emit -> rawExtracter(backRefs, str -> emit.accept(Boolean.parseBoolean(str))));
         }
     };
 
-    final Object parse(String str) {
-        if (str == null) {
-            return null;
-        }
-        return parseValue(str);
-    }
-
-    protected abstract Object parseValue(String str);
+    abstract <T> T nativeExtracter(int[] backRefs, NativeExtracterMap<T> map);
 
     static GrokCaptureType fromString(String str) {
         switch (str) {
@@ -77,14 +76,30 @@ public enum GrokCaptureType {
                 return INTEGER;
             case "long":
                 return LONG;
-            case "double":
-                return DOUBLE;
             case "float":
                 return FLOAT;
+            case "double":
+                return DOUBLE;
             case "boolean":
                 return BOOLEAN;
             default:
                 return STRING;
         }
     }
+
+    protected final GrokCaptureExtracter rawExtracter(int[] backRefs, Consumer<? super String> emit) {
+        return new GrokCaptureExtracter() {
+            @Override
+            void extract(byte[] utf8Bytes, int offset, Region region) {
+                for (int number : backRefs) {
+                    if (region.beg[number] >= 0) {
+                        int matchOffset = offset + region.beg[number];
+                        int matchLength = region.end[number] - region.beg[number];
+                        emit.accept(new String(utf8Bytes, matchOffset, matchLength, StandardCharsets.UTF_8));
+                        return; // Capture only the first value.
+                    }
+                }
+            }
+        };
+    }
 }

+ 109 - 5
libs/grok/src/test/java/org/elasticsearch/grok/GrokTests.java

@@ -19,6 +19,7 @@
 
 package org.elasticsearch.grok;
 
+import org.elasticsearch.grok.GrokCaptureConfig.NativeExtracterMap;
 import org.elasticsearch.test.ESTestCase;
 
 import java.nio.charset.StandardCharsets;
@@ -31,6 +32,11 @@ import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import java.util.function.DoubleConsumer;
+import java.util.function.Function;
+import java.util.function.IntConsumer;
+import java.util.function.LongConsumer;
 
 import static org.elasticsearch.grok.GrokCaptureType.BOOLEAN;
 import static org.elasticsearch.grok.GrokCaptureType.DOUBLE;
@@ -55,12 +61,20 @@ public class GrokTests extends ESTestCase {
     public void testCaputuresBytes() {
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{NUMBER:n:int}", logger::warn);
         byte[] utf8 = "10".getBytes(StandardCharsets.UTF_8);
-        assertThat(grok.captures(utf8, 0, utf8.length), equalTo(Map.of("n", 10)));
-        assertThat(grok.captures(utf8, 0, 1), equalTo(Map.of("n", 1)));
+        assertThat(captureBytes(grok, utf8, 0, utf8.length), equalTo(Map.of("n", 10)));
+        assertThat(captureBytes(grok, utf8, 0, 1), equalTo(Map.of("n", 1)));
         utf8 = "10 11 12".getBytes(StandardCharsets.UTF_8);
-        assertThat(grok.captures(utf8, 0, 2), equalTo(Map.of("n", 10)));
-        assertThat(grok.captures(utf8, 3, 2), equalTo(Map.of("n", 11)));
-        assertThat(grok.captures(utf8, 6, 2), equalTo(Map.of("n", 12)));
+        assertThat(captureBytes(grok, utf8, 0, 2), equalTo(Map.of("n", 10)));
+        assertThat(captureBytes(grok, utf8, 3, 2), equalTo(Map.of("n", 11)));
+        assertThat(captureBytes(grok, utf8, 6, 2), equalTo(Map.of("n", 12)));
+    }
+
+    private Map<String, Object> captureBytes(Grok grok, byte[] utf8, int offset, int length) {
+        GrokCaptureExtracter.MapExtracter extracter = new GrokCaptureExtracter.MapExtracter(grok.captureConfig());
+        if (grok.match(utf8, offset, length, extracter)) {
+            return extracter.result();
+        }
+        return null;
     }
 
     public void testNoMatchingPatternInDictionary() {
@@ -90,6 +104,16 @@ public class GrokTests extends ESTestCase {
         assertEquals("connect from camomile.cloud9.net[168.100.1.3]", matches.get("message"));
         assertEquals("postfix/smtpd", matches.get("program"));
         assertEquals("1713", matches.get("pid"));
+
+        String[] logsource = new String[1];
+        GrokCaptureExtracter logsourceExtracter = namedConfig(grok, "logsource").nativeExtracter(new ThrowingNativeExtracterMap() {
+            @Override
+            public GrokCaptureExtracter forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(str -> logsource[0] = str);
+            }
+        });
+        assertThat(specificCapture(grok, line, logsourceExtracter), is(true));
+        assertThat(logsource[0], equalTo("evita"));
     }
 
     public void testSyslog5424Line() {
@@ -336,6 +360,16 @@ public class GrokTests extends ESTestCase {
         Map<String, Object> actual = g.captures(text);
 
         assertEquals(expected, actual);
+
+        boolean[] status = new boolean[1];
+        GrokCaptureExtracter statusExtracter = namedConfig(g, "status").nativeExtracter(new ThrowingNativeExtracterMap() {
+            @Override
+            public GrokCaptureExtracter forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(b -> status[0] = b);
+            }
+        });
+        assertThat(specificCapture(g, text, statusExtracter), is(true));
+        assertThat(status[0], equalTo(true));
     }
 
     public void testNumericCaptures() {
@@ -355,6 +389,35 @@ public class GrokTests extends ESTestCase {
         Map<String, Object> actual = g.captures(text);
 
         assertEquals(expected, actual);
+
+        float[] bytes = new float[1];
+        GrokCaptureExtracter bytesExtracter = namedConfig(g, "bytes").nativeExtracter(new ThrowingNativeExtracterMap() {
+            @Override
+            public GrokCaptureExtracter forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(f -> bytes[0] = f);
+            }
+        });
+        assertThat(specificCapture(g, text, bytesExtracter), is(true));
+        assertThat(bytes[0], equalTo(12009.34f));
+
+        long[] id = new long[1];
+        GrokCaptureExtracter idExtracter = namedConfig(g, "id").nativeExtracter(new ThrowingNativeExtracterMap() {
+            @Override
+            public GrokCaptureExtracter forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(l -> id[0] = l);
+            }
+        });
+        assertThat(specificCapture(g, text, idExtracter), is(true));
+        assertThat(id[0], equalTo(20000000000L));
+
+        double[] rating = new double[1];
+        GrokCaptureExtracter ratingExtracter = namedConfig(g, "rating").nativeExtracter(new ThrowingNativeExtracterMap() {
+            public GrokCaptureExtracter forDouble(java.util.function.Function<DoubleConsumer,GrokCaptureExtracter> buildExtracter) {
+                return buildExtracter.apply(d -> rating[0] = d);
+            }
+        });
+        assertThat(specificCapture(g, text, ratingExtracter), is(true));
+        assertThat(rating[0], equalTo(4820.092));
     }
 
     public void testNumericCapturesCoercion() {
@@ -621,4 +684,45 @@ public class GrokTests extends ESTestCase {
         }
         assertThat(fromGrok, equalTo(new TreeMap<>(nameToType)));
     }
+
+    private GrokCaptureConfig namedConfig(Grok grok, String name) {
+        return grok.captureConfig().stream().filter(i -> i.name().equals(name)).findFirst().get();
+    }
+
+    private boolean specificCapture(Grok grok, String str, GrokCaptureExtracter extracter) {
+        byte[] utf8 = str.getBytes(StandardCharsets.UTF_8);
+        return grok.match(utf8, 0, utf8.length, extracter);
+    }
+
+    private abstract class ThrowingNativeExtracterMap implements NativeExtracterMap<GrokCaptureExtracter> {
+        @Override
+        public GrokCaptureExtracter forString(Function<Consumer<String>, GrokCaptureExtracter> buildExtracter) {
+            throw new IllegalArgumentException();
+        }
+
+        @Override
+        public GrokCaptureExtracter forInt(Function<IntConsumer, GrokCaptureExtracter> buildExtracter) {
+            throw new IllegalArgumentException();
+        }
+
+        @Override
+        public GrokCaptureExtracter forLong(Function<LongConsumer, GrokCaptureExtracter> buildExtracter) {
+            throw new IllegalArgumentException();
+        }
+
+        @Override
+        public GrokCaptureExtracter forFloat(Function<FloatConsumer, GrokCaptureExtracter> buildExtracter) {
+            throw new IllegalArgumentException();
+        }
+
+        @Override
+        public GrokCaptureExtracter forDouble(Function<DoubleConsumer, GrokCaptureExtracter> buildExtracter) {
+            throw new IllegalArgumentException();
+        }
+
+        @Override
+        public GrokCaptureExtracter forBoolean(Function<Consumer<Boolean>, GrokCaptureExtracter> buildExtracter) {
+            throw new IllegalArgumentException();
+        }
+    }
 }