Browse Source

add ability to specify multiple grok patterns (#18074)

- now you can specify a list of grok patterns to match your field with
and the first one to successfully match wins.
- only non-null captures will be inserted into your matched document.

Fixes #17903.
Tal Levy 9 years ago
parent
commit
edfbdf2748

+ 4 - 3
docs/reference/ingest/ingest-node.asciidoc

@@ -1038,8 +1038,9 @@ Grok expression.
 |======
 | Name                   | Required  | Default             | Description
 | `field`                | yes       | -                   | The field to use for grok expression parsing
-| `pattern`              | yes       | -                   | The grok expression to match and extract named captures with
+| `patterns`             | yes       | -                   | An ordered list of grok expressions to match and extract named captures with. Returns on the first expression in the list that matches.
 | `pattern_definitions`  | no        | -                   | A map of pattern-name and pattern tuples defining custom patterns to be used by the current processor. Patterns matching existing names will override the pre-existing definition.
+| `trace_match`          | no        | false               | When true, `_ingest._grok_match_index` will be inserted into your matched document's metadata with the index of the pattern in `patterns` that matched.
 |======
 
 Here is an example of using the provided patterns to extract out and name structured fields from a string field in
@@ -1069,7 +1070,7 @@ Here is an example pipeline for processing the above document by using Grok:
     {
       "grok": {
         "field": "message",
-        "pattern": "%{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes} %{NUMBER:duration}"
+        "patterns": ["%{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes} %{NUMBER:duration}"]
       }
     }
   ]
@@ -1107,7 +1108,7 @@ Here is an example of a pipeline specifying custom pattern definitions:
     {
       "grok": {
         "field": "message",
-        "pattern": "my %{FAVORITE_DOG:dog} is colored %{RGB:color}"
+        "patterns": ["my %{FAVORITE_DOG:dog} is colored %{RGB:color}"],
         "pattern_definitions" : {
           "FAVORITE_DOG" : "beagle",
           "RGB" : "RED|GREEN|BLUE"

+ 66 - 15
modules/ingest-grok/src/main/java/org/elasticsearch/ingest/grok/GrokProcessor.java

@@ -25,31 +25,50 @@ import org.elasticsearch.ingest.core.ConfigurationUtils;
 import org.elasticsearch.ingest.core.IngestDocument;
 
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 
 import static org.elasticsearch.ingest.core.ConfigurationUtils.newConfigurationException;
 
 public final class GrokProcessor extends AbstractProcessor {
 
     public static final String TYPE = "grok";
+    private static final String PATTERN_MATCH_KEY = "_ingest._grok_match_index";
 
     private final String matchField;
     private final Grok grok;
+    private final boolean traceMatch;
 
-    public GrokProcessor(String tag, Grok grok, String matchField) {
+    public GrokProcessor(String tag, Map<String, String> patternBank, List<String> matchPatterns, String matchField) {
+        this(tag, patternBank, matchPatterns, matchField, false);
+    }
+
+    public GrokProcessor(String tag, Map<String, String> patternBank, List<String> matchPatterns, String matchField, boolean traceMatch) {
         super(tag);
         this.matchField = matchField;
-        this.grok = grok;
+        this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch));
+        this.traceMatch = traceMatch;
     }
 
     @Override
     public void execute(IngestDocument ingestDocument) throws Exception {
         String fieldValue = ingestDocument.getFieldValue(matchField, String.class);
         Map<String, Object> matches = grok.captures(fieldValue);
-        if (matches != null) {
-            matches.forEach((k, v) -> ingestDocument.setFieldValue(k, v));
-        } else {
-            throw new IllegalArgumentException("Grok expression does not match field value: [" + fieldValue + "]");
+        if (matches == null) {
+            throw new IllegalArgumentException("Provided Grok expressions do not match field value: [" + fieldValue + "]");
+        }
+
+        matches.entrySet().stream()
+            .filter((e) -> Objects.nonNull(e.getValue()))
+            .forEach((e) -> ingestDocument.setFieldValue(e.getKey(), e.getValue()));
+
+        if (traceMatch) {
+            @SuppressWarnings("unchecked")
+            HashMap<String, String> matchMap = (HashMap<String, String>) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class);
+            matchMap.keySet().stream().findFirst().ifPresent((index) -> {
+                ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index);
+            });
         }
     }
 
@@ -58,12 +77,41 @@ public final class GrokProcessor extends AbstractProcessor {
         return TYPE;
     }
 
+    public Grok getGrok() {
+        return grok;
+    }
+
     String getMatchField() {
         return matchField;
     }
 
-    Grok getGrok() {
-        return grok;
+    static String combinePatterns(List<String> patterns, boolean traceMatch) {
+        String combinedPattern;
+        if (patterns.size() > 1) {
+            if (traceMatch) {
+                combinedPattern = "";
+                for (int i = 0; i < patterns.size(); i++) {
+                    String valueWrap = "(?<" + PATTERN_MATCH_KEY + "." + i + ">" + patterns.get(i) + ")";
+                    if (combinedPattern.equals("")) {
+                        combinedPattern = valueWrap;
+                    } else {
+                        combinedPattern = combinedPattern + "|" + valueWrap;
+                    }
+                }
+            } else {
+                combinedPattern = patterns.stream().reduce("", (prefix, value) -> {
+                    if (prefix.equals("")) {
+                        return "(?:" + value + ")";
+                    } else {
+                        return prefix + "|" + "(?:" + value + ")";
+                    }
+                });
+            }
+        }  else {
+            combinedPattern = patterns.get(0);
+        }
+
+        return combinedPattern;
     }
 
     public final static class Factory extends AbstractProcessorFactory<GrokProcessor> {
@@ -77,22 +125,25 @@ public final class GrokProcessor extends AbstractProcessor {
         @Override
         public GrokProcessor doCreate(String processorTag, Map<String, Object> config) throws Exception {
             String matchField = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field");
-            String matchPattern = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "pattern");
+            List<String> matchPatterns = ConfigurationUtils.readList(TYPE, processorTag, config, "patterns");
+            boolean traceMatch = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_match", false);
+
+            if (matchPatterns.isEmpty()) {
+                throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty");
+            }
             Map<String, String> customPatternBank = ConfigurationUtils.readOptionalMap(TYPE, processorTag, config, "pattern_definitions");
             Map<String, String> patternBank = new HashMap<>(builtinPatterns);
             if (customPatternBank != null) {
                 patternBank.putAll(customPatternBank);
             }
 
-            Grok grok;
             try {
-                grok = new Grok(patternBank, matchPattern);
+                return new GrokProcessor(processorTag, patternBank, matchPatterns, matchField, traceMatch);
             } catch (Exception e) {
-                throw newConfigurationException(TYPE, processorTag, "pattern", "Invalid regex pattern. " + e.getMessage());
+                throw newConfigurationException(TYPE, processorTag, "patterns",
+                    "Invalid regex pattern found in: " + matchPatterns + ". " + e.getMessage());
             }
-            return new GrokProcessor(processorTag, grok, matchField);
-        }
 
+        }
     }
-
 }

+ 23 - 33
modules/ingest-grok/src/test/java/org/elasticsearch/ingest/grok/GrokProcessorFactoryTests.java

@@ -37,7 +37,7 @@ public class GrokProcessorFactoryTests extends ESTestCase {
 
         Map<String, Object> config = new HashMap<>();
         config.put("field", "_field");
-        config.put("pattern", "(?<foo>\\w+)");
+        config.put("patterns", Collections.singletonList("(?<foo>\\w+)"));
         String processorTag = randomAsciiOfLength(10);
         config.put(AbstractProcessorFactory.TAG_KEY, processorTag);
         GrokProcessor processor = factory.create(config);
@@ -49,27 +49,26 @@ public class GrokProcessorFactoryTests extends ESTestCase {
     public void testBuildMissingField() throws Exception {
         GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
         Map<String, Object> config = new HashMap<>();
-        config.put("pattern", "(?<foo>\\w+)");
-        try {
-            factory.create(config);
-            fail("should fail");
-        } catch (ElasticsearchParseException e) {
-            assertThat(e.getMessage(), equalTo("[field] required property is missing"));
-
-        }
+        config.put("patterns", Collections.singletonList("(?<foo>\\w+)"));
+        ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
+        assertThat(e.getMessage(), equalTo("[field] required property is missing"));
     }
 
-    public void testBuildMissingPattern() throws Exception {
+    public void testBuildMissingPatterns() throws Exception {
         GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
         Map<String, Object> config = new HashMap<>();
         config.put("field", "foo");
-        try {
-            factory.create(config);
-            fail("should fail");
-        } catch (ElasticsearchParseException e) {
-            assertThat(e.getMessage(), equalTo("[pattern] required property is missing"));
-        }
+        ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
+        assertThat(e.getMessage(), equalTo("[patterns] required property is missing"));
+    }
 
+    public void testBuildEmptyPatternsList() throws Exception {
+        GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
+        Map<String, Object> config = new HashMap<>();
+        config.put("field", "foo");
+        config.put("patterns", Collections.emptyList());
+        ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
+        assertThat(e.getMessage(), equalTo("[patterns] List of patterns must not be empty"));
     }
 
     public void testCreateWithCustomPatterns() throws Exception {
@@ -77,7 +76,7 @@ public class GrokProcessorFactoryTests extends ESTestCase {
 
         Map<String, Object> config = new HashMap<>();
         config.put("field", "_field");
-        config.put("pattern", "%{MY_PATTERN:name}!");
+        config.put("patterns", Collections.singletonList("%{MY_PATTERN:name}!"));
         config.put("pattern_definitions", Collections.singletonMap("MY_PATTERN", "foo"));
         GrokProcessor processor = factory.create(config);
         assertThat(processor.getMatchField(), equalTo("_field"));
@@ -89,28 +88,19 @@ public class GrokProcessorFactoryTests extends ESTestCase {
         GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
         Map<String, Object> config = new HashMap<>();
         config.put("field", "_field");
-        config.put("pattern", "[");
-        try {
-            factory.create(config);
-            fail("should fail");
-        } catch (ElasticsearchParseException e) {
-            assertThat(e.getMessage(), equalTo("[pattern] Invalid regex pattern. premature end of char-class"));
-        }
-
+        config.put("patterns", Collections.singletonList("["));
+        ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
+        assertThat(e.getMessage(), equalTo("[patterns] Invalid regex pattern found in: [[]. premature end of char-class"));
     }
 
     public void testCreateWithInvalidPatternDefinition() throws Exception {
         GrokProcessor.Factory factory = new GrokProcessor.Factory(Collections.emptyMap());
         Map<String, Object> config = new HashMap<>();
         config.put("field", "_field");
-        config.put("pattern", "%{MY_PATTERN:name}!");
+        config.put("patterns", Collections.singletonList("%{MY_PATTERN:name}!"));
         config.put("pattern_definitions", Collections.singletonMap("MY_PATTERN", "["));
-        try {
-            factory.create(config);
-            fail("should fail");
-        } catch (ElasticsearchParseException e) {
-            assertThat(e.getMessage(), equalTo("[pattern] Invalid regex pattern. premature end of char-class"));
-        }
-
+        ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> factory.create(config));
+        assertThat(e.getMessage(),
+            equalTo("[patterns] Invalid regex pattern found in: [%{MY_PATTERN:name}!]. premature end of char-class"));
     }
 }

+ 68 - 30
modules/ingest-grok/src/test/java/org/elasticsearch/ingest/grok/GrokProcessorTests.java

@@ -21,14 +21,15 @@ package org.elasticsearch.ingest.grok;
 
 import org.elasticsearch.ingest.RandomDocumentPicks;
 import org.elasticsearch.ingest.core.IngestDocument;
-import org.elasticsearch.ingest.grok.Grok;
-import org.elasticsearch.ingest.grok.GrokProcessor;
 import org.elasticsearch.test.ESTestCase;
 
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Map;
 
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.nullValue;
 
 
 public class GrokProcessorTests extends ESTestCase {
@@ -37,8 +38,8 @@ public class GrokProcessorTests extends ESTestCase {
         String fieldName = RandomDocumentPicks.randomFieldName(random());
         IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
         doc.setFieldValue(fieldName, "1");
-        Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
-        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
+            Collections.singletonList("%{ONE:one}"), fieldName);
         processor.execute(doc);
         assertThat(doc.getFieldValue("one", String.class), equalTo("1"));
     }
@@ -47,14 +48,10 @@ public class GrokProcessorTests extends ESTestCase {
         String fieldName = RandomDocumentPicks.randomFieldName(random());
         IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
         doc.setFieldValue(fieldName, "23");
-        Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
-        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
-        try {
-            processor.execute(doc);
-            fail();
-        } catch (Exception e) {
-            assertThat(e.getMessage(), equalTo("Grok expression does not match field value: [23]"));
-        }
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
+            Collections.singletonList("%{ONE:one}"), fieldName);
+        Exception e = expectThrows(Exception.class, () -> processor.execute(doc));
+        assertThat(e.getMessage(), equalTo("Provided Grok expressions do not match field value: [23]"));
     }
 
     public void testMatchWithoutCaptures() throws Exception {
@@ -62,8 +59,8 @@ public class GrokProcessorTests extends ESTestCase {
         IngestDocument originalDoc = new IngestDocument(new HashMap<>(), new HashMap<>());
         originalDoc.setFieldValue(fieldName, fieldName);
         IngestDocument doc = new IngestDocument(originalDoc);
-        Grok grok = new Grok(Collections.emptyMap(), fieldName);
-        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.emptyMap(),
+            Collections.singletonList(fieldName), fieldName);
         processor.execute(doc);
         assertThat(doc, equalTo(originalDoc));
     }
@@ -72,26 +69,67 @@ public class GrokProcessorTests extends ESTestCase {
         String fieldName = RandomDocumentPicks.randomFieldName(random());
         IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
         doc.setFieldValue(fieldName, 1);
-        Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
-        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
-        try {
-            processor.execute(doc);
-            fail();
-        } catch (Exception e) {
-            assertThat(e.getMessage(), equalTo("field [" + fieldName + "] of type [java.lang.Integer] cannot be cast to [java.lang.String]"));
-        }
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
+            Collections.singletonList("%{ONE:one}"), fieldName);
+        Exception e = expectThrows(Exception.class, () -> processor.execute(doc));
+        assertThat(e.getMessage(), equalTo("field [" + fieldName + "] of type [java.lang.Integer] cannot be cast to [java.lang.String]"));
     }
 
     public void testMissingField() {
         String fieldName = "foo.bar";
         IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
-        Grok grok = new Grok(Collections.singletonMap("ONE", "1"), "%{ONE:one}");
-        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), grok, fieldName);
-        try {
-            processor.execute(doc);
-            fail();
-        } catch (Exception e) {
-            assertThat(e.getMessage(), equalTo("field [foo] not present as part of path [foo.bar]"));
-        }
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), Collections.singletonMap("ONE", "1"),
+            Collections.singletonList("%{ONE:one}"), fieldName);
+        Exception e = expectThrows(Exception.class, () -> processor.execute(doc));
+        assertThat(e.getMessage(), equalTo("field [foo] not present as part of path [foo.bar]"));
+    }
+
+    public void testMultiplePatternsWithMatchReturn() throws Exception {
+        String fieldName = RandomDocumentPicks.randomFieldName(random());
+        IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
+        doc.setFieldValue(fieldName, "2");
+        Map<String, String> patternBank = new HashMap<>();
+        patternBank.put("ONE", "1");
+        patternBank.put("TWO", "2");
+        patternBank.put("THREE", "3");
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), patternBank,
+            Arrays.asList("%{ONE:one}", "%{TWO:two}", "%{THREE:three}"), fieldName);
+        processor.execute(doc);
+        assertThat(doc.hasField("one"), equalTo(false));
+        assertThat(doc.getFieldValue("two", String.class), equalTo("2"));
+        assertThat(doc.hasField("three"), equalTo(false));
+    }
+
+    public void testSetMetadata() throws Exception {
+        String fieldName = RandomDocumentPicks.randomFieldName(random());
+        IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
+        doc.setFieldValue(fieldName, "abc23");
+        Map<String, String> patternBank = new HashMap<>();
+        patternBank.put("ONE", "1");
+        patternBank.put("TWO", "2");
+        patternBank.put("THREE", "3");
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), patternBank,
+            Arrays.asList("%{ONE:one}", "%{TWO:two}", "%{THREE:three}"), fieldName, true);
+        processor.execute(doc);
+        assertThat(doc.hasField("one"), equalTo(false));
+        assertThat(doc.getFieldValue("two", String.class), equalTo("2"));
+        assertThat(doc.hasField("three"), equalTo(false));
+        assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("1"));
+    }
+
+    public void testCombinedPatterns() {
+        String combined;
+        combined = GrokProcessor.combinePatterns(Arrays.asList(""), false);
+        assertThat(combined, equalTo(""));
+        combined = GrokProcessor.combinePatterns(Arrays.asList(""), true);
+        assertThat(combined, equalTo(""));
+        combined = GrokProcessor.combinePatterns(Arrays.asList("foo"), false);
+        assertThat(combined, equalTo("foo"));
+        combined = GrokProcessor.combinePatterns(Arrays.asList("foo"), true);
+        assertThat(combined, equalTo("foo"));
+        combined = GrokProcessor.combinePatterns(Arrays.asList("foo", "bar"), false);
+        assertThat(combined, equalTo("(?:foo)|(?:bar)"));
+        combined = GrokProcessor.combinePatterns(Arrays.asList("foo", "bar"), true);
+        assertThat(combined, equalTo("(?<_ingest._grok_match_index.0>foo)|(?<_ingest._grok_match_index.1>bar)"));
     }
 }

+ 43 - 3
modules/ingest-grok/src/test/resources/rest-api-spec/test/ingest_grok/20_grok.yaml

@@ -10,7 +10,7 @@
               {
                 "grok" : {
                   "field" : "field1",
-                  "pattern" : "%{NUMBER:val:float} %{NUMBER:status:int} <%{WORD:msg}>"
+                  "patterns" : ["%{NUMBER:val:float} %{NUMBER:status:int} <%{WORD:msg}>"]
                 }
               }
             ]
@@ -46,7 +46,7 @@
               {
                 "grok" : {
                   "field" : "field1",
-                  "pattern" : "<%{MY_PATTERN:msg}>",
+                  "patterns" : ["<%{MY_PATTERN:msg}>"],
                   "pattern_definitions" : {
                     "MY_PATTERN" : "foo"
                   }
@@ -83,7 +83,7 @@
               {
                 "grok" : {
                   "field" : "field1",
-                  "pattern" : "<%{NUMBER:msg}>",
+                  "patterns" : ["<%{NUMBER:msg}>"],
                   "pattern_definitions" : {
                     "NUMBER" : "foo"
                   }
@@ -107,3 +107,43 @@
         type: test
         id: 1
   - match: { _source.msg: "foo" }
+
+---
+"Test simulate with grok debugging enabled":
+  - do:
+      ingest.simulate:
+        body: >
+          {
+            "pipeline": {
+              "description": "_description",
+              "processors": [
+                {
+                  "grok" : {
+                    "field" : "field",
+                    "patterns" : ["%{ONE:one}", "%{TWO:two}"],
+                    "pattern_definitions" : {
+                      "ONE" : "1",
+                      "TWO" : "2"
+                    },
+                    "trace_match" : true
+                  }
+                }
+              ]
+            },
+            "docs": [
+              {
+                "_index": "index",
+                "_type": "type",
+                "_id": "id",
+                "_source": {
+                  "field": "abc2xyz"
+                }
+              }
+            ]
+          }
+  - length: { docs: 1 }
+  - match: { docs.0.doc._source.field: "abc2xyz" }
+  - match: { docs.0.doc._source.two: "2" }
+  - length: { docs.0.doc._ingest: 2 }
+  - match: { docs.0.doc._ingest._grok_match_index: "1" }
+  - is_true: docs.0.doc._ingest.timestamp

+ 2 - 2
qa/smoke-test-ingest-with-all-dependencies/src/test/resources/rest-api-spec/test/ingest/20_combine_processors.yaml

@@ -9,7 +9,7 @@
               {
                 "grok" : {
                   "field" : "log",
-                  "pattern": "%{COMBINEDAPACHELOG}"
+                  "patterns": ["%{COMBINEDAPACHELOG}"]
                 }
               },
               {
@@ -55,7 +55,7 @@
         index: test
         type: test
         id: 1
-  - length: { _source: 14 }
+  - length: { _source: 13 }
   - match: { _source.request: "/presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png" }
   - match: { _source.agent: "\"Mozilla/5.0 (Linux; Android 4.2.2; VS980 4G Build/JDQ39B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.135 Mobile Safari/537.36\"" }
   - match: { _source.auth: "-" }