Pārlūkot izejas kodu

fix trace_match behavior for when there is only one grok pattern (#21413)

There is an issue in the Grok Processor, where trace_match: true does not inject the _ingest._grok_match_index into the ingest-document when there is just one pattern provided. This is due to an optimization in the regex construction. This commit adds a check for when this is the case, and injects a static index value of "0", since there is only one pattern matched (at the first index into the patterns).

To make this clearer, more documentation was added to the grok-processor docs.

Fixes #21371.
Tal Levy 9 gadi atpakaļ
vecāks
revīzija
04b712bdc5

+ 135 - 2
docs/reference/ingest/ingest-node.asciidoc

@@ -931,14 +931,14 @@ and the result:
           "date1" : "2016-04-25T12:02:01.789Z"
         },
         "_ingest" : {
-          "timestamp" : "2016-08-11T12:00:01.222Z"
+          "timestamp" : "2016-11-08T19:43:03.850+0000"
         }
       }
     }
   ]
 }
 --------------------------------------------------
-// TESTRESPONSE[s/2016-08-11T12:00:01.222Z/$body.docs.0.doc._ingest.timestamp/]
+// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/]
 
 The above example shows that `_index` was set to `<myindex-{2016-04-25||/M{yyyy-MM-dd|UTC}}>`. Elasticsearch
 understands this to mean `2016-04-01` as is explained in the <<date-math-index-names, date math index name documentation>>
@@ -1278,6 +1278,139 @@ Here is an example of a pipeline specifying custom pattern definitions:
 }
 --------------------------------------------------
 
+[[trace-match]]
+==== Providing Multiple Match Patterns
+
+Sometimes one pattern is not enough to capture the potential structure of a field. Let's assume we
+want to match all messages that contain your favorite pet breeds of either cats or dogs. One way to accomplish
+this is to provide two distinct patterns that can be matched, instead of one really complicated expression capturing
+the same `or` behavior.
+
+Here is an example of such a configuration executed against the simulate API:
+
+[source,js]
+--------------------------------------------------
+POST _ingest/pipeline/_simulate
+{
+  "pipeline": {
+  "description" : "parse multiple patterns",
+  "processors": [
+    {
+      "grok": {
+        "field": "message",
+        "patterns": ["%{FAVORITE_DOG:pet}", "%{FAVORITE_CAT:pet}"],
+        "pattern_definitions" : {
+          "FAVORITE_DOG" : "beagle",
+          "FAVORITE_CAT" : "burmese"
+        }
+      }
+    }
+  ]
+},
+"docs":[
+  {
+    "_source": {
+      "message": "I love burmese cats!"
+    }
+  }
+  ]
+}
+--------------------------------------------------
+// CONSOLE
+
+response:
+
+[source,js]
+--------------------------------------------------
+{
+  "docs": [
+    {
+      "doc": {
+        "_type": "_type",
+        "_index": "_index",
+        "_id": "_id",
+        "_source": {
+          "message": "I love burmese cats!",
+          "pet": "burmese"
+        },
+        "_ingest": {
+          "timestamp": "2016-11-08T19:43:03.850+0000"
+        }
+      }
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/]
+
+Both patterns will set the field `pet` with the appropriate match, but what if we want to trace which of our
+patterns matched and populated our fields? We can do this with the `trace_match` parameter. Here is the output of
+that same pipeline, but with `"trace_match": true` configured:
+
+////
+Hidden setup for example:
+[source,js]
+--------------------------------------------------
+POST _ingest/pipeline/_simulate
+{
+  "pipeline": {
+  "description" : "parse multiple patterns",
+  "processors": [
+    {
+      "grok": {
+        "field": "message",
+        "patterns": ["%{FAVORITE_DOG:pet}", "%{FAVORITE_CAT:pet}"],
+        "trace_match": true,
+        "pattern_definitions" : {
+          "FAVORITE_DOG" : "beagle",
+          "FAVORITE_CAT" : "burmese"
+        }
+      }
+    }
+  ]
+},
+"docs":[
+  {
+    "_source": {
+      "message": "I love burmese cats!"
+    }
+  }
+  ]
+}
+--------------------------------------------------
+// CONSOLE
+////
+
+[source,js]
+--------------------------------------------------
+{
+  "docs": [
+    {
+      "doc": {
+        "_type": "_type",
+        "_index": "_index",
+        "_id": "_id",
+        "_source": {
+          "message": "I love burmese cats!",
+          "pet": "burmese"
+        },
+        "_ingest": {
+          "_grok_match_index": "1",
+          "timestamp": "2016-11-08T19:43:03.850+0000"
+        }
+      }
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE[s/2016-11-08T19:43:03.850\+0000/$body.docs.0.doc._ingest.timestamp/]
+
+In the above response, you can see that the index of the pattern that matched was `"1"`. This is to say that it was the
+second (index starts at zero) pattern in `patterns` to match.
+
+This trace metadata enables debugging which of the patterns matched. This information is stored in the ingest
+metadata and will not be indexed.
+
 [[gsub-processor]]
 === Gsub Processor
 Converts a string field by applying a regular expression and a replacement.

+ 15 - 5
modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/GrokProcessor.java

@@ -37,6 +37,7 @@ public final class GrokProcessor extends AbstractProcessor {
     private static final String PATTERN_MATCH_KEY = "_ingest._grok_match_index";
 
     private final String matchField;
+    private final List<String> matchPatterns;
     private final Grok grok;
     private final boolean traceMatch;
     private final boolean ignoreMissing;
@@ -45,6 +46,7 @@ public final class GrokProcessor extends AbstractProcessor {
                          boolean traceMatch, boolean ignoreMissing) {
         super(tag);
         this.matchField = matchField;
+        this.matchPatterns = matchPatterns;
         this.grok = new Grok(patternBank, combinePatterns(matchPatterns, traceMatch));
         this.traceMatch = traceMatch;
         this.ignoreMissing = ignoreMissing;
@@ -79,11 +81,15 @@ public final class GrokProcessor extends AbstractProcessor {
             .forEach((e) -> ingestDocument.setFieldValue(e.getKey(), e.getValue()));
 
         if (traceMatch) {
-            @SuppressWarnings("unchecked")
-            HashMap<String, String> matchMap = (HashMap<String, String>) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class);
-            matchMap.keySet().stream().findFirst().ifPresent((index) -> {
-                ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index);
-            });
+            if (matchPatterns.size() > 1) {
+                @SuppressWarnings("unchecked")
+                HashMap<String, String> matchMap = (HashMap<String, String>) ingestDocument.getFieldValue(PATTERN_MATCH_KEY, Object.class);
+                matchMap.keySet().stream().findFirst().ifPresent((index) -> {
+                    ingestDocument.setFieldValue(PATTERN_MATCH_KEY, index);
+                });
+            } else {
+                ingestDocument.setFieldValue(PATTERN_MATCH_KEY, "0");
+            }
         }
     }
 
@@ -104,6 +110,10 @@ public final class GrokProcessor extends AbstractProcessor {
         return matchField;
     }
 
+    List<String> getMatchPatterns() {
+        return matchPatterns;
+    }
+
     static String combinePatterns(List<String> patterns, boolean traceMatch) {
         String combinedPattern;
         if (patterns.size() > 1) {

+ 13 - 0
modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/GrokProcessorTests.java

@@ -158,6 +158,19 @@ public class GrokProcessorTests extends ESTestCase {
         assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("1"));
     }
 
+    public void testTraceWithOnePattern() throws Exception {
+        String fieldName = RandomDocumentPicks.randomFieldName(random());
+        IngestDocument doc = RandomDocumentPicks.randomIngestDocument(random(), new HashMap<>());
+        doc.setFieldValue(fieldName, "first1");
+        Map<String, String> patternBank = new HashMap<>();
+        patternBank.put("ONE", "1");
+        GrokProcessor processor = new GrokProcessor(randomAsciiOfLength(10), patternBank,
+            Arrays.asList("%{ONE:one}"), fieldName, true, false);
+        processor.execute(doc);
+        assertThat(doc.hasField("one"), equalTo(true));
+        assertThat(doc.getFieldValue("_ingest._grok_match_index", String.class), equalTo("0"));
+    }
+
     public void testCombinedPatterns() {
         String combined;
         combined = GrokProcessor.combinePatterns(Arrays.asList(""), false);