Browse Source

Tag redacted document in ingest pipeline (#113552) (#113750)

Adds a new option trace_redact in redact processor to indicate a document has been redacted in the ingest pipeline. If a document is processed by a redact processor AND any field is redacted, ingest metadata _ingest._redact._is_redacted = true will be set.

Closes #94633

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
Sam Xiao 1 year ago
parent
commit
d405df9679

+ 5 - 0
docs/changelog/113552.yaml

@@ -0,0 +1,5 @@
+pr: 113552
+summary: Tag redacted document in ingest metadata
+area: Ingest Node
+type: enhancement
+issues: []

+ 1 - 0
docs/reference/ingest/processors/redact.asciidoc

@@ -39,6 +39,7 @@ patterns. Legacy Grok patterns are not supported.
 | `ignore_missing`       | no        | `true`              | If `true` and `field` does not exist or is `null`, the processor quietly exits without modifying the document
 include::common-options.asciidoc[]
 | `skip_if_unlicensed`   | no        | `false`             | If `true` and the current license does not support running redact processors, then the processor quietly exits without modifying the document
+| `trace_redact`         | no        | `false`             | If `true` then ingest metadata `_ingest._redact._is_redacted` is set to `true` if the document has been redacted
 |======
 
 In this example the predefined `IP` Grok pattern is used to match

+ 32 - 2
x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java

@@ -55,6 +55,12 @@ public class RedactProcessor extends AbstractProcessor {
     private static final String DEFAULT_REDACTED_START = "<";
     private static final String DEFAULT_REDACTED_END = ">";
 
+    protected static final String REDACT_KEY = "_redact";
+    protected static final String IS_REDACTED_KEY = "_is_redacted";
+    protected static final String METADATA_PATH_REDACT = IngestDocument.INGEST_KEY + "." + REDACT_KEY;
+    // indicates if document has been redacted, path: _ingest._redact._is_redacted
+    protected static final String METADATA_PATH_REDACT_IS_REDACTED = METADATA_PATH_REDACT + "." + IS_REDACTED_KEY;
+
     private final String redactField;
     private final List<Grok> groks;
     private final boolean ignoreMissing;
@@ -65,6 +71,8 @@ public class RedactProcessor extends AbstractProcessor {
     private final XPackLicenseState licenseState;
     private final boolean skipIfUnlicensed;
 
+    private final boolean traceRedact;
+
     RedactProcessor(
         String tag,
         String description,
@@ -76,7 +84,8 @@ public class RedactProcessor extends AbstractProcessor {
         String redactedEndToken,
         MatcherWatchdog matcherWatchdog,
         XPackLicenseState licenseState,
-        boolean skipIfUnlicensed
+        boolean skipIfUnlicensed,
+        boolean traceRedact
     ) {
         super(tag, description);
         this.redactField = redactField;
@@ -94,6 +103,7 @@ public class RedactProcessor extends AbstractProcessor {
         }
         this.licenseState = licenseState;
         this.skipIfUnlicensed = skipIfUnlicensed;
+        this.traceRedact = traceRedact;
     }
 
     @Override
@@ -128,6 +138,8 @@ public class RedactProcessor extends AbstractProcessor {
         try {
             String redacted = matchRedact(fieldValue, groks, redactedStartToken, redactedEndToken);
             ingestDocument.setFieldValue(redactField, redacted);
+            updateMetadataIfNecessary(ingestDocument, fieldValue, redacted);
+
             return ingestDocument;
         } catch (RuntimeException e) {
             // grok throws a RuntimeException when the watchdog interrupts the match
@@ -203,6 +215,21 @@ public class RedactProcessor extends AbstractProcessor {
         } while (offset != length);
     }
 
+    private void updateMetadataIfNecessary(IngestDocument ingestDocument, String fieldValue, String redacted) {
+        if (traceRedact == false || fieldValue == null) {
+            return;
+        }
+
+        Boolean isRedactedMetadata = ingestDocument.getFieldValue(METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true);
+        boolean alreadyRedacted = Boolean.TRUE.equals(isRedactedMetadata);
+        boolean isRedacted = fieldValue.equals(redacted) == false;
+
+        // document newly redacted
+        if (alreadyRedacted == false && isRedacted) {
+            ingestDocument.setFieldValue(METADATA_PATH_REDACT_IS_REDACTED, true);
+        }
+    }
+
     /**
      * A Grok capture extractor which tracks matched regions
      * and the Grok pattern name for redaction later.
@@ -389,6 +416,8 @@ public class RedactProcessor extends AbstractProcessor {
             String redactStart = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "prefix", DEFAULT_REDACTED_START);
             String redactEnd = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "suffix", DEFAULT_REDACTED_END);
 
+            boolean traceRedact = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_redact", false);
+
             if (matchPatterns == null || matchPatterns.isEmpty()) {
                 throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty");
             }
@@ -406,7 +435,8 @@ public class RedactProcessor extends AbstractProcessor {
                     redactEnd,
                     matcherWatchdog,
                     licenseState,
-                    skipIfUnlicensed
+                    skipIfUnlicensed,
+                    traceRedact
                 );
             } catch (Exception e) {
                 throw newConfigurationException(

+ 1 - 0
x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java

@@ -68,6 +68,7 @@ public class RedactProcessorFactoryTests extends ESTestCase {
         config.put("patterns", List.of("%{MY_PATTERN:name}!"));
         config.put("pattern_definitions", Map.of("MY_PATTERN", "foo"));
         config.put("ignore_missing", true);
+        config.put("trace_redact", true);
         config.put("extra", "unused");
 
         factory.create(null, null, null, config);

+ 114 - 1
x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java

@@ -259,7 +259,8 @@ public class RedactProcessorTests extends ESTestCase {
                 ">",
                 MatcherWatchdog.noop(),
                 notAllowed,
-                false // set skip_if_unlicensed to false, we do not want to skip, we do want to fail
+                false, // set skip_if_unlicensed to false, we do not want to skip, we do want to fail
+                false
             );
             assertThat(processor.getSkipIfUnlicensed(), equalTo(false));
             var ingestDoc = createIngestDoc(Map.of("not_the_field", "fieldValue"));
@@ -314,6 +315,118 @@ public class RedactProcessorTests extends ESTestCase {
         }
     }
 
+    @SuppressWarnings("unchecked")
+    public void testTraceRedact() throws Exception {
+        var config = new HashMap<String, Object>();
+        config.put("field", "to_redact");
+        config.put("patterns", List.of("%{EMAILADDRESS:REDACTED}"));
+        config.put("trace_redact", true);
+        {
+            var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
+                null,
+                "t",
+                "d",
+                new HashMap<>(config)
+            );
+            var message = "this should not be redacted";
+            var ingestDoc = createIngestDoc(Map.of("to_redact", message));
+            var redactedDoc = processor.execute(ingestDoc);
+
+            assertEquals(message, redactedDoc.getFieldValue("to_redact", String.class));
+            assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true));
+        }
+        {
+            var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
+                null,
+                "t",
+                "d",
+                new HashMap<>(config)
+            );
+            var ingestDoc = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted"));
+            var redactedDoc = processor.execute(ingestDoc);
+
+            assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class));
+            // validate ingest metadata path correctly resolved
+            assertTrue(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));
+            // validate ingest metadata structure correct
+            var ingestMeta = redactedDoc.getIngestMetadata();
+            assertTrue(ingestMeta.containsKey(RedactProcessor.REDACT_KEY));
+            var redactMetadata = (HashMap<String, Object>) ingestMeta.get(RedactProcessor.REDACT_KEY);
+            assertTrue(redactMetadata.containsKey(RedactProcessor.IS_REDACTED_KEY));
+            assertTrue((Boolean) redactMetadata.get(RedactProcessor.IS_REDACTED_KEY));
+        }
+        {
+            var configNoTrace = new HashMap<String, Object>();
+            configNoTrace.put("field", "to_redact");
+            configNoTrace.put("patterns", List.of("%{EMAILADDRESS:REDACTED}"));
+
+            var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(null, "t", "d", configNoTrace);
+            var ingestDoc = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted"));
+            var redactedDoc = processor.execute(ingestDoc);
+
+            assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class));
+            assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true));
+        }
+    }
+
+    public void testTraceRedactMultipleProcessors() throws Exception {
+        var configRedact = new HashMap<String, Object>();
+        configRedact.put("field", "to_redact");
+        configRedact.put("patterns", List.of("%{EMAILADDRESS:REDACTED}"));
+        configRedact.put("trace_redact", true);
+
+        var configNoRedact = new HashMap<String, Object>();
+        configNoRedact.put("field", "to_redact");
+        configNoRedact.put("patterns", List.of("%{IP:REDACTED}"));  // not in the doc
+        configNoRedact.put("trace_redact", true);
+
+        // first processor does not redact doc, second one does
+        {
+            var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
+                null,
+                "t1",
+                "d",
+                new HashMap<>(configRedact)
+            );
+            var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
+                null,
+                "t2",
+                "d",
+                new HashMap<>(configNoRedact)
+            );
+            var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted"));
+
+            var docNotRedacted = processorNoRedact.execute(ingestDocWithEmail);
+            assertNull(docNotRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true));
+
+            var docRedacted = processorRedact.execute(docNotRedacted);
+            assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));
+        }
+        // first processor redacts doc, second one does not
+        {
+            var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
+                null,
+                "t1",
+                "d",
+                new HashMap<>(configRedact)
+            );
+            var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
+                null,
+                "t2",
+                "d",
+                new HashMap<>(configNoRedact)
+            );
+            var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted"));
+
+            var docRedacted = processorRedact.execute(ingestDocWithEmail);
+            assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));
+
+            // validate does not override already redacted doc metadata
+            var docRedactedAlready = processorNoRedact.execute(docRedacted);
+            assertTrue(docRedactedAlready.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));
+        }
+    }
+
     public void testMergeLongestRegion() {
         var r = List.of(
             new RedactProcessor.RegionTrackingMatchExtractor.Replacement(10, 20, "first"),

+ 23 - 1
x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml

@@ -24,7 +24,7 @@
         index: test
         id: "1"
         pipeline: "pipeline-using-a-redact-processor"
-        body: {to_redact: "0.0.0.1 is my secret IP to redact"}
+        body: { to_redact: "0.0.0.1 is my secret IP to redact" }
 
   - do:
       get:
@@ -96,3 +96,25 @@
           }
   - length: { docs: 1 }
   - match: { docs.0.doc._source.to_redact: "==*EMAIL*== will be redacted" }
+---
+"Test redact with trace_redact":
+  - do:
+      ingest.simulate:
+        body: >
+          {
+            "pipeline": {
+              "processors": [
+                {
+                  "redact": {
+                    "field": "to_redact",
+                    "patterns": ["%{EMAILADDRESS:EMAIL}", "%{IP:IP_ADDRESS}"],
+                    "trace_redact": true
+                  }
+                }
+              ]
+            },
+            "docs": [{"_source": {"to_redact": "this-email@address.com will be redacted"}}]
+          }
+  - length: { docs: 1 }
+  - match: { docs.0.doc._source.to_redact: "<EMAIL> will be redacted" }
+  - match: { docs.0.doc._ingest._redact._is_redacted: true }