Browse Source

Remove binary field after attachment processor execution (#79172)

Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended to remove that field from the document, which can be done by adding a `remove` processor in the pipeline.

This commit provides an easier way to do this by adding a new option, `remove_binary`.

When set to `true`, it automatically removes the field at the end of the processor execution.

It defaults to `false` to keep the existing behavior and not introduce any breaking change.
David Pilato 4 years ago
parent
commit
879ae33b17

+ 4 - 0
docs/plugins/ingest-attachment.asciidoc

@@ -28,6 +28,7 @@ include::install_remove.asciidoc[]
 | `indexed_chars_field`  | no        | `null`           | Field name from which you can overwrite the number of chars being used for extraction. See `indexed_chars`.
 | `properties`           | no        | all properties   | Array of properties to select to be stored. Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language`
 | `ignore_missing`       | no        | `false`          | If `true` and `field` does not exist, the processor quietly exits without modifying the document
+| `remove_binary`        | no        | `false`          | If `true`, the binary `field` will be removed from the document
 | `resource_name`        | no        |                  | Field containing the name of the resource to decode. If specified, the processor passes this resource name to the underlying Tika library to enable https://tika.apache.org/1.24.1/detection.html#Resource_Name_Based_Detection[Resource Name Based Detection].
 |======
 
@@ -94,6 +95,9 @@ The document's `attachment` object contains extracted properties for the file:
 ----
 // TESTRESPONSE[s/"_seq_no": \d+/"_seq_no" : $body._seq_no/ s/"_primary_term" : 1/"_primary_term" : $body._primary_term/]
 
+NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
+      to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.
+
 To extract only certain `attachment` fields, specify the `properties` array:
 
 [source,console]

+ 15 - 2
plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java

@@ -44,11 +44,13 @@ public final class AttachmentProcessor extends AbstractProcessor {
     private final Set<Property> properties;
     private final int indexedChars;
     private final boolean ignoreMissing;
+    private final boolean removeBinary;
     private final String indexedCharsField;
     private final String resourceName;
 
     AttachmentProcessor(String tag, String description, String field, String targetField, Set<Property> properties,
-                        int indexedChars, boolean ignoreMissing, String indexedCharsField, String resourceName) {
+                        int indexedChars, boolean ignoreMissing, String indexedCharsField, String resourceName,
+                        boolean removeBinary) {
         super(tag, description);
         this.field = field;
         this.targetField = targetField;
@@ -57,12 +59,18 @@ public final class AttachmentProcessor extends AbstractProcessor {
         this.ignoreMissing = ignoreMissing;
         this.indexedCharsField = indexedCharsField;
         this.resourceName = resourceName;
+        this.removeBinary = removeBinary;
     }
 
     boolean isIgnoreMissing() {
         return ignoreMissing;
     }
 
+    // For tests only
+    boolean isRemoveBinary() {
+        return removeBinary;
+    }
+
     @Override
     public IngestDocument execute(IngestDocument ingestDocument) {
         Map<String, Object> additionalFields = new HashMap<>();
@@ -162,6 +170,10 @@ public final class AttachmentProcessor extends AbstractProcessor {
         }
 
         ingestDocument.setFieldValue(targetField, additionalFields);
+
+        if (removeBinary) {
+            ingestDocument.removeField(field);
+        }
         return ingestDocument;
     }
 
@@ -200,6 +212,7 @@ public final class AttachmentProcessor extends AbstractProcessor {
             int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED);
             boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);
             String indexedCharsField = readOptionalStringProperty(TYPE, processorTag, config, "indexed_chars_field");
+            boolean removeBinary = readBooleanProperty(TYPE, processorTag, config, "remove_binary", false);
 
             final Set<Property> properties;
             if (propertyNames != null) {
@@ -217,7 +230,7 @@ public final class AttachmentProcessor extends AbstractProcessor {
             }
 
             return new AttachmentProcessor(processorTag, description, field, targetField, properties, indexedChars, ignoreMissing,
-                indexedCharsField, resourceName);
+                indexedCharsField, resourceName, removeBinary);
         }
     }
 

+ 15 - 0
plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorFactoryTests.java

@@ -124,4 +124,19 @@ public class AttachmentProcessorFactoryTests extends ESTestCase {
         assertThat(processor.getProperties(), sameInstance(AttachmentProcessor.Factory.DEFAULT_PROPERTIES));
         assertTrue(processor.isIgnoreMissing());
     }
+
+    public void testRemoveBinary() throws Exception {
+        Map<String, Object> config = new HashMap<>();
+        config.put("field", "_field");
+        config.put("remove_binary", true);
+
+        String processorTag = randomAlphaOfLength(10);
+
+        AttachmentProcessor processor = factory.create(null, processorTag, null, config);
+        assertThat(processor.getTag(), equalTo(processorTag));
+        assertThat(processor.getField(), equalTo("_field"));
+        assertThat(processor.getTargetField(), equalTo("attachment"));
+        assertThat(processor.getProperties(), sameInstance(AttachmentProcessor.Factory.DEFAULT_PROPERTIES));
+        assertTrue(processor.isRemoveBinary());
+    }
 }

+ 30 - 9
plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java

@@ -44,7 +44,7 @@ public class AttachmentProcessorTests extends ESTestCase {
     @Before
     public void createStandardProcessor() {
         processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null, null);
+            "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null, null, false);
     }
 
     public void testEnglishTextDocument() throws Exception {
@@ -77,7 +77,7 @@ public class AttachmentProcessorTests extends ESTestCase {
             selectedProperties.add(AttachmentProcessor.Property.DATE);
         }
         processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "target_field", selectedProperties, 10000, false, null, null);
+            "target_field", selectedProperties, 10000, false, null, null, false);
 
         Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
         assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length));
@@ -237,7 +237,7 @@ public class AttachmentProcessorTests extends ESTestCase {
             Collections.singletonMap("source_field", null));
         IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
         Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "randomTarget", null, 10, true, null, null);
+            "randomTarget", null, 10, true, null, null, false);
         processor.execute(ingestDocument);
         assertIngestDocument(originalIngestDocument, ingestDocument);
     }
@@ -246,7 +246,7 @@ public class AttachmentProcessorTests extends ESTestCase {
         IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
         IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
         Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "randomTarget", null, 10, true, null, null);
+            "randomTarget", null, 10, true, null, null, false);
         processor.execute(ingestDocument);
         assertIngestDocument(originalIngestDocument, ingestDocument);
     }
@@ -256,7 +256,7 @@ public class AttachmentProcessorTests extends ESTestCase {
             Collections.singletonMap("source_field", null));
         IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
         Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "randomTarget", null, 10, false, null, null);
+            "randomTarget", null, 10, false, null, null, false);
         Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
         assertThat(exception.getMessage(), equalTo("field [source_field] is null, cannot parse."));
     }
@@ -265,7 +265,7 @@ public class AttachmentProcessorTests extends ESTestCase {
         IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
         IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
         Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "randomTarget", null, 10, false, null, null);
+            "randomTarget", null, 10, false, null, null, false);
         Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
         assertThat(exception.getMessage(), equalTo("field [source_field] not present as part of path [source_field]"));
     }
@@ -299,7 +299,7 @@ public class AttachmentProcessorTests extends ESTestCase {
 
     public void testIndexedChars() throws Exception {
         processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null, null);
+            "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null, null, false);
 
         Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
 
@@ -310,7 +310,7 @@ public class AttachmentProcessorTests extends ESTestCase {
         assertThat(attachmentData.get("content_length"), is(19L));
 
         processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
-            "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length", null);
+            "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length", null, false);
 
         attachmentData = parseDocument("text-in-english.txt", processor);
 
@@ -341,7 +341,7 @@ public class AttachmentProcessorTests extends ESTestCase {
     public void testIndexedCharsWithResourceName() throws Exception {
         processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
             "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 100,
-            false, null, "resource_name");
+            false, null, "resource_name", false);
 
         Map<String, Object> attachmentData = parseDocument("text-cjk-big5.txt", processor, Collections.singletonMap("max_length", 100),
             true);
@@ -369,6 +369,27 @@ public class AttachmentProcessorTests extends ESTestCase {
         assertThat(attachmentData.get("content_length"), is(100L));
     }
 
+    public void testRemoveBinary() throws Exception {
+        {
+            // Test the default behavior.
+            Map<String, Object> document = new HashMap<>();
+            document.put("source_field", getAsBinaryOrBase64("text-in-english.txt"));
+            IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document);
+            processor.execute(ingestDocument);
+            assertThat(ingestDocument.hasField("source_field"), is(true));
+        }
+        {
+            // Remove the binary field.
+            processor = new AttachmentProcessor(randomAlphaOfLength(10), null, "source_field",
+                "target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null, null, true);
+            Map<String, Object> document = new HashMap<>();
+            document.put("source_field", getAsBinaryOrBase64("text-in-english.txt"));
+            IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document);
+            processor.execute(ingestDocument);
+            assertThat(ingestDocument.hasField("source_field"), is(false));
+        }
+    }
+
     private Object getAsBinaryOrBase64(String filename) throws Exception {
         String path = "/org/elasticsearch/ingest/attachment/test/sample-files/" + filename;
         try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) {