Browse Source

Merge branch 'fix/22077-ingest-attachment'

David Pilato 8 years ago
parent
commit
76675229c7

+ 2 - 0
plugins/ingest-attachment/build.gradle

@@ -74,9 +74,11 @@ dependencyLicenses {
 }
 
 forbiddenPatterns { // file globs left out of this check; *.doc and *.vsdx are the new entries
+  exclude '**/*.doc' // covers the issue-22077.doc sample file under src/test/resources
   exclude '**/*.docx'
   exclude '**/*.pdf'
   exclude '**/*.epub'
+  exclude '**/*.vsdx' // covers the issue-22077.vsdx sample file under src/test/resources
 }
 
 thirdPartyAudit.excludes = [

+ 8 - 1
plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java

@@ -22,8 +22,10 @@ package org.elasticsearch.ingest.attachment;
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.elasticsearch.SpecialPermission;
 import org.elasticsearch.bootstrap.JarHell;
 import org.elasticsearch.common.SuppressForbidden;
@@ -45,7 +47,9 @@ import java.security.PrivilegedActionException;
 import java.security.PrivilegedExceptionAction;
 import java.security.ProtectionDomain;
 import java.security.SecurityPermission;
+import java.util.Collections;
 import java.util.PropertyPermission;
+import java.util.Set;
 
 /**
  * Runs tika with limited parsers and limited permissions.
@@ -54,6 +58,9 @@ import java.util.PropertyPermission;
  */
 final class TikaImpl {
 
+    /** Media types passed to {@code ParserDecorator.withoutTypes} for the OOXML parser entry in {@code PARSERS}; currently only {@code x-tika-ooxml}. */
+    private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
+
     /** subset of parsers for types we support */
     private static final Parser PARSERS[] = new Parser[] {
         // documents
@@ -63,7 +70,7 @@ final class TikaImpl {
         new org.apache.tika.parser.txt.TXTParser(),
         new org.apache.tika.parser.microsoft.OfficeParser(),
         new org.apache.tika.parser.microsoft.OldExcelParser(),
-        new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
+        ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
         new org.apache.tika.parser.odf.OpenDocumentParser(),
         new org.apache.tika.parser.iwork.IWorkPackageParser(),
         new org.apache.tika.parser.xml.DcXMLParser(),

+ 36 - 0
plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java

@@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notNullValue;
+import static org.hamcrest.Matchers.nullValue;
 import static org.hamcrest.core.IsCollectionContaining.hasItem;
 
 public class AttachmentProcessorTests extends ESTestCase {
@@ -130,6 +131,34 @@ public class AttachmentProcessorTests extends ESTestCase {
             is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
     }
 
+    public void testWordDocumentWithVisioSchema() throws Exception { // regression test using the issue-22077.docx sample file
+        Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);
+        // The sample must be handled like a regular Word document: all six fields are expected.
+        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
+            "content_length")); // exactly these keys, in any order
+        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); // text was actually extracted
+        assertThat(attachmentData.get("language"), is("en"));
+        assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
+        assertThat(attachmentData.get("author"), is(notNullValue())); // only presence is checked, not the value
+        assertThat(attachmentData.get("content_length"), is(notNullValue())); // only presence is checked, not the value
+        assertThat(attachmentData.get("content_type").toString(),
+            is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); // reported as a wordprocessingml document
+    }
+
+    public void testLegacyWordDocumentWithVisioSchema() throws Exception { // same checks as the .docx variant, using the issue-22077.doc sample file
+        Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);
+        // All six fields are expected for the .doc sample as well.
+        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
+            "content_length")); // exactly these keys, in any order
+        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); // text was actually extracted
+        assertThat(attachmentData.get("language"), is("en"));
+        assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
+        assertThat(attachmentData.get("author"), is(notNullValue())); // only presence is checked, not the value
+        assertThat(attachmentData.get("content_length"), is(notNullValue())); // only presence is checked, not the value
+        assertThat(attachmentData.get("content_type").toString(),
+            is("application/msword")); // expected content type for the .doc sample
+    }
+
     public void testPdf() throws Exception {
         Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
         assertThat(attachmentData.get("content"),
@@ -138,6 +167,13 @@ public class AttachmentProcessorTests extends ESTestCase {
         assertThat(attachmentData.get("content_length"), is(notNullValue()));
     }
 
+    public void testVisioIsExcluded() throws Exception { // the issue-22077.vsdx sample must yield no extracted text
+        Map<String, Object> attachmentData = parseDocument("issue-22077.vsdx", processor);
+        assertThat(attachmentData.get("content"), nullValue()); // no content field is produced
+        assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing")); // the content type is still reported
+        assertThat(attachmentData.get("content_length"), is(0L)); // length is reported as a long zero
+    }
+
     public void testEncryptedPdf() throws Exception {
         ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
         assertThat(e.getDetailedMessage(), containsString("document is encrypted"));

BIN
plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc


BIN
plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx


BIN
plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx


BIN
plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files/testPPT.potm.zip