Browse Source

Upgrading to tika 3.2.2 (#133410)

Keith Massey 2 months ago
parent
commit
6985093b6f

+ 5 - 0
docs/changelog/133410.yaml

@@ -0,0 +1,5 @@
+pr: 133410
+summary: Upgrading to tika 3.2.2
+area: Ingest Node
+type: upgrade
+issues: []

+ 88 - 88
gradle/verification-metadata.xml

@@ -1389,9 +1389,9 @@
             <sha256 value="f700de80ac270d0344fdea7468201d8b9c805e5c648331c3619f2ee067ccfc59" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="commons-codec" name="commons-codec" version="1.18.0">
-         <artifact name="commons-codec-1.18.0.jar">
-            <sha256 value="ba005f304cef92a3dede24a38ad5ac9b8afccf0d8f75839d6c1338634cf7f6e4" origin="Generated by Gradle"/>
+      <component group="commons-codec" name="commons-codec" version="1.19.0">
+         <artifact name="commons-codec-1.19.0.jar">
+            <sha256 value="5c3881e4f556855e9c532927ee0c9dfde94cc66760d5805c031a59887070af5f" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="commons-codec" name="commons-codec" version="1.9">
@@ -1439,11 +1439,6 @@
             <sha256 value="4aa4ca48f3dfd30b78220b7881d8cb93eac4093ec94361b6befa9487998a550b" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="commons-io" name="commons-io" version="2.18.0">
-         <artifact name="commons-io-2.18.0.jar">
-            <sha256 value="f3ca0f8d63c40e23a56d54101c60d5edee136b42d84bfb85bc7963093109cf8b" origin="Generated by Gradle"/>
-         </artifact>
-      </component>
       <component group="commons-io" name="commons-io" version="2.2">
          <artifact name="commons-io-2.2.jar">
             <sha256 value="675f60bd11a82d481736591fe4054c66471fa5463d45616652fd71585792ba87" origin="Generated by Gradle"/>
@@ -2122,9 +2117,9 @@
             <sha256 value="5ba0a81f4b0769122b6045b98bb9bbba5f2c69dbf736a6cc7ca4eb603c337487" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.commons" name="commons-collections4" version="4.4">
-         <artifact name="commons-collections4-4.4.jar">
-            <sha256 value="1df8b9430b5c8ed143d7815e403e33ef5371b2400aadbe9bda0883762e0846d1" origin="Generated by Gradle"/>
+      <component group="org.apache.commons" name="commons-collections4" version="4.5.0">
+         <artifact name="commons-collections4-4.5.0.jar">
+            <sha256 value="00f93263c267be201b8ae521b44a7137271b16688435340bf629db1bac0a5845" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.apache.commons" name="commons-compress" version="1.21">
@@ -2147,9 +2142,9 @@
             <sha256 value="9168a03141d8fc7eda21a2360d83cc0412bcbb1d6204d992bd48c2573cb3c6b8" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.commons" name="commons-compress" version="1.27.1">
-         <artifact name="commons-compress-1.27.1.jar">
-            <sha256 value="293d80f54b536b74095dcd7ea3cf0a29bbfc3402519281332495f4420d370d16" origin="Generated by Gradle"/>
+      <component group="org.apache.commons" name="commons-compress" version="1.28.0">
+         <artifact name="commons-compress-1.28.0.jar">
+            <sha256 value="e1522945218456f3649a39bc4afd70ce4bd466221519dba7d378f2141a4642ca" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.apache.commons" name="commons-compress" version="1.4.1">
@@ -2182,9 +2177,9 @@
             <sha256 value="7b96bf3ee68949abb5bc465559ac270e0551596fa34523fddf890ec418dde13c" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.commons" name="commons-lang3" version="3.17.0">
-         <artifact name="commons-lang3-3.17.0.jar">
-            <sha256 value="6ee731df5c8e5a2976a1ca023b6bb320ea8d3539fbe64c8a1d5cb765127c33b4" origin="Generated by Gradle"/>
+      <component group="org.apache.commons" name="commons-lang3" version="3.18.0">
+         <artifact name="commons-lang3-3.18.0.jar">
+            <sha256 value="4eeeae8d20c078abb64b015ec158add383ac581571cddc45c68f0c9ae0230720" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.apache.commons" name="commons-lang3" version="3.7">
@@ -2805,14 +2800,14 @@
             <sha256 value="a121f4b14ec525e54e29b9f5db7b93f4a97e088774e81c7143b5198f67d81bec" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.james" name="apache-mime4j-core" version="0.8.12">
-         <artifact name="apache-mime4j-core-0.8.12.jar">
-            <sha256 value="b2180c13b97ade21edb5f52581ade0a6f82b5084bb9ca5bdf83584deb6225a69" origin="Generated by Gradle"/>
+      <component group="org.apache.james" name="apache-mime4j-core" version="0.8.13">
+         <artifact name="apache-mime4j-core-0.8.13.jar">
+            <sha256 value="00496c123926395d59e5dfdfc8342c607600c6c9e6e6dcab981a673b62481cdf" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.james" name="apache-mime4j-dom" version="0.8.12">
-         <artifact name="apache-mime4j-dom-0.8.12.jar">
-            <sha256 value="d8de21f9091a0109bdfe68d323f2a5ffb326922f8493f88b1203a04a69198940" origin="Generated by Gradle"/>
+      <component group="org.apache.james" name="apache-mime4j-dom" version="0.8.13">
+         <artifact name="apache-mime4j-dom-0.8.13.jar">
+            <sha256 value="b31d88db955079cd3be745b21ef27c76ab868306688a7e54ad75646e916bfd67" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.apache.kerby" name="kerb-admin" version="1.0.1">
@@ -3305,9 +3300,9 @@
             <sha256 value="39b2dfc8e84380bf7adab657d3d5e1625cb6592a885ebdb854ec5c6f7a3ec88d" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.pdfbox" name="fontbox" version="2.0.33">
-         <artifact name="fontbox-2.0.33.jar">
-            <sha256 value="95e16863508697a2b4a3a8b8996919cd426b9b01696e10964e42523496854c5b" origin="Generated by Gradle"/>
+      <component group="org.apache.pdfbox" name="fontbox" version="3.0.5">
+         <artifact name="fontbox-3.0.5.jar">
+            <sha256 value="e8a62be2df27a0d44191b6669c0b18df6efe5271232db8dcb8745d5d9774755b" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.apache.pdfbox" name="jempbox" version="1.8.17">
@@ -3315,29 +3310,34 @@
             <sha256 value="ded9c81038dd1bbcba18f07e1028d70c9ceaf0b48ac56cea8ab6ec2c255fc1b3" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.pdfbox" name="pdfbox" version="2.0.33">
-         <artifact name="pdfbox-2.0.33.jar">
-            <sha256 value="34e104a3526925419a3671f8eb3f38565890f0f0106c659f2f8e7ce87b46d490" origin="Generated by Gradle"/>
+      <component group="org.apache.pdfbox" name="pdfbox" version="3.0.5">
+         <artifact name="pdfbox-3.0.5.jar">
+            <sha256 value="f0e5d3a1e573c707e4fbcc2ee8e42ea8ca1d5261bdcb3a05a08d2118553c1e5a" origin="Generated by Gradle"/>
+         </artifact>
+      </component>
+      <component group="org.apache.pdfbox" name="pdfbox-io" version="3.0.5">
+         <artifact name="pdfbox-io-3.0.5.jar">
+            <sha256 value="6df3f3b4db4fd55ef502847ea4e4ebc58e28908800e86eab031345efe219b705" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.poi" name="poi" version="5.4.0">
-         <artifact name="poi-5.4.0.jar">
-            <sha256 value="ace71e79873059e273036674560b50c3d6b945b7ca168b0d4962ad7650ae1eec" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi" version="5.4.1">
+         <artifact name="poi-5.4.1.jar">
+            <sha256 value="da5abf42da4604c5a7bca38956af6e9d6f196d9b6d4cb7eabee4f480b580d505" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.poi" name="poi-ooxml" version="5.4.0">
-         <artifact name="poi-ooxml-5.4.0.jar">
-            <sha256 value="98693442ed7d44791de4a57962b6c820ae678e0eba9cf854681b62ff62c9611d" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi-ooxml" version="5.4.1">
+         <artifact name="poi-ooxml-5.4.1.jar">
+            <sha256 value="fd200c9e6f74d704160a97e9d52041995ed87439454530001edd920688f19f53" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.poi" name="poi-ooxml-lite" version="5.4.0">
-         <artifact name="poi-ooxml-lite-5.4.0.jar">
-            <sha256 value="bb5a8a6c833279ced51afb6042aa15ae5d5ca312ee682e570e23917b522b079e" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi-ooxml-lite" version="5.4.1">
+         <artifact name="poi-ooxml-lite-5.4.1.jar">
+            <sha256 value="dc590461efdfcd4f27e2a892737979ab5e30b4132a7adfc7c9e56447b71a45b0" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.poi" name="poi-scratchpad" version="5.4.0">
-         <artifact name="poi-scratchpad-5.4.0.jar">
-            <sha256 value="6665792cde201f5828e38e0d214aa67d817e34de3e7a5946e6a488c4534b4561" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi-scratchpad" version="5.4.1">
+         <artifact name="poi-scratchpad-5.4.1.jar">
+            <sha256 value="6497ba15c1cba7062aa71661a8d776d321b1f998bb2bfa19b57d7e35606381f1" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.apache.rat" name="apache-rat" version="0.11">
@@ -3365,59 +3365,59 @@
             <sha256 value="3902794d36d9b81da1b7e697f21ed04ccae276cc116eecc640a4cd0fff2691f2" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-core" version="2.9.3">
-         <artifact name="tika-core-2.9.3.jar">
-            <sha256 value="ec1ce5791ed2c81867a45b183c87d6e2a9fe67f4920567b08fcae5a3edff0fee" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-core" version="3.2.2">
+         <artifact name="tika-core-3.2.2.jar">
+            <sha256 value="a34ba35d675f5a9733e3b60d5782edd8636a5c7785ac5e1c44125d025d57ad1f" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-langdetect-tika" version="2.9.3">
-         <artifact name="tika-langdetect-tika-2.9.3.jar">
-            <sha256 value="5b5075b838c75bef02052d9bef2631cb83fa245418113135f2e6cfab0c81c2a5" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-langdetect-tika" version="3.2.2">
+         <artifact name="tika-langdetect-tika-3.2.2.jar">
+            <sha256 value="1db5f0007440c70f1e22b99765ceefbb83e7d6b8344c8c9777e9a1af1d999529" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-apple-module" version="2.9.3">
-         <artifact name="tika-parser-apple-module-2.9.3.jar">
-            <sha256 value="798c334f71849fe1652bded5e518de59c485dbbf2dd2dede4bfcecf7a2ebd523" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-apple-module" version="3.2.2">
+         <artifact name="tika-parser-apple-module-3.2.2.jar">
+            <sha256 value="02385a1228ce85b44228653296d5d43b2cc6e45fc2d16989231b484a2a0f5cd4" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-html-module" version="2.9.3">
-         <artifact name="tika-parser-html-module-2.9.3.jar">
-            <sha256 value="6445ba6bc1dcc41301346099b47e3b88bb0a264a4f839206a5dc7a46bda78efa" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-html-module" version="3.2.2">
+         <artifact name="tika-parser-html-module-3.2.2.jar">
+            <sha256 value="598b9639bfc53acf4327bdbbdd62c86531b67f9fd2ced97828dd18789ab424c4" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-microsoft-module" version="2.9.3">
-         <artifact name="tika-parser-microsoft-module-2.9.3.jar">
-            <sha256 value="1da265d127f372042a4de1032e9af14e0179e7edda4bc8e9f051c68ce25a98ea" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-microsoft-module" version="3.2.2">
+         <artifact name="tika-parser-microsoft-module-3.2.2.jar">
+            <sha256 value="e773f4834587285b5f0a1af4cdac80de9610276e560fe1465f10e150128deb53" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-miscoffice-module" version="2.9.3">
-         <artifact name="tika-parser-miscoffice-module-2.9.3.jar">
-            <sha256 value="405a68372c64f1d83094555d7c5e3a88902c4166a3c5d0418e89fc23dba40727" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-miscoffice-module" version="3.2.2">
+         <artifact name="tika-parser-miscoffice-module-3.2.2.jar">
+            <sha256 value="bebe1f56ac457485ba43d842042d1917fb12404fdc43f4589e58cb6b3109f5db" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-pdf-module" version="2.9.3">
-         <artifact name="tika-parser-pdf-module-2.9.3.jar">
-            <sha256 value="b72f736be311b88662680e46275034da65f4d5fb91e5b0178323b36261f1222f" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-pdf-module" version="3.2.2">
+         <artifact name="tika-parser-pdf-module-3.2.2.jar">
+            <sha256 value="014cf5920257eb23824f0b2a22154586f5d8d370e9128854b63ff2524b695c50" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-text-module" version="2.9.3">
-         <artifact name="tika-parser-text-module-2.9.3.jar">
-            <sha256 value="ad50ef8150497b9a9bd655f19a619f9935d7c2f36986c1889bb85a3f024db4c2" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-text-module" version="3.2.2">
+         <artifact name="tika-parser-text-module-3.2.2.jar">
+            <sha256 value="5f82360f2a595012a0d44d829763429af9fe710f976afda4274b3f2f350501b6" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-xml-module" version="2.9.3">
-         <artifact name="tika-parser-xml-module-2.9.3.jar">
-            <sha256 value="0b84ca2adb58b8f15c234f75722cf8c63c0d3611dc6d9040c7a5fbff5e2705de" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-xml-module" version="3.2.2">
+         <artifact name="tika-parser-xml-module-3.2.2.jar">
+            <sha256 value="9f5319ed616f814eff578f2b256e831b2aa56cb694864deed7cda94c3f74021c" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-xmp-commons" version="2.9.3">
-         <artifact name="tika-parser-xmp-commons-2.9.3.jar">
-            <sha256 value="529dc45f7abb9dd7034f1f3c7a4a09c9147d8ef24619e96a74532e4f283a85a9" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-xmp-commons" version="3.2.2">
+         <artifact name="tika-parser-xmp-commons-3.2.2.jar">
+            <sha256 value="ab07e31b971f2442f2100db302317fb4964bbdbb1c6959499882e2e016ba5043" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.apache.tika" name="tika-parser-zip-commons" version="2.9.3">
-         <artifact name="tika-parser-zip-commons-2.9.3.jar">
-            <sha256 value="29167d467ae43893f4e0e1df072f8aa8011c85604df7ce039f50159818b47023" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-zip-commons" version="3.2.2">
+         <artifact name="tika-parser-zip-commons-3.2.2.jar">
+            <sha256 value="609006e0c9d67ac70e9d33e536dd6edc13f7cca5ddff91824bc61f42ff4409d1" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.apache.xmlbeans" name="xmlbeans" version="5.3.0">
@@ -3543,11 +3543,6 @@
             <sha256 value="19d292ffd189e5ff578bbfedc0076e922898f9af96b183dda4a23863cefa6fb8" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.ccil.cowan.tagsoup" name="tagsoup" version="1.2.1">
-         <artifact name="tagsoup-1.2.1.jar">
-            <sha256 value="ac97f7b4b1d8e9337edfa0e34044f8d0efe7223f6ad8f3a85d54cc1018ea2e04" origin="Generated by Gradle"/>
-         </artifact>
-      </component>
       <component group="org.checkerframework" name="checker-qual" version="2.10.0">
          <artifact name="checker-qual-2.10.0.jar">
             <sha256 value="d261fde25d590f6b69db7721d469ac1b0a19a17ccaaaa751c31f0d8b8260b894" origin="Generated by Gradle"/>
@@ -4228,6 +4223,11 @@
             <sha256 value="e2b99c0d2fa39f69f27efb1c0016390713feb2f2e02d8ea7f1c36b780271598a" origin="Generated by Gradle"/>
          </artifact>
       </component>
+      <component group="org.jsoup" name="jsoup" version="1.21.1">
+         <artifact name="jsoup-1.21.1.jar">
+            <sha256 value="436adf71fe9f326e04fe134cd2785b261f0f4b9b60876adda1de3b6919463394" origin="Generated by Gradle"/>
+         </artifact>
+      </component>
       <component group="org.junit.jupiter" name="junit-jupiter" version="5.12.1">
          <artifact name="junit-jupiter-5.12.1.jar">
             <sha256 value="228a94c9ee743d55bfea09b57630681ea7b20a661a477447f5c1f60dc12d9e3a" origin="Generated by Gradle"/>
@@ -4773,9 +4773,9 @@
             <sha256 value="a310bc79c3f4744e2b2e993702fcebaf3696fec0063643ffdc6b49a8fb03ef39" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.slf4j" name="jcl-over-slf4j" version="2.0.16">
-         <artifact name="jcl-over-slf4j-2.0.16.jar">
-            <sha256 value="5744d62c5af556e839ab922c9fa3f737f0a5971e478ba68b2eb5256b2842ec78" origin="Generated by Gradle"/>
+      <component group="org.slf4j" name="jcl-over-slf4j" version="2.0.17">
+         <artifact name="jcl-over-slf4j-2.0.17.jar">
+            <sha256 value="affd06771589ebfe454bb11315a4f466ecaa135b95f3e7939534cf1d2fd7064c" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.slf4j" name="slf4j-api" version="1.7.10">
@@ -4808,9 +4808,9 @@
             <sha256 value="fe30825245d2336c859dc38d60c0fc5f3668dbf29cd586828d2b5667ec355b91" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.slf4j" name="slf4j-api" version="2.0.16">
-         <artifact name="slf4j-api-2.0.16.jar">
-            <sha256 value="a12578dde1ba00bd9b816d388a0b879928d00bab3c83c240f7013bf4196c579a" origin="Generated by Gradle"/>
+      <component group="org.slf4j" name="slf4j-api" version="2.0.17">
+         <artifact name="slf4j-api-2.0.17.jar">
+            <sha256 value="7b751d952061954d5abfed7181c1f645d336091b679891591d63329c622eb832" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.slf4j" name="slf4j-api" version="2.0.6">
@@ -4828,9 +4828,9 @@
             <sha256 value="4d41e01c40caf8a6c74add2b073055d8a4ce1c30e58154177b13f12d78abbe7b" origin="Generated by Gradle"/>
          </artifact>
       </component>
-      <component group="org.slf4j" name="slf4j-nop" version="2.0.16">
-         <artifact name="slf4j-nop-2.0.16.jar">
-            <sha256 value="deca6c04ed35515a0a911fa44c0e836bee92c0c59d2e8fa9bab8ffbc464a9ba7" origin="Generated by Gradle"/>
+      <component group="org.slf4j" name="slf4j-nop" version="2.0.17">
+         <artifact name="slf4j-nop-2.0.17.jar">
+            <sha256 value="3716f83649ec66161a2edefd4f49df34d1dd1c51cdcf941996c6987260f0a829" origin="Generated by Gradle"/>
          </artifact>
       </component>
       <component group="org.slf4j" name="slf4j-nop" version="2.0.6">

+ 13 - 12
modules/ingest-attachment/build.gradle

@@ -19,24 +19,24 @@ esplugin {
 // when updating tika, please review its parent pom : https://repo1.maven.org/maven2/org/apache/tika/tika-parent
 // and manually update the transitive dependencies here
 def versions = [
-  'tika'  : '2.9.3',
-  'pdfbox': '2.0.33',
-  'poi'   : '5.4.0',
+  'tika'  : '3.2.2',
+  'pdfbox': '3.0.5',
+  'poi'   : '5.4.1',
   'sparsebitset' : '1.3', //poi dependency: https://repo1.maven.org/maven2/org/apache/poi/poi/
-  'mime4j': '0.8.12',
-  'commonsCodec': '1.18.0',
-  'slf4' : '2.0.16',
+  'mime4j': '0.8.13',
+  'commonsCodec': '1.19.0',
+  'slf4' : '2.0.17',
   'xz' : '1.10',
-  'commonsIo' : '2.18.0',
+  'commonsIo' : '2.20.0',
   //intentionally using the older "juniversalchardet:juniversalchardet" rather than the newer "com.github.albfernandez:juniversalchardet"
   //since the "com.github.albfernandez" fork has some problems with Chinese.
   'juniversalchardet' : '1.0.3',
-  'tagsoup' : '1.2.1',
+  'jsoup' : '1.21.1',
   'jempbox' : '1.8.17',
   'xmlbeans' : '5.3.0', //poi-ooxml dependency: https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/
-  'commonsCollections4' : '4.4',
-  'commonsCompress' : '1.27.1',
-  'commonsLang3' :'3.17.0',
+  'commonsCollections4' : '4.5.0',
+  'commonsCompress' : '1.28.0',
+  'commonsLang3' :'3.18.0',
   'commonsMath3' : '3.6.1'
 ]
 
@@ -86,9 +86,10 @@ dependencies {
 
   // external parser libraries
   // HTML
-  api "org.ccil.cowan.tagsoup:tagsoup:${versions.tagsoup}"
+  api "org.jsoup:jsoup:${versions.jsoup}"
   // Adobe PDF
   api "org.apache.pdfbox:pdfbox:${versions.pdfbox}"
+  api "org.apache.pdfbox:pdfbox-io:${versions.pdfbox}"
   api "org.apache.pdfbox:fontbox:${versions.pdfbox}"
   api "org.apache.pdfbox:jempbox:${versions.jempbox}"
   // OpenOffice

+ 21 - 0
modules/ingest-attachment/licenses/jsoup-LICENSE.txt

@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2009-2025 Jonathan Hedley <https://jsoup.org/>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

+ 0 - 0
modules/ingest-attachment/licenses/tagsoup-NOTICE.txt → modules/ingest-attachment/licenses/jsoup-NOTICE.txt


+ 143 - 0
modules/ingest-attachment/licenses/tagsoup-LICENSE.txt → modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt

@@ -1,3 +1,4 @@
+
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
@@ -199,3 +200,145 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
+
+EXTERNAL COMPONENTS
+
+Apache PDFBox includes a number of components with separate copyright notices
+and license terms. Your use of these components is subject to the terms and
+conditions of the following licenses.
+
+Contributions made to the original PDFBox and FontBox projects:
+
+   Copyright (c) 2002-2007, www.pdfbox.org
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the name of pdfbox; nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+   OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+   SUCH DAMAGE.
+
+Adobe Font Metrics (AFM) for PDF Core 14 Fonts
+
+   This file and the 14 PostScript(R) AFM files it accompanies may be used,
+   copied, and distributed for any purpose and without charge, with or without
+   modification, provided that all copyright notices are retained; that the
+   AFM files are not distributed without this file; that all modifications
+   to this file or any of the AFM files are prominently noted in the modified
+   file(s); and that this paragraph is not modified. Adobe Systems has no
+   responsibility or obligation to support the use of the AFM files.
+
+CMaps for PDF Fonts (http://opensource.adobe.com/wiki/display/cmap/Downloads)
+
+   Copyright 1990-2009 Adobe Systems Incorporated.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+   Neither the name of Adobe Systems Incorporated nor the names of its
+   contributors may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+   THE POSSIBILITY OF SUCH DAMAGE.
+
+PaDaF PDF/A preflight (http://sourceforge.net/projects/padaf)
+
+  Copyright 2010 Atos Worldline SAS
+
+  Licensed by Atos Worldline SAS under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  Atos Worldline SAS licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+OSXAdapter
+
+  Version: 2.0
+
+  Disclaimer: IMPORTANT:  This Apple software is supplied to you by
+  Apple Inc. ("Apple") in consideration of your agreement to the
+  following terms, and your use, installation, modification or
+  redistribution of this Apple software constitutes acceptance of these
+  terms.  If you do not agree with these terms, please do not use,
+  install, modify or redistribute this Apple software.
+
+  In consideration of your agreement to abide by the following terms, and
+  subject to these terms, Apple grants you a personal, non-exclusive
+  license, under Apple's copyrights in this original Apple software (the
+  "Apple Software"), to use, reproduce, modify and redistribute the Apple
+  Software, with or without modifications, in source and/or binary forms;
+  provided that if you redistribute the Apple Software in its entirety and
+  without modifications, you must retain this notice and the following
+  text and disclaimers in all such redistributions of the Apple Software.
+  Neither the name, trademarks, service marks or logos of Apple Inc.
+  may be used to endorse or promote products derived from the Apple
+  Software without specific prior written permission from Apple.  Except
+  as expressly stated in this notice, no other rights or licenses, express
+  or implied, are granted by Apple herein, including but not limited to
+  any patent rights that may be infringed by your derivative works or by
+  other works in which the Apple Software may be incorporated.
+
+  The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
+  MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+  THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+  OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+
+  IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+  OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+  MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+  AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+  STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+  Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved

+ 22 - 0
modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt

@@ -0,0 +1,22 @@
+Apache PDFBox
+Copyright 2014 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+Based on source code originally developed in the PDFBox and 
+FontBox projects.
+
+Copyright (c) 2002-2007, www.pdfbox.org
+
+Based on source code originally developed in the PaDaF project.
+Copyright (c) 2010 Atos Worldline SAS
+
+Includes the Adobe Glyph List
+Copyright 1997, 1998, 2002, 2007, 2010 Adobe Systems Incorporated.
+
+Includes the Zapf Dingbats Glyph List
+Copyright 2002, 2010 Adobe Systems Incorporated.
+
+Includes OSXAdapter
+Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved

+ 2 - 1
modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java

@@ -16,6 +16,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.html.JSoupParser;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@@ -46,7 +47,7 @@ final class TikaImpl {
     /** subset of parsers for types we support */
     private static final Parser PARSERS[] = new Parser[] {
         // documents
-        new org.apache.tika.parser.html.HtmlParser(),
+        new JSoupParser(),
         new org.apache.tika.parser.microsoft.rtf.RTFParser(),
         new org.apache.tika.parser.pdf.PDFParser(),
         new org.apache.tika.parser.txt.TXTParser(),