Browse Source

Upgrading to tika 3.2.2 (#133410)

Keith Massey 2 months ago
parent
commit
6985093b6f

+ 5 - 0
docs/changelog/133410.yaml

@@ -0,0 +1,5 @@
+pr: 133410
+summary: Upgrading to tika 3.2.2
+area: Ingest Node
+type: upgrade
+issues: []

+ 88 - 88
gradle/verification-metadata.xml

@@ -1389,9 +1389,9 @@
             <sha256 value="f700de80ac270d0344fdea7468201d8b9c805e5c648331c3619f2ee067ccfc59" origin="Generated by Gradle"/>
             <sha256 value="f700de80ac270d0344fdea7468201d8b9c805e5c648331c3619f2ee067ccfc59" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="commons-codec" name="commons-codec" version="1.18.0">
-         <artifact name="commons-codec-1.18.0.jar">
-            <sha256 value="ba005f304cef92a3dede24a38ad5ac9b8afccf0d8f75839d6c1338634cf7f6e4" origin="Generated by Gradle"/>
+      <component group="commons-codec" name="commons-codec" version="1.19.0">
+         <artifact name="commons-codec-1.19.0.jar">
+            <sha256 value="5c3881e4f556855e9c532927ee0c9dfde94cc66760d5805c031a59887070af5f" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="commons-codec" name="commons-codec" version="1.9">
       <component group="commons-codec" name="commons-codec" version="1.9">
@@ -1439,11 +1439,6 @@
             <sha256 value="4aa4ca48f3dfd30b78220b7881d8cb93eac4093ec94361b6befa9487998a550b" origin="Generated by Gradle"/>
             <sha256 value="4aa4ca48f3dfd30b78220b7881d8cb93eac4093ec94361b6befa9487998a550b" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="commons-io" name="commons-io" version="2.18.0">
-         <artifact name="commons-io-2.18.0.jar">
-            <sha256 value="f3ca0f8d63c40e23a56d54101c60d5edee136b42d84bfb85bc7963093109cf8b" origin="Generated by Gradle"/>
-         </artifact>
-      </component>
       <component group="commons-io" name="commons-io" version="2.2">
       <component group="commons-io" name="commons-io" version="2.2">
          <artifact name="commons-io-2.2.jar">
          <artifact name="commons-io-2.2.jar">
             <sha256 value="675f60bd11a82d481736591fe4054c66471fa5463d45616652fd71585792ba87" origin="Generated by Gradle"/>
             <sha256 value="675f60bd11a82d481736591fe4054c66471fa5463d45616652fd71585792ba87" origin="Generated by Gradle"/>
@@ -2122,9 +2117,9 @@
             <sha256 value="5ba0a81f4b0769122b6045b98bb9bbba5f2c69dbf736a6cc7ca4eb603c337487" origin="Generated by Gradle"/>
             <sha256 value="5ba0a81f4b0769122b6045b98bb9bbba5f2c69dbf736a6cc7ca4eb603c337487" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.commons" name="commons-collections4" version="4.4">
-         <artifact name="commons-collections4-4.4.jar">
-            <sha256 value="1df8b9430b5c8ed143d7815e403e33ef5371b2400aadbe9bda0883762e0846d1" origin="Generated by Gradle"/>
+      <component group="org.apache.commons" name="commons-collections4" version="4.5.0">
+         <artifact name="commons-collections4-4.5.0.jar">
+            <sha256 value="00f93263c267be201b8ae521b44a7137271b16688435340bf629db1bac0a5845" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.apache.commons" name="commons-compress" version="1.21">
       <component group="org.apache.commons" name="commons-compress" version="1.21">
@@ -2147,9 +2142,9 @@
             <sha256 value="9168a03141d8fc7eda21a2360d83cc0412bcbb1d6204d992bd48c2573cb3c6b8" origin="Generated by Gradle"/>
             <sha256 value="9168a03141d8fc7eda21a2360d83cc0412bcbb1d6204d992bd48c2573cb3c6b8" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.commons" name="commons-compress" version="1.27.1">
-         <artifact name="commons-compress-1.27.1.jar">
-            <sha256 value="293d80f54b536b74095dcd7ea3cf0a29bbfc3402519281332495f4420d370d16" origin="Generated by Gradle"/>
+      <component group="org.apache.commons" name="commons-compress" version="1.28.0">
+         <artifact name="commons-compress-1.28.0.jar">
+            <sha256 value="e1522945218456f3649a39bc4afd70ce4bd466221519dba7d378f2141a4642ca" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.apache.commons" name="commons-compress" version="1.4.1">
       <component group="org.apache.commons" name="commons-compress" version="1.4.1">
@@ -2182,9 +2177,9 @@
             <sha256 value="7b96bf3ee68949abb5bc465559ac270e0551596fa34523fddf890ec418dde13c" origin="Generated by Gradle"/>
             <sha256 value="7b96bf3ee68949abb5bc465559ac270e0551596fa34523fddf890ec418dde13c" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.commons" name="commons-lang3" version="3.17.0">
-         <artifact name="commons-lang3-3.17.0.jar">
-            <sha256 value="6ee731df5c8e5a2976a1ca023b6bb320ea8d3539fbe64c8a1d5cb765127c33b4" origin="Generated by Gradle"/>
+      <component group="org.apache.commons" name="commons-lang3" version="3.18.0">
+         <artifact name="commons-lang3-3.18.0.jar">
+            <sha256 value="4eeeae8d20c078abb64b015ec158add383ac581571cddc45c68f0c9ae0230720" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.apache.commons" name="commons-lang3" version="3.7">
       <component group="org.apache.commons" name="commons-lang3" version="3.7">
@@ -2805,14 +2800,14 @@
             <sha256 value="a121f4b14ec525e54e29b9f5db7b93f4a97e088774e81c7143b5198f67d81bec" origin="Generated by Gradle"/>
             <sha256 value="a121f4b14ec525e54e29b9f5db7b93f4a97e088774e81c7143b5198f67d81bec" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.james" name="apache-mime4j-core" version="0.8.12">
-         <artifact name="apache-mime4j-core-0.8.12.jar">
-            <sha256 value="b2180c13b97ade21edb5f52581ade0a6f82b5084bb9ca5bdf83584deb6225a69" origin="Generated by Gradle"/>
+      <component group="org.apache.james" name="apache-mime4j-core" version="0.8.13">
+         <artifact name="apache-mime4j-core-0.8.13.jar">
+            <sha256 value="00496c123926395d59e5dfdfc8342c607600c6c9e6e6dcab981a673b62481cdf" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.james" name="apache-mime4j-dom" version="0.8.12">
-         <artifact name="apache-mime4j-dom-0.8.12.jar">
-            <sha256 value="d8de21f9091a0109bdfe68d323f2a5ffb326922f8493f88b1203a04a69198940" origin="Generated by Gradle"/>
+      <component group="org.apache.james" name="apache-mime4j-dom" version="0.8.13">
+         <artifact name="apache-mime4j-dom-0.8.13.jar">
+            <sha256 value="b31d88db955079cd3be745b21ef27c76ab868306688a7e54ad75646e916bfd67" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.apache.kerby" name="kerb-admin" version="1.0.1">
       <component group="org.apache.kerby" name="kerb-admin" version="1.0.1">
@@ -3305,9 +3300,9 @@
             <sha256 value="39b2dfc8e84380bf7adab657d3d5e1625cb6592a885ebdb854ec5c6f7a3ec88d" origin="Generated by Gradle"/>
             <sha256 value="39b2dfc8e84380bf7adab657d3d5e1625cb6592a885ebdb854ec5c6f7a3ec88d" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.pdfbox" name="fontbox" version="2.0.33">
-         <artifact name="fontbox-2.0.33.jar">
-            <sha256 value="95e16863508697a2b4a3a8b8996919cd426b9b01696e10964e42523496854c5b" origin="Generated by Gradle"/>
+      <component group="org.apache.pdfbox" name="fontbox" version="3.0.5">
+         <artifact name="fontbox-3.0.5.jar">
+            <sha256 value="e8a62be2df27a0d44191b6669c0b18df6efe5271232db8dcb8745d5d9774755b" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.apache.pdfbox" name="jempbox" version="1.8.17">
       <component group="org.apache.pdfbox" name="jempbox" version="1.8.17">
@@ -3315,29 +3310,34 @@
             <sha256 value="ded9c81038dd1bbcba18f07e1028d70c9ceaf0b48ac56cea8ab6ec2c255fc1b3" origin="Generated by Gradle"/>
             <sha256 value="ded9c81038dd1bbcba18f07e1028d70c9ceaf0b48ac56cea8ab6ec2c255fc1b3" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.pdfbox" name="pdfbox" version="2.0.33">
-         <artifact name="pdfbox-2.0.33.jar">
-            <sha256 value="34e104a3526925419a3671f8eb3f38565890f0f0106c659f2f8e7ce87b46d490" origin="Generated by Gradle"/>
+      <component group="org.apache.pdfbox" name="pdfbox" version="3.0.5">
+         <artifact name="pdfbox-3.0.5.jar">
+            <sha256 value="f0e5d3a1e573c707e4fbcc2ee8e42ea8ca1d5261bdcb3a05a08d2118553c1e5a" origin="Generated by Gradle"/>
+         </artifact>
+      </component>
+      <component group="org.apache.pdfbox" name="pdfbox-io" version="3.0.5">
+         <artifact name="pdfbox-io-3.0.5.jar">
+            <sha256 value="6df3f3b4db4fd55ef502847ea4e4ebc58e28908800e86eab031345efe219b705" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.poi" name="poi" version="5.4.0">
-         <artifact name="poi-5.4.0.jar">
-            <sha256 value="ace71e79873059e273036674560b50c3d6b945b7ca168b0d4962ad7650ae1eec" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi" version="5.4.1">
+         <artifact name="poi-5.4.1.jar">
+            <sha256 value="da5abf42da4604c5a7bca38956af6e9d6f196d9b6d4cb7eabee4f480b580d505" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.poi" name="poi-ooxml" version="5.4.0">
-         <artifact name="poi-ooxml-5.4.0.jar">
-            <sha256 value="98693442ed7d44791de4a57962b6c820ae678e0eba9cf854681b62ff62c9611d" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi-ooxml" version="5.4.1">
+         <artifact name="poi-ooxml-5.4.1.jar">
+            <sha256 value="fd200c9e6f74d704160a97e9d52041995ed87439454530001edd920688f19f53" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.poi" name="poi-ooxml-lite" version="5.4.0">
-         <artifact name="poi-ooxml-lite-5.4.0.jar">
-            <sha256 value="bb5a8a6c833279ced51afb6042aa15ae5d5ca312ee682e570e23917b522b079e" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi-ooxml-lite" version="5.4.1">
+         <artifact name="poi-ooxml-lite-5.4.1.jar">
+            <sha256 value="dc590461efdfcd4f27e2a892737979ab5e30b4132a7adfc7c9e56447b71a45b0" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.poi" name="poi-scratchpad" version="5.4.0">
-         <artifact name="poi-scratchpad-5.4.0.jar">
-            <sha256 value="6665792cde201f5828e38e0d214aa67d817e34de3e7a5946e6a488c4534b4561" origin="Generated by Gradle"/>
+      <component group="org.apache.poi" name="poi-scratchpad" version="5.4.1">
+         <artifact name="poi-scratchpad-5.4.1.jar">
+            <sha256 value="6497ba15c1cba7062aa71661a8d776d321b1f998bb2bfa19b57d7e35606381f1" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.apache.rat" name="apache-rat" version="0.11">
       <component group="org.apache.rat" name="apache-rat" version="0.11">
@@ -3365,59 +3365,59 @@
             <sha256 value="3902794d36d9b81da1b7e697f21ed04ccae276cc116eecc640a4cd0fff2691f2" origin="Generated by Gradle"/>
             <sha256 value="3902794d36d9b81da1b7e697f21ed04ccae276cc116eecc640a4cd0fff2691f2" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-core" version="2.9.3">
-         <artifact name="tika-core-2.9.3.jar">
-            <sha256 value="ec1ce5791ed2c81867a45b183c87d6e2a9fe67f4920567b08fcae5a3edff0fee" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-core" version="3.2.2">
+         <artifact name="tika-core-3.2.2.jar">
+            <sha256 value="a34ba35d675f5a9733e3b60d5782edd8636a5c7785ac5e1c44125d025d57ad1f" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-langdetect-tika" version="2.9.3">
-         <artifact name="tika-langdetect-tika-2.9.3.jar">
-            <sha256 value="5b5075b838c75bef02052d9bef2631cb83fa245418113135f2e6cfab0c81c2a5" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-langdetect-tika" version="3.2.2">
+         <artifact name="tika-langdetect-tika-3.2.2.jar">
+            <sha256 value="1db5f0007440c70f1e22b99765ceefbb83e7d6b8344c8c9777e9a1af1d999529" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-apple-module" version="2.9.3">
-         <artifact name="tika-parser-apple-module-2.9.3.jar">
-            <sha256 value="798c334f71849fe1652bded5e518de59c485dbbf2dd2dede4bfcecf7a2ebd523" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-apple-module" version="3.2.2">
+         <artifact name="tika-parser-apple-module-3.2.2.jar">
+            <sha256 value="02385a1228ce85b44228653296d5d43b2cc6e45fc2d16989231b484a2a0f5cd4" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-html-module" version="2.9.3">
-         <artifact name="tika-parser-html-module-2.9.3.jar">
-            <sha256 value="6445ba6bc1dcc41301346099b47e3b88bb0a264a4f839206a5dc7a46bda78efa" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-html-module" version="3.2.2">
+         <artifact name="tika-parser-html-module-3.2.2.jar">
+            <sha256 value="598b9639bfc53acf4327bdbbdd62c86531b67f9fd2ced97828dd18789ab424c4" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-microsoft-module" version="2.9.3">
-         <artifact name="tika-parser-microsoft-module-2.9.3.jar">
-            <sha256 value="1da265d127f372042a4de1032e9af14e0179e7edda4bc8e9f051c68ce25a98ea" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-microsoft-module" version="3.2.2">
+         <artifact name="tika-parser-microsoft-module-3.2.2.jar">
+            <sha256 value="e773f4834587285b5f0a1af4cdac80de9610276e560fe1465f10e150128deb53" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-miscoffice-module" version="2.9.3">
-         <artifact name="tika-parser-miscoffice-module-2.9.3.jar">
-            <sha256 value="405a68372c64f1d83094555d7c5e3a88902c4166a3c5d0418e89fc23dba40727" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-miscoffice-module" version="3.2.2">
+         <artifact name="tika-parser-miscoffice-module-3.2.2.jar">
+            <sha256 value="bebe1f56ac457485ba43d842042d1917fb12404fdc43f4589e58cb6b3109f5db" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-pdf-module" version="2.9.3">
-         <artifact name="tika-parser-pdf-module-2.9.3.jar">
-            <sha256 value="b72f736be311b88662680e46275034da65f4d5fb91e5b0178323b36261f1222f" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-pdf-module" version="3.2.2">
+         <artifact name="tika-parser-pdf-module-3.2.2.jar">
+            <sha256 value="014cf5920257eb23824f0b2a22154586f5d8d370e9128854b63ff2524b695c50" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-text-module" version="2.9.3">
-         <artifact name="tika-parser-text-module-2.9.3.jar">
-            <sha256 value="ad50ef8150497b9a9bd655f19a619f9935d7c2f36986c1889bb85a3f024db4c2" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-text-module" version="3.2.2">
+         <artifact name="tika-parser-text-module-3.2.2.jar">
+            <sha256 value="5f82360f2a595012a0d44d829763429af9fe710f976afda4274b3f2f350501b6" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-xml-module" version="2.9.3">
-         <artifact name="tika-parser-xml-module-2.9.3.jar">
-            <sha256 value="0b84ca2adb58b8f15c234f75722cf8c63c0d3611dc6d9040c7a5fbff5e2705de" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-xml-module" version="3.2.2">
+         <artifact name="tika-parser-xml-module-3.2.2.jar">
+            <sha256 value="9f5319ed616f814eff578f2b256e831b2aa56cb694864deed7cda94c3f74021c" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-xmp-commons" version="2.9.3">
-         <artifact name="tika-parser-xmp-commons-2.9.3.jar">
-            <sha256 value="529dc45f7abb9dd7034f1f3c7a4a09c9147d8ef24619e96a74532e4f283a85a9" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-xmp-commons" version="3.2.2">
+         <artifact name="tika-parser-xmp-commons-3.2.2.jar">
+            <sha256 value="ab07e31b971f2442f2100db302317fb4964bbdbb1c6959499882e2e016ba5043" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.apache.tika" name="tika-parser-zip-commons" version="2.9.3">
-         <artifact name="tika-parser-zip-commons-2.9.3.jar">
-            <sha256 value="29167d467ae43893f4e0e1df072f8aa8011c85604df7ce039f50159818b47023" origin="Generated by Gradle"/>
+      <component group="org.apache.tika" name="tika-parser-zip-commons" version="3.2.2">
+         <artifact name="tika-parser-zip-commons-3.2.2.jar">
+            <sha256 value="609006e0c9d67ac70e9d33e536dd6edc13f7cca5ddff91824bc61f42ff4409d1" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.apache.xmlbeans" name="xmlbeans" version="5.3.0">
       <component group="org.apache.xmlbeans" name="xmlbeans" version="5.3.0">
@@ -3543,11 +3543,6 @@
             <sha256 value="19d292ffd189e5ff578bbfedc0076e922898f9af96b183dda4a23863cefa6fb8" origin="Generated by Gradle"/>
             <sha256 value="19d292ffd189e5ff578bbfedc0076e922898f9af96b183dda4a23863cefa6fb8" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.ccil.cowan.tagsoup" name="tagsoup" version="1.2.1">
-         <artifact name="tagsoup-1.2.1.jar">
-            <sha256 value="ac97f7b4b1d8e9337edfa0e34044f8d0efe7223f6ad8f3a85d54cc1018ea2e04" origin="Generated by Gradle"/>
-         </artifact>
-      </component>
       <component group="org.checkerframework" name="checker-qual" version="2.10.0">
       <component group="org.checkerframework" name="checker-qual" version="2.10.0">
          <artifact name="checker-qual-2.10.0.jar">
          <artifact name="checker-qual-2.10.0.jar">
             <sha256 value="d261fde25d590f6b69db7721d469ac1b0a19a17ccaaaa751c31f0d8b8260b894" origin="Generated by Gradle"/>
             <sha256 value="d261fde25d590f6b69db7721d469ac1b0a19a17ccaaaa751c31f0d8b8260b894" origin="Generated by Gradle"/>
@@ -4228,6 +4223,11 @@
             <sha256 value="e2b99c0d2fa39f69f27efb1c0016390713feb2f2e02d8ea7f1c36b780271598a" origin="Generated by Gradle"/>
             <sha256 value="e2b99c0d2fa39f69f27efb1c0016390713feb2f2e02d8ea7f1c36b780271598a" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
+      <component group="org.jsoup" name="jsoup" version="1.21.1">
+         <artifact name="jsoup-1.21.1.jar">
+            <sha256 value="436adf71fe9f326e04fe134cd2785b261f0f4b9b60876adda1de3b6919463394" origin="Generated by Gradle"/>
+         </artifact>
+      </component>
       <component group="org.junit.jupiter" name="junit-jupiter" version="5.12.1">
       <component group="org.junit.jupiter" name="junit-jupiter" version="5.12.1">
          <artifact name="junit-jupiter-5.12.1.jar">
          <artifact name="junit-jupiter-5.12.1.jar">
             <sha256 value="228a94c9ee743d55bfea09b57630681ea7b20a661a477447f5c1f60dc12d9e3a" origin="Generated by Gradle"/>
             <sha256 value="228a94c9ee743d55bfea09b57630681ea7b20a661a477447f5c1f60dc12d9e3a" origin="Generated by Gradle"/>
@@ -4773,9 +4773,9 @@
             <sha256 value="a310bc79c3f4744e2b2e993702fcebaf3696fec0063643ffdc6b49a8fb03ef39" origin="Generated by Gradle"/>
             <sha256 value="a310bc79c3f4744e2b2e993702fcebaf3696fec0063643ffdc6b49a8fb03ef39" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.slf4j" name="jcl-over-slf4j" version="2.0.16">
-         <artifact name="jcl-over-slf4j-2.0.16.jar">
-            <sha256 value="5744d62c5af556e839ab922c9fa3f737f0a5971e478ba68b2eb5256b2842ec78" origin="Generated by Gradle"/>
+      <component group="org.slf4j" name="jcl-over-slf4j" version="2.0.17">
+         <artifact name="jcl-over-slf4j-2.0.17.jar">
+            <sha256 value="affd06771589ebfe454bb11315a4f466ecaa135b95f3e7939534cf1d2fd7064c" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.slf4j" name="slf4j-api" version="1.7.10">
       <component group="org.slf4j" name="slf4j-api" version="1.7.10">
@@ -4808,9 +4808,9 @@
             <sha256 value="fe30825245d2336c859dc38d60c0fc5f3668dbf29cd586828d2b5667ec355b91" origin="Generated by Gradle"/>
             <sha256 value="fe30825245d2336c859dc38d60c0fc5f3668dbf29cd586828d2b5667ec355b91" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.slf4j" name="slf4j-api" version="2.0.16">
-         <artifact name="slf4j-api-2.0.16.jar">
-            <sha256 value="a12578dde1ba00bd9b816d388a0b879928d00bab3c83c240f7013bf4196c579a" origin="Generated by Gradle"/>
+      <component group="org.slf4j" name="slf4j-api" version="2.0.17">
+         <artifact name="slf4j-api-2.0.17.jar">
+            <sha256 value="7b751d952061954d5abfed7181c1f645d336091b679891591d63329c622eb832" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.slf4j" name="slf4j-api" version="2.0.6">
       <component group="org.slf4j" name="slf4j-api" version="2.0.6">
@@ -4828,9 +4828,9 @@
             <sha256 value="4d41e01c40caf8a6c74add2b073055d8a4ce1c30e58154177b13f12d78abbe7b" origin="Generated by Gradle"/>
             <sha256 value="4d41e01c40caf8a6c74add2b073055d8a4ce1c30e58154177b13f12d78abbe7b" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
-      <component group="org.slf4j" name="slf4j-nop" version="2.0.16">
-         <artifact name="slf4j-nop-2.0.16.jar">
-            <sha256 value="deca6c04ed35515a0a911fa44c0e836bee92c0c59d2e8fa9bab8ffbc464a9ba7" origin="Generated by Gradle"/>
+      <component group="org.slf4j" name="slf4j-nop" version="2.0.17">
+         <artifact name="slf4j-nop-2.0.17.jar">
+            <sha256 value="3716f83649ec66161a2edefd4f49df34d1dd1c51cdcf941996c6987260f0a829" origin="Generated by Gradle"/>
          </artifact>
          </artifact>
       </component>
       </component>
       <component group="org.slf4j" name="slf4j-nop" version="2.0.6">
       <component group="org.slf4j" name="slf4j-nop" version="2.0.6">

+ 13 - 12
modules/ingest-attachment/build.gradle

@@ -19,24 +19,24 @@ esplugin {
 // when updating tika, please review its parent pom : https://repo1.maven.org/maven2/org/apache/tika/tika-parent
 // when updating tika, please review its parent pom : https://repo1.maven.org/maven2/org/apache/tika/tika-parent
 // and manually update the transitive dependencies here
 // and manually update the transitive dependencies here
 def versions = [
 def versions = [
-  'tika'  : '2.9.3',
-  'pdfbox': '2.0.33',
-  'poi'   : '5.4.0',
+  'tika'  : '3.2.2',
+  'pdfbox': '3.0.5',
+  'poi'   : '5.4.1',
   'sparsebitset' : '1.3', //poi dependency: https://repo1.maven.org/maven2/org/apache/poi/poi/
   'sparsebitset' : '1.3', //poi dependency: https://repo1.maven.org/maven2/org/apache/poi/poi/
-  'mime4j': '0.8.12',
-  'commonsCodec': '1.18.0',
-  'slf4' : '2.0.16',
+  'mime4j': '0.8.13',
+  'commonsCodec': '1.19.0',
+  'slf4' : '2.0.17',
   'xz' : '1.10',
   'xz' : '1.10',
-  'commonsIo' : '2.18.0',
+  'commonsIo' : '2.20.0',
   //intentionally using the elder "juniversalchardet:juniversalchardet" rather than the newer "com.github.albfernandez:juniversalchardet"
   //intentionally using the elder "juniversalchardet:juniversalchardet" rather than the newer "com.github.albfernandez:juniversalchardet"
   //since the "com.github.albfernandez" fork has some problems with Chinese.
   //since the "com.github.albfernandez" fork has some problems with Chinese.
   'juniversalchardet' : '1.0.3',
   'juniversalchardet' : '1.0.3',
-  'tagsoup' : '1.2.1',
+  'jsoup' : '1.21.1',
   'jempbox' : '1.8.17',
   'jempbox' : '1.8.17',
   'xmlbeans' : '5.3.0', //poi-ooxml dependency: https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/
   'xmlbeans' : '5.3.0', //poi-ooxml dependency: https://repo1.maven.org/maven2/org/apache/poi/poi-ooxml/
-  'commonsCollections4' : '4.4',
-  'commonsCompress' : '1.27.1',
-  'commonsLang3' :'3.17.0',
+  'commonsCollections4' : '4.5.0',
+  'commonsCompress' : '1.28.0',
+  'commonsLang3' :'3.18.0',
   'commonsMath3' : '3.6.1'
   'commonsMath3' : '3.6.1'
 ]
 ]
 
 
@@ -86,9 +86,10 @@ dependencies {
 
 
   // external parser libraries
   // external parser libraries
   // HTML
   // HTML
-  api "org.ccil.cowan.tagsoup:tagsoup:${versions.tagsoup}"
+  api "org.jsoup:jsoup:${versions.jsoup}"
   // Adobe PDF
   // Adobe PDF
   api "org.apache.pdfbox:pdfbox:${versions.pdfbox}"
   api "org.apache.pdfbox:pdfbox:${versions.pdfbox}"
+  api "org.apache.pdfbox:pdfbox-io:${versions.pdfbox}"
   api "org.apache.pdfbox:fontbox:${versions.pdfbox}"
   api "org.apache.pdfbox:fontbox:${versions.pdfbox}"
   api "org.apache.pdfbox:jempbox:${versions.jempbox}"
   api "org.apache.pdfbox:jempbox:${versions.jempbox}"
   // OpenOffice
   // OpenOffice

+ 21 - 0
modules/ingest-attachment/licenses/jsoup-LICENSE.txt

@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2009-2025 Jonathan Hedley <https://jsoup.org/>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

+ 0 - 0
modules/ingest-attachment/licenses/tagsoup-NOTICE.txt → modules/ingest-attachment/licenses/jsoup-NOTICE.txt


+ 143 - 0
modules/ingest-attachment/licenses/tagsoup-LICENSE.txt → modules/ingest-attachment/licenses/pdfbox-io-LICENSE.txt

@@ -1,3 +1,4 @@
+
                                  Apache License
                                  Apache License
                            Version 2.0, January 2004
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
                         http://www.apache.org/licenses/
@@ -199,3 +200,145 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    See the License for the specific language governing permissions and
    limitations under the License.
    limitations under the License.
+
+EXTERNAL COMPONENTS
+
+Apache PDFBox includes a number of components with separate copyright notices
+and license terms. Your use of these components is subject to the terms and
+conditions of the following licenses.
+
+Contributions made to the original PDFBox and FontBox projects:
+
+   Copyright (c) 2002-2007, www.pdfbox.org
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+
+   3. Neither the name of pdfbox; nor the names of its contributors may be
+      used to endorse or promote products derived from this software without
+      specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+   OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+   SUCH DAMAGE.
+
+Adobe Font Metrics (AFM) for PDF Core 14 Fonts
+
+   This file and the 14 PostScript(R) AFM files it accompanies may be used,
+   copied, and distributed for any purpose and without charge, with or without
+   modification, provided that all copyright notices are retained; that the
+   AFM files are not distributed without this file; that all modifications
+   to this file or any of the AFM files are prominently noted in the modified
+   file(s); and that this paragraph is not modified. Adobe Systems has no
+   responsibility or obligation to support the use of the AFM files.
+
+CMaps for PDF Fonts (http://opensource.adobe.com/wiki/display/cmap/Downloads)
+
+   Copyright 1990-2009 Adobe Systems Incorporated.
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+   Neither the name of Adobe Systems Incorporated nor the names of its
+   contributors may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+   THE POSSIBILITY OF SUCH DAMAGE.
+
+PaDaF PDF/A preflight (http://sourceforge.net/projects/padaf)
+
+  Copyright 2010 Atos Worldline SAS
+
+  Licensed by Atos Worldline SAS under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  Atos Worldline SAS licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+
+OSXAdapter
+
+  Version: 2.0
+
+  Disclaimer: IMPORTANT:  This Apple software is supplied to you by
+  Apple Inc. ("Apple") in consideration of your agreement to the
+  following terms, and your use, installation, modification or
+  redistribution of this Apple software constitutes acceptance of these
+  terms.  If you do not agree with these terms, please do not use,
+  install, modify or redistribute this Apple software.
+
+  In consideration of your agreement to abide by the following terms, and
+  subject to these terms, Apple grants you a personal, non-exclusive
+  license, under Apple's copyrights in this original Apple software (the
+  "Apple Software"), to use, reproduce, modify and redistribute the Apple
+  Software, with or without modifications, in source and/or binary forms;
+  provided that if you redistribute the Apple Software in its entirety and
+  without modifications, you must retain this notice and the following
+  text and disclaimers in all such redistributions of the Apple Software.
+  Neither the name, trademarks, service marks or logos of Apple Inc.
+  may be used to endorse or promote products derived from the Apple
+  Software without specific prior written permission from Apple.  Except
+  as expressly stated in this notice, no other rights or licenses, express
+  or implied, are granted by Apple herein, including but not limited to
+  any patent rights that may be infringed by your derivative works or by
+  other works in which the Apple Software may be incorporated.
+
+  The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
+  MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+  THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+  FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+  OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+
+  IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+  OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+  MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+  AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+  STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+  Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved

+ 22 - 0
modules/ingest-attachment/licenses/pdfbox-io-NOTICE.txt

@@ -0,0 +1,22 @@
+Apache PDFBox
+Copyright 2014 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+Based on source code originally developed in the PDFBox and 
+FontBox projects.
+
+Copyright (c) 2002-2007, www.pdfbox.org
+
+Based on source code originally developed in the PaDaF project.
+Copyright (c) 2010 Atos Worldline SAS
+
+Includes the Adobe Glyph List
+Copyright 1997, 1998, 2002, 2007, 2010 Adobe Systems Incorporated.
+
+Includes the Zapf Dingbats Glyph List
+Copyright 2002, 2010 Adobe Systems Incorporated.
+
+Includes OSXAdapter
+Copyright (C) 2003-2007 Apple, Inc., All Rights Reserved

+ 2 - 1
modules/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java

@@ -16,6 +16,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.html.JSoupParser;
 
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.IOException;
@@ -46,7 +47,7 @@ final class TikaImpl {
     /** subset of parsers for types we support */
     /** subset of parsers for types we support */
     private static final Parser PARSERS[] = new Parser[] {
     private static final Parser PARSERS[] = new Parser[] {
         // documents
         // documents
-        new org.apache.tika.parser.html.HtmlParser(),
+        new JSoupParser(),
         new org.apache.tika.parser.microsoft.rtf.RTFParser(),
         new org.apache.tika.parser.microsoft.rtf.RTFParser(),
         new org.apache.tika.parser.pdf.PDFParser(),
         new org.apache.tika.parser.pdf.PDFParser(),
         new org.apache.tika.parser.txt.TXTParser(),
         new org.apache.tika.parser.txt.TXTParser(),