Browse Source

Fix `uri_parts` processor behaviour for missing extensions (#105689)

The `uri_parts` processor was behaving incorrectly for
URI's that included a dot in the path but did not have an extension.
Also includes YAML REST tests for the same.
Niels Bauman 1 year ago
parent
commit
b1fcedd7ae

+ 6 - 0
docs/changelog/105689.yaml

@@ -0,0 +1,6 @@
+pr: 105689
+summary: Fix `uri_parts` processor behaviour for missing extensions
+area: Ingest Node
+type: bug
+issues:
+ - 105612

+ 10 - 3
modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/UriPartsProcessor.java

@@ -140,9 +140,16 @@ public class UriPartsProcessor extends AbstractProcessor {
         }
         if (path != null) {
             uriParts.put("path", path);
-            if (path.contains(".")) {
-                int periodIndex = path.lastIndexOf('.');
-                uriParts.put("extension", periodIndex < path.length() ? path.substring(periodIndex + 1) : "");
+            // To avoid any issues with extracting the extension from a path that contains a dot, we explicitly extract the extension
+            // from the last segment in the path.
+            var lastSegmentIndex = path.lastIndexOf('/');
+            if (lastSegmentIndex >= 0) {
+                var lastSegment = path.substring(lastSegmentIndex);
+                int periodIndex = lastSegment.lastIndexOf('.');
+                if (periodIndex >= 0) {
+                    // Don't include the dot in the extension field.
+                    uriParts.put("extension", lastSegment.substring(periodIndex + 1));
+                }
             }
         }
         if (port != -1) {

+ 25 - 0
modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/UriPartsProcessorTests.java

@@ -181,6 +181,31 @@ public class UriPartsProcessorTests extends ESTestCase {
         );
     }
 
+    public void testDotPathWithoutExtension() throws Exception {
+        testUriParsing(
+            "https://www.google.com/path.withdot/filenamewithoutextension",
+            Map.of("scheme", "https", "domain", "www.google.com", "path", "/path.withdot/filenamewithoutextension")
+        );
+    }
+
+    public void testDotPathWithExtension() throws Exception {
+        testUriParsing(
+            "https://www.google.com/path.withdot/filenamewithextension.txt",
+            Map.of("scheme", "https", "domain", "www.google.com", "path", "/path.withdot/filenamewithextension.txt", "extension", "txt")
+        );
+    }
+
+    /**
+     * This test verifies that we return an empty extension instead of <code>null</code> if the URI ends with a period. This is probably
+     * not behaviour we necessarily want to keep forever, but this test ensures that we're conscious about changing that behaviour.
+     */
+    public void testEmptyExtension() throws Exception {
+        testUriParsing(
+            "https://www.google.com/foo/bar.",
+            Map.of("scheme", "https", "domain", "www.google.com", "path", "/foo/bar.", "extension", "")
+        );
+    }
+
     public void testRemoveIfSuccessfulDoesNotRemoveTargetField() throws Exception {
         String field = "field";
         UriPartsProcessor processor = new UriPartsProcessor(null, null, field, field, true, false, false);

+ 49 - 0
modules/ingest-common/src/yamlRestTest/resources/rest-api-spec/test/ingest/320_uri_parts_processor.yml

@@ -0,0 +1,49 @@
+---
+teardown:
+  - do:
+      ingest.delete_pipeline:
+        id: "uri-parts-pipeline"
+        ignore: 404
+
+---
+"Test URI parts Processor":
+  - do:
+      ingest.put_pipeline:
+        id: "uri-parts-pipeline"
+        body:  >
+          {
+            "processors": [
+              {
+                "uri_parts" : {
+                  "field" : "my_uri"
+                }
+              }
+            ]
+          }
+  - match: { acknowledged: true }
+
+  - do:
+      index:
+        index: test
+        id: "1"
+        pipeline: "uri-parts-pipeline"
+        body: {
+          my_uri: "https://user:pw@testing.google.com:8080/foo/bar.txt?foo1=bar1&foo2=bar2#anchorVal"
+        }
+
+  - do:
+      get:
+        index: test
+        id: "1"
+  - match: { _source.my_uri: "https://user:pw@testing.google.com:8080/foo/bar.txt?foo1=bar1&foo2=bar2#anchorVal" }
+  - match: { _source.url.original: "https://user:pw@testing.google.com:8080/foo/bar.txt?foo1=bar1&foo2=bar2#anchorVal" }
+  - match: { _source.url.scheme: "https" }
+  - match: { _source.url.domain: "testing.google.com" }
+  - match: { _source.url.fragment: "anchorVal" }
+  - match: { _source.url.path: "/foo/bar.txt" }
+  - match: { _source.url.port: 8080 }
+  - match: { _source.url.username: "user" }
+  - match: { _source.url.password: "pw" }
+  - match: { _source.url.user_info: "user:pw" }
+  - match: { _source.url.query: "foo1=bar1&foo2=bar2" }
+  - match: { _source.url.extension: "txt" }