Переглянути джерело

Fix Dissect with leading non-ASCII characters (#111184)

Luigi Dell'Aquila 1 рік тому
батько
коміт
8f3244d537

+ 5 - 0
docs/changelog/111184.yaml

@@ -0,0 +1,5 @@
+pr: 111184
+summary: Fix Dissect with leading non-ASCII characters
+area: Ingest Node
+type: bug
+issues: []

+ 1 - 1
libs/dissect/src/main/java/org/elasticsearch/dissect/DissectParser.java

@@ -203,7 +203,7 @@ public final class DissectParser {
             DissectKey key = dissectPair.key();
             byte[] delimiter = dissectPair.delimiter().getBytes(StandardCharsets.UTF_8);
             // start dissection after the first delimiter
-            int i = leadingDelimiter.length();
+            int i = leadingDelimiter.getBytes(StandardCharsets.UTF_8).length;
             int valueStart = i;
             int lookAheadMatches;
             // start walking the input string byte by byte, look ahead for matches where needed

+ 12 - 0
libs/dissect/src/test/java/org/elasticsearch/dissect/DissectParserTests.java

@@ -211,6 +211,18 @@ public class DissectParserTests extends ESTestCase {
         assertMatch("%{a->}࿏%{b}", "⟳༒࿏࿏࿏࿏࿏༒⟲", Arrays.asList("a", "b"), Arrays.asList("⟳༒", "༒⟲"));
         assertMatch("%{*a}࿏%{&a}", "⟳༒࿏༒⟲", Arrays.asList("⟳༒"), Arrays.asList("༒⟲"));
         assertMatch("%{}࿏%{a}", "⟳༒࿏༒⟲", Arrays.asList("a"), Arrays.asList("༒⟲"));
+        assertMatch(
+            "Zürich, the %{adjective} city in Switzerland",
+            "Zürich, the largest city in Switzerland",
+            Arrays.asList("adjective"),
+            Arrays.asList("largest")
+        );
+        assertMatch(
+            "Zürich, the %{one} city in Switzerland; Zürich, the %{two} city in Switzerland",
+            "Zürich, the largest city in Switzerland; Zürich, the LARGEST city in Switzerland",
+            Arrays.asList("one", "two"),
+            Arrays.asList("largest", "LARGEST")
+        );
     }
 
     public void testMatchRemainder() {