Răsfoiți Sursa

ES|QL fix no-length substring with supplementary (4-byte) character (#107183)

This commit fixes a bug in the no-length substring variant with supplementary (4-byte) characters.
Chris Hegarty 1 an în urmă
părinte
comite
bdf9c605b5

+ 5 - 0
docs/changelog/107183.yaml

@@ -0,0 +1,5 @@
+pr: 107183
+summary: ES|QL fix no-length substring with supplementary (4-byte) character
+area: ES|QL
+type: bug
+issues: []

+ 8 - 1
x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec

@@ -172,6 +172,13 @@ emp_no:integer | last_name:keyword | x:keyword | z:keyword
 10010 | Piveteau  | P | a
 ;
 
+substring Emoji#[skip:-8.13.99,reason:bug fix in 8.14]
+row a = "🐱Meow!🐶Woof!" | eval sub1 = substring(a, 2) | eval sub2 = substring(a, 2, 100);
+
+a:keyword | sub1:keyword | sub2:keyword
+🐱Meow!🐶Woof! | Meow!🐶Woof! | Meow!🐶Woof!
+;
+
 ltrim
 from employees | sort emp_no | limit 10 | eval name = concat("  ", first_name, "  ") | eval name = ltrim(name) | eval name = concat("'", name, "'") | keep emp_no, name;
 
@@ -1236,7 +1243,7 @@ emp_no:integer | last_name:keyword | f_s:keyword | f_l:integer
 ;
 
 locateUtf16Emoji#[skip:-8.13.99,reason:new string function added in 8.14]
-row a = "🐱Meow!🐶Woof!" | eval f_s = substring(a, 3) | eval f_l = locate(a, f_s);
+row a = "🐱Meow!🐶Woof!" | eval f_s = substring(a, 2) | eval f_l = locate(a, f_s);
 
 a:keyword | f_s:keyword | f_l:integer
 🐱Meow!🐶Woof! | Meow!🐶Woof! | 3

+ 3 - 6
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Substring.java

@@ -86,12 +86,9 @@ public class Substring extends EsqlScalarFunction implements OptionalArgument {
 
     @Evaluator(extraName = "NoLength")
     static BytesRef process(BytesRef str, int start) {
-        if (str.length == 0) {
-            return null;
-        }
-        int codePointCount = UnicodeUtil.codePointCount(str);
-        int indexStart = indexStart(codePointCount, start);
-        return new BytesRef(str.utf8ToString().substring(indexStart));
+        int length = str.length; // any value >= the string's length means "to the end"; the UTF-8 byte count in str.length is never smaller than that
+        return process(str, start, length);
+
     }
 
     @Evaluator

+ 14 - 0
x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/SubstringTests.java

@@ -25,6 +25,7 @@ import org.hamcrest.Matcher;
 import java.util.List;
 import java.util.function.Supplier;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.elasticsearch.compute.data.BlockUtils.toJavaObject;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
@@ -161,6 +162,19 @@ public class SubstringTests extends AbstractScalarFunctionTestCase {
         assert s.length() == 8 && s.codePointCount(0, s.length()) == 7;
         assertThat(process(s, 3, 1000), equalTo("tiger"));
         assertThat(process(s, -6, 1000), equalTo("\ud83c\udf09tiger"));
+        assert "🐱".length() == 2 && "🐶".length() == 2;
+        assert "🐱".codePointCount(0, 2) == 1 && "🐶".codePointCount(0, 2) == 1;
+        assert "🐱".getBytes(UTF_8).length == 4 && "🐶".getBytes(UTF_8).length == 4;
+
+        for (Integer len : new Integer[] { null, 100, 100000 }) {
+            assertThat(process(s, 3, len), equalTo("tiger"));
+            assertThat(process(s, -6, len), equalTo("\ud83c\udf09tiger"));
+
+            assertThat(process("🐱Meow!🐶Woof!", 0, len), equalTo("🐱Meow!🐶Woof!"));
+            assertThat(process("🐱Meow!🐶Woof!", 1, len), equalTo("🐱Meow!🐶Woof!"));
+            assertThat(process("🐱Meow!🐶Woof!", 2, len), equalTo("Meow!🐶Woof!"));
+            assertThat(process("🐱Meow!🐶Woof!", 3, len), equalTo("eow!🐶Woof!"));
+        }
     }
 
     public void testNegativeLength() {