Explorar el Código

Fix ES|QL locate with supplementary (4-byte) character (#107172)

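This commit fixes the ES|QL `locate` function for strings that contain supplementary (4-byte) characters. The returned position is now counted in code points rather than UTF-16 code units, and 0 is still returned when the substring is not found.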
Chris Hegarty hace 1 año
padre
commit
9edd67f911

+ 1 - 1
x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec

@@ -1246,7 +1246,7 @@ locateUtf16Emoji#[skip:-8.13.99,reason:new string function added in 8.14]
 row a = "🐱Meow!🐶Woof!" | eval f_s = substring(a, 2) | eval f_l = locate(a, f_s);
 
 a:keyword | f_s:keyword | f_l:integer
-🐱Meow!🐶Woof! | Meow!🐶Woof! | 3
+🐱Meow!🐶Woof! | Meow!🐶Woof! | 2
 ;
 
 locateNestedSubstring#[skip:-8.13.99,reason:new string function added in 8.14]

+ 5 - 1
x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Locate.java

@@ -96,7 +96,11 @@ public class Locate extends EsqlScalarFunction implements OptionalArgument {
         int codePointCount = UnicodeUtil.codePointCount(str);
         int indexStart = indexStart(codePointCount, start);
         String utf8ToString = str.utf8ToString();
-        return 1 + utf8ToString.indexOf(substr.utf8ToString(), utf8ToString.offsetByCodePoints(0, indexStart));
+        int idx = utf8ToString.indexOf(substr.utf8ToString(), utf8ToString.offsetByCodePoints(0, indexStart));
+        if (idx == -1) {
+            return 0;
+        }
+        return 1 + utf8ToString.codePointCount(0, idx);
     }
 
     @Evaluator(extraName = "NoStart")

+ 53 - 0
x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/LocateTests.java

@@ -25,6 +25,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.function.Supplier;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.elasticsearch.compute.data.BlockUtils.toJavaObject;
 import static org.hamcrest.Matchers.equalTo;
 
@@ -131,6 +132,58 @@ public class LocateTests extends AbstractFunctionTestCase {
         assertThat(process("界世", "界世", 0), equalTo(1));
     }
 
+    public void testSupplementaryCharacter() {
+        // some assertions about the supplementary (4-byte) character we'll use for testing
+        assert "𠜎".length() == 2;
+        assert "𠜎".codePointCount(0, 2) == 1;
+        assert "𠜎".getBytes(UTF_8).length == 4;
+
+        assertThat(process("a ti𠜎er", "𠜎er", 0), equalTo(5));
+        assertThat(process("a ti𠜎er", "i𠜎e", 0), equalTo(4));
+        assertThat(process("a ti𠜎er", "ti𠜎", 0), equalTo(3));
+        assertThat(process("a ti𠜎er", "er", 0), equalTo(6));
+        assertThat(process("a ti𠜎er", "r", 0), equalTo(7));
+
+        assertThat(process("𠜎a ti𠜎er", "𠜎er", 0), equalTo(6));
+        assertThat(process("𠜎a ti𠜎er", "i𠜎e", 0), equalTo(5));
+        assertThat(process("𠜎a ti𠜎er", "ti𠜎", 0), equalTo(4));
+        assertThat(process("𠜎a ti𠜎er", "er", 0), equalTo(7));
+        assertThat(process("𠜎a ti𠜎er", "r", 0), equalTo(8));
+
+        // exact
+        assertThat(process("a ti𠜎er", "a ti𠜎er", 0), equalTo(1));
+        assertThat(process("𠜎𠜎𠜎abc", "𠜎𠜎𠜎abc", 0), equalTo(1));
+        assertThat(process(" 𠜎𠜎𠜎abc", " 𠜎𠜎𠜎abc", 0), equalTo(1));
+        assertThat(process("𠜎𠜎𠜎 abc ", "𠜎𠜎𠜎 abc ", 0), equalTo(1));
+
+        // prefix
+        assertThat(process("𠜎abc", "𠜎", 0), equalTo(1));
+        assertThat(process("𠜎 abc", "𠜎 ", 0), equalTo(1));
+        assertThat(process("𠜎𠜎𠜎abc", "𠜎𠜎𠜎", 0), equalTo(1));
+        assertThat(process("𠜎𠜎𠜎 abc", "𠜎𠜎𠜎 ", 0), equalTo(1));
+        assertThat(process(" 𠜎𠜎𠜎 abc", " 𠜎𠜎𠜎 ", 0), equalTo(1));
+        assertThat(process("𠜎 𠜎 𠜎 abc", "𠜎 𠜎 𠜎 ", 0), equalTo(1));
+
+        // suffix
+        assertThat(process("abc𠜎", "𠜎", 0), equalTo(4));
+        assertThat(process("abc 𠜎", " 𠜎", 0), equalTo(4));
+        assertThat(process("abc𠜎𠜎𠜎", "𠜎𠜎𠜎", 0), equalTo(4));
+        assertThat(process("abc 𠜎𠜎𠜎", " 𠜎𠜎𠜎", 0), equalTo(4));
+        assertThat(process("abc𠜎𠜎𠜎 ", "𠜎𠜎𠜎 ", 0), equalTo(4));
+
+        // out of range
+        assertThat(process("𠜎a ti𠜎er", "𠜎a ti𠜎ers", 0), equalTo(0));
+        assertThat(process("a ti𠜎er", "aa ti𠜎er", 0), equalTo(0));
+        assertThat(process("abc𠜎𠜎", "𠜎𠜎𠜎", 0), equalTo(0));
+
+        assert "🐱".length() == 2 && "🐶".length() == 2;
+        assert "🐱".codePointCount(0, 2) == 1 && "🐶".codePointCount(0, 2) == 1;
+        assert "🐱".getBytes(UTF_8).length == 4 && "🐶".getBytes(UTF_8).length == 4;
+        assertThat(process("🐱Meow!🐶Woof!", "🐱Meow!🐶Woof!", 0), equalTo(1));
+        assertThat(process("🐱Meow!🐶Woof!", "Meow!🐶Woof!", 0), equalTo(2));
+        assertThat(process("🐱Meow!🐶Woof!", "eow!🐶Woof!", 0), equalTo(3));
+    }
+
     private Integer process(String str, String substr, Integer start) {
         try (
             EvalOperator.ExpressionEvaluator eval = evaluator(