Bläddra i källkod

Grok: Handle utf-8 natively (#62794)

This adds a method to `Grok` that matches against a section of a utf-8
encoded byte array, identified by an offset and a length:
```
Map<String, Object> captures(byte[] utf8Bytes, int offset, int length)
```

This'll be useful for the grok-flavored runtime fields because they
want to match against utf-8 encoded strings stored in a big array. And
joni already supports this.
Nik Everett 5 år sedan
förälder
incheckning
59eac2262e

+ 16 - 6
libs/grok/src/main/java/org/elasticsearch/grok/Grok.java

@@ -235,19 +235,29 @@ public final class Grok {
     }
 
     /**
-     * Matches and returns any named captures within a compiled grok expression that matched
-     * within the provided text.
+     * Matches and returns any named captures.
      *
      * @param text the text to match and extract values from.
      * @return a map containing field names and their respective coerced values that matched.
      */
     public Map<String, Object> captures(String text) {
-        byte[] textAsBytes = text.getBytes(StandardCharsets.UTF_8);
-        Matcher matcher = compiledExpression.matcher(textAsBytes);
+        byte[] utf8Bytes = text.getBytes(StandardCharsets.UTF_8);
+        return captures(utf8Bytes, 0, utf8Bytes.length);
+    }
+
+    /**
+     * Matches and returns any named captures.
+     * @param utf8Bytes array containing the text to match against encoded in utf-8
+     * @param offset offset into {@code utf8Bytes} of the start of the text
+     * @param length length of the text to match
+     * @return a map containing field names and their respective coerced values that matched.
+     */
+    public Map<String, Object> captures(byte[] utf8Bytes, int offset, int length) {
+        Matcher matcher = compiledExpression.matcher(utf8Bytes, offset, offset + length);
         int result;
         try {
             matcherWatchdog.register(matcher);
-            result = matcher.search(0, textAsBytes.length, Option.DEFAULT);
+            result = matcher.search(offset, length, Option.DEFAULT);
         } finally {
             matcherWatchdog.unregister(matcher);
         }
@@ -261,7 +271,7 @@ public final class Grok {
             Map<String, Object> fields = new HashMap<>(captureConfig.size());
             Region region = matcher.getEagerRegion();
             for (GrokCaptureConfig config: captureConfig) {
-                Object v = config.extract(textAsBytes, region);
+                Object v = config.extract(utf8Bytes, offset, region);
                 if (v != null) {
                     fields.put(config.name(), v);
                 }

+ 2 - 2
libs/grok/src/main/java/org/elasticsearch/grok/GrokCaptureConfig.java

@@ -54,10 +54,10 @@ public final class GrokCaptureConfig {
         return type;
     }
 
-    Object extract(byte[] textAsBytes, Region region) {
+    Object extract(byte[] utf8Bytes, int offset, Region region) {
         for (int number : backRefs) {
             if (region.beg[number] >= 0) {
-                String matchValue = new String(textAsBytes, region.beg[number], region.end[number] - region.beg[number],
+                String matchValue = new String(utf8Bytes, offset + region.beg[number], region.end[number] - region.beg[number],
                     StandardCharsets.UTF_8);
                 return type.parse(matchValue);
             }

+ 15 - 3
libs/grok/src/test/java/org/elasticsearch/grok/GrokTests.java

@@ -21,6 +21,7 @@ package org.elasticsearch.grok;
 
 import org.elasticsearch.test.ESTestCase;
 
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -45,10 +46,21 @@ import static org.hamcrest.Matchers.nullValue;
 
 public class GrokTests extends ESTestCase {
     public void testMatchWithoutCaptures() {
-        String line = "value";
         Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "value", logger::warn);
-        Map<String, Object> matches = grok.captures(line);
-        assertEquals(0, matches.size());
+        assertThat(grok.captures("value"), equalTo(Map.of()));
+        assertThat(grok.captures("prefix_value"), equalTo(Map.of()));
+        assertThat(grok.captures("no_match"), nullValue());
+    }
+
+    public void testCaputuresBytes() {
+        Grok grok = new Grok(Grok.BUILTIN_PATTERNS, "%{NUMBER:n:int}", logger::warn);
+        byte[] utf8 = "10".getBytes(StandardCharsets.UTF_8);
+        assertThat(grok.captures(utf8, 0, utf8.length), equalTo(Map.of("n", 10)));
+        assertThat(grok.captures(utf8, 0, 1), equalTo(Map.of("n", 1)));
+        utf8 = "10 11 12".getBytes(StandardCharsets.UTF_8);
+        assertThat(grok.captures(utf8, 0, 2), equalTo(Map.of("n", 10)));
+        assertThat(grok.captures(utf8, 3, 2), equalTo(Map.of("n", 11)));
+        assertThat(grok.captures(utf8, 6, 2), equalTo(Map.of("n", 12)));
     }
 
     public void testNoMatchingPatternInDictionary() {