Browse Source

Add painless string split function (splitOnToken) (#39772)

Adds two String split functions to Painless that can be used without enabling regexes.
Christian Mesh 6 years ago
parent
commit
0f26c728b4

+ 2 - 0
docs/painless/painless-api-reference/painless-api-reference-shared/packages.asciidoc

@@ -1253,6 +1253,8 @@ See the <<painless-api-reference-shared, Shared API>> for a high-level overview
 * String {java11-javadoc}/java.base/java/lang/String.html#replace(java.lang.CharSequence,java.lang.CharSequence)[replace](CharSequence, CharSequence)
 * String replaceAll(Pattern, Function)
 * String replaceFirst(Pattern, Function)
+* String[] splitOnToken(String)
+* String[] splitOnToken(String, int)
 * boolean {java11-javadoc}/java.base/java/lang/String.html#startsWith(java.lang.String)[startsWith](String)
 * boolean {java11-javadoc}/java.base/java/lang/String.html#startsWith(java.lang.String,int)[startsWith](String, int)
 * CharSequence {java11-javadoc}/java.base/java/lang/CharSequence.html#subSequence(int,int)[subSequence](int, int)

+ 49 - 0
modules/lang-painless/src/main/java/org/elasticsearch/painless/api/Augmentation.java

@@ -503,4 +503,53 @@ public class Augmentation {
     public static String decodeBase64(String receiver) {
         return new String(Base64.getDecoder().decode(receiver.getBytes(StandardCharsets.UTF_8)), StandardCharsets.UTF_8);
     }
+
+    /**
+     * Splits 'receiver' on the literal string 'token' as many times as possible.
+     * Delegates to the three-argument overload with a limit of -1, which that overload treats as "no limit".
+     *
+     * @param receiver the string to split
+     * @param token the literal separator to split on
+     * @return the segments of 'receiver' produced by the three-argument overload
+     */
+    public static String[] splitOnToken(String receiver, String token) {
+        return splitOnToken(receiver, token, -1);
+    }
+
+    /**
+     * Splits 'receiver' on the literal string 'token' into at most 'limit' segments: at most 'limit' - 1 splits
+     * are performed and the last segment holds the unsplit remainder. Any limit less than 1 is ignored (no limit).
+     * Empty segments, including one after a trailing token, are kept in the result.
+     *
+     * If no split can be attempted ('receiver' or 'token' is null or empty, or 'token' is longer than 'receiver'),
+     * a single-element array containing 'receiver' itself is returned; that element is null when 'receiver' is null.
+     *
+     * @param receiver the string to split
+     * @param token the literal separator to split on
+     * @param limit maximum number of segments to return; values less than 1 mean no limit
+     * @return the segments of 'receiver' in order of appearance
+     */
+    public static String[] splitOnToken(String receiver, String token, int limit) {
+        // Check if it's even possible to perform a split; if not, hand back 'receiver' unchanged as the only element
+        if (receiver == null || receiver.length() == 0 || token == null || token.length() == 0 || receiver.length() < token.length()) {
+            return new String[] { receiver };
+        }
+
+        // List of string segments we have found
+        ArrayList<String> result = new ArrayList<String>();
+
+        // Keep track of where we are in the string
+        // indexOf(tok, startPos) is faster than creating a new search context every loop with substring(start, end)
+        int pos = 0;
+
+        // Loop until limit counts down to 1 (one segment is reserved for the remainder), or until no match is left
+        // A limit below 1 (including Integer.MIN_VALUE) wraps around through MAX_VALUE before it can ever reach 1
+        // This edge case should be fine as we are limited by receiver length (Integer.MAX_VALUE) even if we split at every char
+        for(;limit != 1; limit--) {
+
+            // Find the next occurrence of token at or after current pos
+            int idx = receiver.indexOf(token, pos);
+
+            // Reached the end of the string without another match
+            if (idx == -1) {
+                break;
+            }
+
+            // Add the found segment to the result list (empty when two tokens are adjacent)
+            result.add(receiver.substring(pos, idx));
+
+            // Move our search position to the next possible location, just past the matched token
+            pos = idx + token.length();
+        }
+        // Add the remaining string to the result list (empty when the last match ended the string)
+        result.add(receiver.substring(pos));
+
+        // Copy the collected segments into a String[] for the caller
+        return result.toArray(new String[0]);
+    }
 }

+ 2 - 0
modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/java.lang.txt

@@ -758,6 +758,8 @@ class java.lang.String {
   String copyValueOf(char[],int,int)
   String org.elasticsearch.painless.api.Augmentation decodeBase64()
   String org.elasticsearch.painless.api.Augmentation encodeBase64()
+  String[] org.elasticsearch.painless.api.Augmentation splitOnToken(String)
+  String[] org.elasticsearch.painless.api.Augmentation splitOnToken(String, int)
   boolean endsWith(String)
   boolean equalsIgnoreCase(String)
   String format(Locale,String,def[])

+ 40 - 0
modules/lang-painless/src/test/java/org/elasticsearch/painless/AugmentationTests.java

@@ -23,6 +23,7 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 public class AugmentationTests extends ScriptTestCase {
 
@@ -199,4 +200,43 @@ public class AugmentationTests extends ScriptTestCase {
         assertEquals(8, exec("def ft = new org.elasticsearch.painless.FeatureTestObject();" +
             " ft.setX(3); ft.setY(2); return ft.addToTotal(3)"));
     }
+
+    private static class SplitCase { // one split scenario used by testString_SplitOnToken
+        final String input; // string to be split
+        final String token; // literal separator to split on
+        final int count; // limit handed to both String.split and splitOnToken
+
+        SplitCase(String input, String token, int count) {
+            this.input = input;
+            this.token = token;
+            this.count = count;
+        }
+        SplitCase(String input, String token) { // no explicit limit: falls back to -1
+            this(input, token, -1);
+        }
+    }
+    public void testString_SplitOnToken() {
+        SplitCase[] cases = new SplitCase[] {
+            new SplitCase("", ""), // empty input and empty token
+            new SplitCase("a,b,c", ","),
+            new SplitCase("a,b,c", "|"), // token never occurs in the input
+            new SplitCase("a,,b,,c", ","), // adjacent tokens
+            new SplitCase("a,,b,,c", ",", 1),
+            new SplitCase("a,,b,,c", ",", 3),
+            new SplitCase("a,,b,,c", ",", 300), // limit larger than the number of possible segments
+            new SplitCase("a,b,c", "a,b,c,d"), // token longer than the input
+            new SplitCase("aaaaaaa", "a"), // input made up only of tokens
+            new SplitCase("aaaaaaa", "a", 2),
+            new SplitCase("1.1.1.1.111", "1"),
+            new SplitCase("1.1.1.1.111", "."),
+            new SplitCase("1\n1.1.\r\n1\r\n111", "\r\n"), // multi-character token
+        };
+        for (SplitCase split : cases) {
+            // Reference result: String.split on the regex-quoted token with the same limit must equal the script's splitOnToken
+            assertArrayEquals(
+                split.input.split(Pattern.quote(split.token), split.count),
+                (String[])exec("return \""+split.input+"\".splitOnToken(\""+split.token+"\", "+split.count+");")
+            );
+        }
+    }
 }