Răsfoiți Sursa

[ML] remove use of hppc collections from NLP tokenization (#84815)

Replaces the use of hppc collections with native Java collections.

The performance difference is near 0.
Benjamin Trent 3 ani în urmă
părinte
comite
947a342ef2

+ 2 - 3
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/BasicTokenFilter.java

@@ -7,7 +7,6 @@
 
 package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;
 
-import com.carrotsearch.hppc.IntArrayList;
 import com.ibm.icu.text.Normalizer;
 import com.ibm.icu.text.Normalizer2;
 
@@ -146,8 +145,8 @@ public final class BasicTokenFilter extends TokenFilter {
         if (normalizer.quickCheck(termAtt) != Normalizer.YES) {
             normalizer.normalize(termAtt, accentBuffer);
         }
-        IntArrayList badIndices = new IntArrayList();
-        IntArrayList charCount = new IntArrayList();
+        List<Integer> badIndices = new ArrayList<>();
+        List<Integer> charCount = new ArrayList<>();
         int index = 0;
         for (PrimitiveIterator.OfInt it = accentBuffer.codePoints().iterator(); it.hasNext();) {
             int cp = it.next();

+ 12 - 5
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/ControlCharFilter.java

@@ -7,13 +7,14 @@
 
 package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;
 
-import com.carrotsearch.hppc.CharArrayList;
-
 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
 
 import java.io.CharArrayReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 /**
  * Char filter for removing control chars from a stream
@@ -47,7 +48,7 @@ public class ControlCharFilter extends BaseCharFilter {
     }
 
     private void fill() throws IOException {
-        CharArrayList charArrayList = new CharArrayList(1024);
+        List<char[]> charArrays = new ArrayList<>();
         char[] temp = new char[1024];
         int totalRead = 0;
         int diff = 0;
@@ -74,12 +75,18 @@ public class ControlCharFilter extends BaseCharFilter {
                         break;
                     }
                 }
-                charArrayList.add(temp, start, size);
+                charArrays.add(Arrays.copyOfRange(temp, start, start + size));
                 pos = start + size;
             }
             totalRead += cnt;
         }
-        transformedInput = new CharArrayReader(charArrayList.toArray());
+        char[] wholeArray = new char[charArrays.stream().mapToInt(cs -> cs.length).sum()];
+        int currIndex = 0;
+        for (char[] elements : charArrays) {
+            System.arraycopy(elements, 0, wholeArray, currIndex, elements.length);
+            currIndex += elements.length;
+        }
+        transformedInput = new CharArrayReader(wholeArray);
     }
 
     private static boolean isControlChar(char c) {