Browse Source

Efficiently encode multi-valued dimensions (#105271)

Detects and efficiently encodes cyclic ordinals, as proposed by
@jpountz. This is beneficial for encoding dimensions that are
multivalued, such as host.ip.

A follow-up on #99747
Felix Barnsteiner 1 year ago
parent
commit
f36dff7485

+ 4 - 0
server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java

@@ -23,6 +23,7 @@ import org.elasticsearch.index.codec.postings.ES812PostingsFormat;
 import org.elasticsearch.index.codec.tsdb.ES87TSDBDocValuesFormat;
 import org.elasticsearch.index.mapper.DateFieldMapper;
 import org.elasticsearch.index.mapper.IdFieldMapper;
+import org.elasticsearch.index.mapper.IpFieldMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.Mapper;
 import org.elasticsearch.index.mapper.MapperService;
@@ -125,6 +126,9 @@ public final class PerFieldMapperCodec extends Lucene99Codec {
             if (mappingLookup.getMapper(field) instanceof TimeSeriesIdFieldMapper) {
                 return true;
             }
+            if (mappingLookup.getMapper(field) instanceof IpFieldMapper) {
+                return true;
+            }
         }
         return false;
     }

+ 66 - 25
server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesEncoder.java

@@ -182,35 +182,57 @@ public class ES87TSDBDocValuesEncoder {
      * Optimizes for encoding sorted fields where we expect a block to mostly either be the same value
      * or to make a transition from one value to a second one.
      * <p>
-     * Encodes blocks in the following format:
+     * The header is a vlong where the number of trailing ones defines the encoding strategy:
      * <ul>
-     *     <li>byte 0: 1/2 bits header+6/7 bits data</li>
-     *     <li>byte 1..n: data</li>
-     * </ul>
-     * The header (first 1 or 2 bits) describes how the data is encoded:
-     * <ul>
-     *     <li>?0 block has a single value (vlong), 2nd bit already contains data</li>
-     *     <li>
-     *         01 block has two runs, data contains value 1 (vlong), run-length (vint) of value 1,
-     *         and delta from first to second value (zlong)
-     *     </li>
-     *     <li>11 block is bit-packed</li>
+     *   <li>0: single run</li>
+     *   <li>1: two runs</li>
+     *   <li>2: bit-packed</li>
+     *   <li>3: cycle</li>
      * </ul>
      */
     void encodeOrdinals(long[] in, DataOutput out, int bitsPerOrd) throws IOException {
         assert in.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;
         int numRuns = 1;
+        long firstValue = in[0];
+        long previousValue = firstValue;
+        boolean cyclic = false;
+        int cycleLength = 0;
         for (int i = 1; i < in.length; ++i) {
-            if (in[i - 1] != in[i]) {
+            long currentValue = in[i];
+            if (previousValue != currentValue) {
                 numRuns++;
             }
+            if (currentValue == firstValue && cycleLength != -1) {
+                if (cycleLength == 0) {
+                    // first candidate cycle detected
+                    cycleLength = i;
+                } else if (cycleLength == 1 || i % cycleLength != 0) {
+                    // if the first two values are the same this isn't a cycle, it might be a run, though
+                    // this also isn't a cycle if the index of the next occurrence of the first value
+                    // isn't a multiple of the candidate cycle length
+                    // we can stop looking for cycles now
+                    cycleLength = -1;
+                }
+            }
+            previousValue = currentValue;
+        }
+        // if the cycle is too long, bit-packing may be more space efficient
+        int maxCycleLength = in.length / 4;
+        if (numRuns > 2 && cycleLength > 1 && cycleLength <= maxCycleLength) {
+            cyclic = true;
+            for (int i = cycleLength; i < in.length; ++i) {
+                if (in[i] != in[i - cycleLength]) {
+                    cyclic = false;
+                    break;
+                }
+            }
         }
         if (numRuns == 1 && bitsPerOrd < 63) {
             long value = in[0];
-            // set first bit to 0 to indicate the block has a single run
+            // unset first bit (0 trailing ones) to indicate the block has a single run
             out.writeVLong(value << 1);
         } else if (numRuns == 2 && bitsPerOrd < 62) {
-            // set first two bits to 01 to indicate the block has two runs
+            // set 1 trailing bit to indicate the block has two runs
             out.writeVLong((in[0] << 2) | 0b01);
             int firstRunLen = in.length;
             for (int i = 1; i < in.length; ++i) {
@@ -221,8 +243,15 @@ public class ES87TSDBDocValuesEncoder {
             }
             out.writeVInt(firstRunLen);
             out.writeZLong(in[in.length - 1] - in[0]);
+        } else if (cyclic) {
+            // set 3 trailing bits to indicate the block cycles through the same values
+            long headerAndCycleLength = ((long) cycleLength << 4) | 0b0111;
+            out.writeVLong(headerAndCycleLength);
+            for (int i = 0; i < cycleLength; i++) {
+                out.writeVLong(in[i]);
+            }
         } else {
-            // set first two bits to 11 to indicate the block is bit-packed
+            // set 2 trailing bits to indicate the block is bit-packed
             out.writeVLong(0b11);
             forUtil.encode(in, bitsPerOrd, out);
         }
@@ -232,20 +261,32 @@ public class ES87TSDBDocValuesEncoder {
         assert out.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE : out.length;
 
         long v1 = in.readVLong();
-        int header = (int) (v1 & 0b11L);
-        if (header == 0b00 || header == 0b10) {
-            // first bit is zero -> single run
-            Arrays.fill(out, v1 >>> 1);
-        } else if (header == 0b01) {
-            // first two bits are 01 -> two runs
-            v1 = v1 >>> 2;
+        int encoding = Long.numberOfTrailingZeros(~v1);
+        v1 >>>= encoding + 1;
+        if (encoding == 0) {
+            // single run
+            Arrays.fill(out, v1);
+        } else if (encoding == 1) {
+            // two runs
             int runLen = in.readVInt();
             long v2 = v1 + in.readZLong();
             Arrays.fill(out, 0, runLen, v1);
             Arrays.fill(out, runLen, out.length, v2);
-        } else {
-            // first two bits are 11 -> bit-packed
+        } else if (encoding == 2) {
+            // bit-packed
             forUtil.decode(bitsPerOrd, in, out);
+        } else if (encoding == 3) {
+            // cycle encoding
+            int cycleLength = (int) v1;
+            for (int i = 0; i < cycleLength; i++) {
+                out[i] = in.readVLong();
+            }
+            int length = cycleLength;
+            while (length < out.length) {
+                int copyLength = Math.min(length, out.length - length);
+                System.arraycopy(out, 0, out, length, copyLength);
+                length += copyLength;
+            }
         }
     }
 

+ 48 - 1
server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesEncoderTests.java

@@ -260,12 +260,59 @@ public class ES87TSDBDocValuesEncoderTests extends LuceneTestCase {
         doTestOrdinals(arr, 113);
     }
 
+    public void testEncodeOrdinalsBitPack3Bits() throws IOException {
+        long[] arr = new long[blockSize];
+        Arrays.fill(arr, 4);
+        for (int i = 0; i < 4; i++) {
+            arr[i] = i;
+        }
+        doTestOrdinals(arr, 49);
+    }
+
+    public void testEncodeOrdinalsCycle2() throws IOException {
+        long[] arr = new long[blockSize];
+        Arrays.setAll(arr, i -> i % 2);
+        doTestOrdinals(arr, 3);
+    }
+
+    public void testEncodeOrdinalsCycle3() throws IOException {
+        long[] arr = new long[blockSize];
+        Arrays.setAll(arr, i -> i % 3);
+        doTestOrdinals(arr, 4);
+    }
+
+    public void testEncodeOrdinalsLongCycle() throws IOException {
+        long[] arr = new long[blockSize];
+        Arrays.setAll(arr, i -> i % 32);
+        doTestOrdinals(arr, 34);
+    }
+
+    public void testEncodeOrdinalsCycleTooLong() throws IOException {
+        long[] arr = new long[blockSize];
+        Arrays.setAll(arr, i -> i % 33);
+        // the cycle is too long, so the values are bit-packed
+        doTestOrdinals(arr, 97);
+    }
+
+    public void testEncodeOrdinalsAlmostCycle() throws IOException {
+        long[] arr = new long[blockSize];
+        Arrays.setAll(arr, i -> i % 3);
+        arr[arr.length - 1] = 4;
+        doTestOrdinals(arr, 49);
+    }
+
+    public void testEncodeOrdinalsDifferentCycles() throws IOException {
+        long[] arr = new long[blockSize];
+        Arrays.setAll(arr, i -> i > 64 ? i % 4 : i % 3);
+        doTestOrdinals(arr, 33);
+    }
+
     private void doTestOrdinals(long[] arr, long expectedNumBytes) throws IOException {
         long maxOrd = 0;
         for (long ord : arr) {
             maxOrd = Math.max(maxOrd, ord);
         }
-        final int bitsPerOrd = PackedInts.bitsRequired(maxOrd - 1);
+        final int bitsPerOrd = PackedInts.bitsRequired(maxOrd);
         final long[] expected = arr.clone();
         try (Directory dir = newDirectory()) {
             try (IndexOutput out = dir.createOutput("tests.bin", IOContext.DEFAULT)) {