浏览代码

Encode using 40, 48 and 56 bits per value (#93371)

Note that we use the encoding as follows:
* for values taking [33, 40] bits per value encode using 40 bits per value
* for values taking [41, 48] bits per value encode using 48 bits per value
* for values taking [49, 56] bits per value encode using 56 bits per value

This is an improvement over the encoding used by ForUtil, which does
not apply any compression for values taking more than 32 bits per value.

Note that 40, 48 and 56 bits per value represent exact multiples of bytes
(40 bits per value = 5 bytes, 48 bits per value = 6 bytes and 56 bits per
value = 7 bytes). As a result we always write values using 3, 2 or 1 bytes
fewer than the 8 bytes required for a long value.

We also apply compression to gauge metrics under the assumption that
compressing values taking more than 32 bits per value works well for
floating point values, because of the way floating point values
are represented (IEEE 754 format).
Salvatore Campagna 2 年之前
父节点
当前提交
146b605269

+ 1 - 1
benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/DecodeDecreasingIntegerBenchmark.java

@@ -37,7 +37,7 @@ import java.util.concurrent.TimeUnit;
 public class DecodeDecreasingIntegerBenchmark {
     private static final int SEED = 17;
     private static final int BLOCK_SIZE = 128;
-    @Param({ "4", "8", "12", "16", "24", "28", "32", "36", "40", "44", "48", "52", "56", "64" })
+    @Param({ "4", "8", "12", "16", "20", "24", "28", "32", "36", "40", "44", "48", "52", "56", "60", "64" })
     private int bitsPerValue;
 
     private final AbstractDocValuesForUtilBenchmark decode;

+ 1 - 1
benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/DecodeIncreasingIntegerBenchmark.java

@@ -37,7 +37,7 @@ import java.util.concurrent.TimeUnit;
 public class DecodeIncreasingIntegerBenchmark {
     private static final int SEED = 17;
     private static final int BLOCK_SIZE = 128;
-    @Param({ "4", "8", "12", "16", "24", "28", "32", "36", "40", "44", "48", "52", "56", "64" })
+    @Param({ "4", "8", "12", "16", "20", "24", "28", "32", "36", "40", "44", "48", "52", "56", "60", "64" })
     private int bitsPerValue;
 
     private final AbstractDocValuesForUtilBenchmark decode;

+ 1 - 1
benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/DecodeNonSortedIntegerBenchmark.java

@@ -37,7 +37,7 @@ import java.util.concurrent.TimeUnit;
 public class DecodeNonSortedIntegerBenchmark {
     private static final int SEED = 17;
     private static final int BLOCK_SIZE = 128;
-    @Param({ "4", "8", "12", "16", "24", "28", "32", "36", "40", "44", "48", "52", "56", "64" })
+    @Param({ "4", "8", "12", "16", "20", "24", "28", "32", "36", "40", "44", "48", "52", "56", "60", "64" })
     private int bitsPerValue;
 
     private final AbstractDocValuesForUtilBenchmark decode;

+ 1 - 1
benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/EncodeDecreasingIntegerBenchmark.java

@@ -37,7 +37,7 @@ import java.util.concurrent.TimeUnit;
 public class EncodeDecreasingIntegerBenchmark {
     private static final int SEED = 17;
     private static final int BLOCK_SIZE = 128;
-    @Param({ "4", "8", "12", "16", "24", "28", "32", "36", "40", "44", "48", "52", "56", "64" })
+    @Param({ "4", "8", "12", "16", "20", "24", "28", "32", "36", "40", "44", "48", "52", "56", "60", "64" })
     private int bitsPerValue;
 
     private final AbstractDocValuesForUtilBenchmark encode;

+ 1 - 1
benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/EncodeIncreasingIntegerBenchmark.java

@@ -37,7 +37,7 @@ import java.util.concurrent.TimeUnit;
 public class EncodeIncreasingIntegerBenchmark {
     private static final int SEED = 17;
     private static final int BLOCK_SIZE = 128;
-    @Param({ "4", "8", "12", "16", "24", "28", "32", "36", "40", "44", "48", "52", "56", "64" })
+    @Param({ "4", "8", "12", "16", "20", "24", "28", "32", "36", "40", "44", "48", "52", "56", "60", "64" })
     private int bitsPerValue;
 
     private final AbstractDocValuesForUtilBenchmark encode;

+ 1 - 1
benchmarks/src/main/java/org/elasticsearch/benchmark/index/codec/tsdb/EncodeNonSortedIntegerBenchmark.java

@@ -37,7 +37,7 @@ import java.util.concurrent.TimeUnit;
 public class EncodeNonSortedIntegerBenchmark {
     private static final int SEED = 17;
     private static final int BLOCK_SIZE = 128;
-    @Param({ "4", "8", "12", "16", "24", "28", "32", "36", "40", "44", "48", "52", "56", "64" })
+    @Param({ "4", "8", "12", "16", "20", "24", "28", "32", "36", "40", "44", "48", "52", "56", "60", "64" })
     private int bitsPerValue;
 
     private final AbstractDocValuesForUtilBenchmark encode;

+ 31 - 0
docs/changelog/93371.yaml

@@ -0,0 +1,31 @@
+pr: 93371
+summary: "Encode using 40, 48 and 56 bits per value"
+area: TSDB
+type: feature
+issues: []
+highlight:
+  title: "Encode using 40, 48 and 56 bits per value"
+  body: |-
+    We use the encoding as follows:
+    * for values taking [33, 40] bits per value, encode using 40 bits per value
+    * for values taking [41, 48] bits per value, encode using 48 bits per value
+    * for values taking [49, 56] bits per value, encode using 56 bits per value
+
+    This is an improvement over the encoding used by ForUtil, which does not
+    apply any compression for values taking more than 32 bits per value.
+
+    Note that 40, 48 and 56 bits per value represent exact multiples of
+    bytes (5, 6 and 7 bytes per value). As a result, we always write values
+    using 3, 2 or 1 bytes fewer than the 8 bytes required for a long value.
+
+    Looking at the savings in stored bytes, for a block of 128 (long) values we
+    would normally store 128 x 8 bytes = 1024 bytes, while now we have the following:
+    * 40 bits per value: write 645 bytes instead of 1024, saving 379 bytes (37%)
+    * 48 bits per value: write 772 bytes instead of 1024, saving 252 bytes (24%)
+    * 56 bits per value: write 897 bytes instead of 1024, saving 127 bytes (12%)
+
+    We also apply compression to gauge metrics under the assumption that
+    compressing values taking more than 32 bits per value works well for
+    floating point values, because of the way floating point values are
+    represented (IEEE 754 format).
+  notable: true

+ 2 - 1
server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodec.java

@@ -123,7 +123,8 @@ public class PerFieldMapperCodec extends Lucene95Codec {
             final MappingLookup mappingLookup = mapperService.mappingLookup();
             if (mappingLookup.getMapper(field) instanceof NumberFieldMapper) {
                 final MappedFieldType fieldType = mappingLookup.getFieldType(field);
-                return TimeSeriesParams.MetricType.COUNTER.equals(fieldType.getMetricType());
+                return TimeSeriesParams.MetricType.COUNTER.equals(fieldType.getMetricType())
+                    || TimeSeriesParams.MetricType.GAUGE.equals(fieldType.getMetricType());
             }
         }
         return false;

+ 49 - 2
server/src/main/java/org/elasticsearch/index/codec/tsdb/DocValuesForUtil.java

@@ -10,12 +10,18 @@ package org.elasticsearch.index.codec.tsdb;
 
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
+import org.elasticsearch.common.util.ByteUtils;
 
 import java.io.IOException;
 
 public class DocValuesForUtil {
+    private static final int BITS_IN_FOUR_BYTES = 4 * Byte.SIZE;
+    private static final int BITS_IN_FIVE_BYTES = 5 * Byte.SIZE;
+    private static final int BITS_IN_SIX_BYTES = 6 * Byte.SIZE;
+    private static final int BITS_IN_SEVEN_BYTES = 7 * Byte.SIZE;
     private final ForUtil forUtil = new ForUtil();
     private final int blockSize;
+    private final byte[] encoded;
 
     public DocValuesForUtil() {
         this(ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE);
@@ -23,9 +29,25 @@ public class DocValuesForUtil {
 
     private DocValuesForUtil(int blockSize) {
         this.blockSize = blockSize;
+        this.encoded = new byte[1024]; // NOTE(review): scratch buffer for the 5/6/7-byte encoder; fixed size, not derived from blockSize
     }
 
-    public void encode(long[] in, int bitsPerValue, DataOutput out) throws IOException {
+    public static int roundBits(int bitsPerValue) { // rounds widths above 24 bits up to a whole number of bytes (32, 40, 48, 56 or 64)
+        if (bitsPerValue > 24 && bitsPerValue <= 32) {
+            return BITS_IN_FOUR_BYTES;
+        } else if (bitsPerValue > 32 && bitsPerValue <= BITS_IN_FIVE_BYTES) {
+            return BITS_IN_FIVE_BYTES;
+        } else if (bitsPerValue > BITS_IN_FIVE_BYTES && bitsPerValue <= BITS_IN_SIX_BYTES) {
+            return BITS_IN_SIX_BYTES;
+        } else if (bitsPerValue > BITS_IN_SIX_BYTES && bitsPerValue <= BITS_IN_SEVEN_BYTES) {
+            return BITS_IN_SEVEN_BYTES;
+        } else if (bitsPerValue > BITS_IN_SEVEN_BYTES) {
+            return Long.BYTES * Byte.SIZE; // anything above 56 bits maps to a full 64-bit long
+        }
+        return bitsPerValue; // widths of 24 bits or fewer are returned unchanged
+    }
+
+    public void encode(long[] in, int bitsPerValue, final DataOutput out) throws IOException {
         if (bitsPerValue <= 24) { // these bpvs are handled efficiently by ForUtil
             forUtil.encode(in, bitsPerValue, out);
         } else if (bitsPerValue <= 32) {
@@ -33,24 +55,49 @@ public class DocValuesForUtil {
             for (int i = 0; i < blockSize / 2; ++i) {
                 out.writeLong(in[i]);
             }
+        } else if (bitsPerValue == BITS_IN_FIVE_BYTES || bitsPerValue == BITS_IN_SIX_BYTES || bitsPerValue == BITS_IN_SEVEN_BYTES) {
+            encodeFiveSixOrSevenBytesPerValue(in, bitsPerValue, out);
         } else {
+            assert bitsPerValue > 56 : "bitsPerValue must be greater than 56 but was [" + bitsPerValue + "]";
             for (long l : in) {
                 out.writeLong(l);
             }
         }
     }
 
-    public void decode(int bitsPerValue, DataInput in, long[] out) throws IOException {
+    private void encodeFiveSixOrSevenBytesPerValue(long[] in, int bitsPerValue, final DataOutput out) throws IOException { // packs each value into bitsPerValue / 8 bytes
+        int bytesPerValue = bitsPerValue / Byte.SIZE;
+        for (int i = 0; i < in.length; ++i) {
+            ByteUtils.writeLongLE(in[i], this.encoded, i * bytesPerValue); // NOTE(review): presumably writes 8 bytes, so the next value overwrites the unused high bytes
+        }
+        out.writeBytes(this.encoded, bytesPerValue * in.length); // only the packed prefix of the scratch buffer is written out
+    }
+
+    public void decode(int bitsPerValue, final DataInput in, long[] out) throws IOException { // reads one block into out, choosing the layout from bitsPerValue
         if (bitsPerValue <= 24) {
             forUtil.decode(bitsPerValue, in, out);
         } else if (bitsPerValue <= 32) {
             in.readLongs(out, 0, blockSize / 2); // half a block of longs; presumably two 32-bit values per long, unpacked by expand32
             expand32(out);
+        } else if (bitsPerValue == BITS_IN_FIVE_BYTES || bitsPerValue == BITS_IN_SIX_BYTES || bitsPerValue == BITS_IN_SEVEN_BYTES) {
+            decodeFiveSixOrSevenBytesPerValue(bitsPerValue, in, out);
         } else {
+            assert bitsPerValue > 56 : "bitsPerValue must be greater than 56 but was [" + bitsPerValue + "]"; // only checked when assertions are enabled
             in.readLongs(out, 0, blockSize); // one full long per value
         }
     }
 
+    private void decodeFiveSixOrSevenBytesPerValue(int bitsPerValue, final DataInput in, long[] out) throws IOException { // unpacks blockSize values of bitsPerValue / 8 bytes each
+        // NOTE: we expect multibyte values to be written "least significant byte" first
+        int bytesPerValue = bitsPerValue / Byte.SIZE;
+        long mask = (1L << bitsPerValue) - 1; // keeps only the low bitsPerValue bits of each 8-byte read
+        byte[] buffer = new byte[bytesPerValue * blockSize + Long.BYTES - bytesPerValue]; // padded so the last 8-byte read stays in bounds; NOTE(review): allocated on every call
+        in.readBytes(buffer, 0, bytesPerValue * blockSize);
+        for (int i = 0; i < blockSize; ++i) {
+            out[i] = ByteUtils.readLongLE(buffer, i * bytesPerValue) & mask; // the mask drops the bytes that belong to the following value
+        }
+    }
+
     private static void collapse32(long[] arr) {
         for (int i = 0; i < 64; ++i) {
             arr[i] = (arr[i] << 32) | arr[64 + i];

+ 1 - 2
server/src/main/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesEncoder.java

@@ -125,8 +125,7 @@ public class ES87TSDBDocValuesEncoder {
             or |= l;
         }
 
-        final int bitsPerValue = or == 0 ? 0 : PackedInts.unsignedBitsRequired(or);
-
+        int bitsPerValue = or == 0 ? 0 : DocValuesForUtil.roundBits(PackedInts.unsignedBitsRequired(or));
         out.writeVInt((bitsPerValue << tokenBits) | token);
         if (bitsPerValue > 0) {
             forUtil.encode(in, bitsPerValue, out);

+ 1 - 1
server/src/test/java/org/elasticsearch/index/codec/tsdb/DocValuesForUtilTests.java

@@ -39,7 +39,7 @@ public class DocValuesForUtilTests extends LuceneTestCase {
 
         for (int i = 0; i < iterations; ++i) {
             final int bpv = TestUtil.nextInt(random(), 1, 64);
-            bpvs[i] = bpv;
+            bpvs[i] = DocValuesForUtil.roundBits(bpv);
             for (int j = 0; j < ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE; ++j) {
                 values[i * ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE + j] = bpv == 64
                     ? random().nextLong()

+ 36 - 0
server/src/test/java/org/elasticsearch/index/codec/tsdb/ES87TSDBDocValuesEncoderTests.java

@@ -17,6 +17,7 @@ import org.apache.lucene.util.NumericUtils;
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Random;
 
 public class ES87TSDBDocValuesEncoderTests extends LuceneTestCase {
 
@@ -135,6 +136,41 @@ public class ES87TSDBDocValuesEncoderTests extends LuceneTestCase {
         doTest(arr, expectedNumBytes);
     }
 
+    public void testFloatingPointValues() throws IOException { // feeds doubles to the encoder as their raw long bit patterns
+        long[] arr = new long[blockSize];
+        // NOTE: these values are crafted in such a way that after applying GCD encoding we get values represented using 36 bits per value.
+        for (int i = 0; i < blockSize; ++i) {
+            double value = (i % 2 == 1) ? (i * 1956.0) : (i * 356923.5);
+            arr[i] = Double.doubleToLongBits(value);
+        }
+        // NOTE: only 36 bits per value are strictly required, but we round up to 40 bits per value to write exactly 5 bytes per value
+        final long expectedNumBytes = 6 // token (2 bytes) + GCD (4 bytes)
+            + (blockSize * 40) / Byte.SIZE; // data
+        doTest(arr, expectedNumBytes);
+    }
+
+    public void testBitsPerValueFullRange() throws IOException { // runs doTest once for every bit width from 0 to 64
+        final Random random = new Random(17); // fixed seed keeps the generated values reproducible
+        long[] arr = new long[blockSize];
+        long constant = 1;
+        for (int bitsPerValue = 0; bitsPerValue <= 64; bitsPerValue++) {
+            for (int i = 0; i < blockSize; ++i) {
+                if (bitsPerValue == 0) {
+                    arr[i] = constant; // width 0 is exercised with a block of identical values
+                } else {
+                    arr[i] = random.nextLong(0, bitsPerValue <= 62 ? 1L << bitsPerValue : Long.MAX_VALUE); // 1L << 63 is negative, so widths 63 and 64 use Long.MAX_VALUE
+                }
+            }
+            long actualBitsPerValue = DocValuesForUtil.roundBits(bitsPerValue);
+            int actualTokenBytes = bitsPerValue < 16 ? 1 : 2; // NOTE(review): presumably the variable-length token needs 2 bytes from 16 bits per value — confirm
+            final long expectedNumBytes = bitsPerValue == 0
+                ? 2
+                : actualTokenBytes // token
+                    + (blockSize * actualBitsPerValue) / Byte.SIZE; // data
+            doTest(arr, expectedNumBytes);
+        }
+    }
+
     private void doTest(long[] arr, long expectedNumBytes) throws IOException {
         final long[] expected = arr.clone();
         try (Directory dir = newDirectory()) {