|
@@ -182,35 +182,57 @@ public class ES87TSDBDocValuesEncoder {
|
|
|
* Optimizes for encoding sorted fields where we expect a block to mostly either be the same value
|
|
|
* or to make a transition from one value to a second one.
|
|
|
* <p>
|
|
|
- * Encodes blocks in the following format:
|
|
|
+ * The header is a vlong where the number of trailing ones defines the encoding strategy:
|
|
|
* <ul>
|
|
|
- * <li>byte 0: 1/2 bits header+6/7 bits data</li>
|
|
|
- * <li>byte 1..n: data</li>
|
|
|
- * </ul>
|
|
|
- * The header (first 1 or 2 bits) describes how the data is encoded:
|
|
|
- * <ul>
|
|
|
- * <li>?0 block has a single value (vlong), 2nd bit already contains data</li>
|
|
|
- * <li>
|
|
|
- * 01 block has two runs, data contains value 1 (vlong), run-length (vint) of value 1,
|
|
|
- * and delta from first to second value (zlong)
|
|
|
- * </li>
|
|
|
- * <li>11 block is bit-packed</li>
|
|
|
+ * <li>0: single run, the remaining header bits hold the value</li>
|
|
|
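+ * <li>1: two runs, the remaining header bits hold value 1, followed by the run-length of value 1 (vint) and the delta to value 2 (zlong)</li>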
|
|
|
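+ * <li>2: bit-packed, the header carries no data and is followed by the block as encoded by {@code forUtil} with {@code bitsPerOrd}</li>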
|
|
|
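+ * <li>3: cycle, the remaining header bits hold the cycle length, followed by each value of the first cycle (vlong)</li>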
|
|
|
* </ul>
|
|
|
*/
|
|
|
void encodeOrdinals(long[] in, DataOutput out, int bitsPerOrd) throws IOException {
|
|
|
assert in.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE;
|
|
|
int numRuns = 1;
|
|
|
+ long firstValue = in[0];
|
|
|
+ long previousValue = firstValue;
|
|
|
+ boolean cyclic = false;
|
|
|
+ int cycleLength = 0;
|
|
|
for (int i = 1; i < in.length; ++i) {
|
|
|
- if (in[i - 1] != in[i]) {
|
|
|
+ long currentValue = in[i];
|
|
|
+ if (previousValue != currentValue) {
|
|
|
numRuns++;
|
|
|
}
|
|
|
+ if (currentValue == firstValue && cycleLength != -1) {
|
|
|
+ if (cycleLength == 0) {
|
|
|
+ // first candidate cycle detected
|
|
|
+ cycleLength = i;
|
|
|
+ } else if (cycleLength == 1 || i % cycleLength != 0) {
|
|
|
+ // if the first two values are the same this isn't a cycle, it might be a run, though
|
|
|
+ // this also isn't a cycle if the index of the next occurrence of the first value
|
|
|
+ // isn't a multiple of the candidate cycle length
|
|
|
+ // we can stop looking for cycles now
|
|
|
+ cycleLength = -1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ previousValue = currentValue;
|
|
|
+ }
|
|
|
+ // if the cycle is too long, bit-packing may be more space efficient
|
|
|
+ int maxCycleLength = in.length / 4;
|
|
|
+ if (numRuns > 2 && cycleLength > 1 && cycleLength <= maxCycleLength) {
|
|
|
+ cyclic = true;
|
|
|
+ for (int i = cycleLength; i < in.length; ++i) {
|
|
|
+ if (in[i] != in[i - cycleLength]) {
|
|
|
+ cyclic = false;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
if (numRuns == 1 && bitsPerOrd < 63) {
|
|
|
long value = in[0];
|
|
|
- // set first bit to 0 to indicate the block has a single run
|
|
|
+ // unset first bit (0 trailing ones) to indicate the block has a single run
|
|
|
out.writeVLong(value << 1);
|
|
|
} else if (numRuns == 2 && bitsPerOrd < 62) {
|
|
|
- // set first two bits to 01 to indicate the block has two runs
|
|
|
+ // set 1 trailing bit to indicate the block has two runs
|
|
|
out.writeVLong((in[0] << 2) | 0b01);
|
|
|
int firstRunLen = in.length;
|
|
|
for (int i = 1; i < in.length; ++i) {
|
|
@@ -221,8 +243,15 @@ public class ES87TSDBDocValuesEncoder {
|
|
|
}
|
|
|
out.writeVInt(firstRunLen);
|
|
|
out.writeZLong(in[in.length - 1] - in[0]);
|
|
|
+ } else if (cyclic) {
|
|
|
+ // set 3 trailing bits to indicate the block cycles through the same values
|
|
|
+ long headerAndCycleLength = ((long) cycleLength << 4) | 0b0111;
|
|
|
+ out.writeVLong(headerAndCycleLength);
|
|
|
+ for (int i = 0; i < cycleLength; i++) {
|
|
|
+ out.writeVLong(in[i]);
|
|
|
+ }
|
|
|
} else {
|
|
|
- // set first two bits to 11 to indicate the block is bit-packed
|
|
|
+ // set 2 trailing bits to indicate the block is bit-packed
|
|
|
out.writeVLong(0b11);
|
|
|
forUtil.encode(in, bitsPerOrd, out);
|
|
|
}
|
|
@@ -232,20 +261,32 @@ public class ES87TSDBDocValuesEncoder {
|
|
|
assert out.length == ES87TSDBDocValuesFormat.NUMERIC_BLOCK_SIZE : out.length;
|
|
|
|
|
|
long v1 = in.readVLong();
|
|
|
- int header = (int) (v1 & 0b11L);
|
|
|
- if (header == 0b00 || header == 0b10) {
|
|
|
- // first bit is zero -> single run
|
|
|
- Arrays.fill(out, v1 >>> 1);
|
|
|
- } else if (header == 0b01) {
|
|
|
- // first two bits are 01 -> two runs
|
|
|
- v1 = v1 >>> 2;
|
|
|
+ int encoding = Long.numberOfTrailingZeros(~v1);
|
|
|
+ v1 >>>= encoding + 1;
|
|
|
+ if (encoding == 0) {
|
|
|
+ // single run
|
|
|
+ Arrays.fill(out, v1);
|
|
|
+ } else if (encoding == 1) {
|
|
|
+ // two runs
|
|
|
int runLen = in.readVInt();
|
|
|
long v2 = v1 + in.readZLong();
|
|
|
Arrays.fill(out, 0, runLen, v1);
|
|
|
Arrays.fill(out, runLen, out.length, v2);
|
|
|
- } else {
|
|
|
- // first two bits are 11 -> bit-packed
|
|
|
+ } else if (encoding == 2) {
|
|
|
+ // bit-packed
|
|
|
forUtil.decode(bitsPerOrd, in, out);
|
|
|
+ } else if (encoding == 3) {
|
|
|
+ // cycle encoding
|
|
|
+ int cycleLength = (int) v1;
|
|
|
+ for (int i = 0; i < cycleLength; i++) {
|
|
|
+ out[i] = in.readVLong();
|
|
|
+ }
|
|
|
+ int length = cycleLength;
|
|
|
+ while (length < out.length) {
|
|
|
+ int copyLength = Math.min(length, out.length - length);
|
|
|
+ System.arraycopy(out, 0, out, length, copyLength);
|
|
|
+ length += copyLength;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|