Browse Source

Re-structure document ID generation favoring `_id` inverted index compression (#104683) (#116810)

This implementation restructures auto-generated document IDs to maximize compression within Lucene's terms dictionary. The key insight is placing stable or slowly-changing components at the start of the ID - the most significant bytes of the timestamp change very gradually (the first byte shifts only every 35 years, the second every 50 days). This careful ordering means that large sequences of IDs generated close in time will share common prefixes, allowing Lucene's Finite State Transducer (FST) to store terms more compactly.

To maintain uniqueness while preserving these compression benefits, the ID combines three elements: a timestamp that ensures time-based ordering, the coordinator's MAC address for cluster-wide uniqueness, and a sequence number for handling high-throughput scenarios. The timestamp handling is particularly robust, using atomic operations to prevent backwards movement even if the system clock shifts.

For high-volume indices generating millions of documents, this optimization can lead to substantial storage savings while maintaining strict guarantees about ID uniqueness and ordering.
Salvatore Campagna 11 tháng trước cách đây
mục cha
commit
f0740f3550

+ 5 - 0
docs/changelog/104683.yaml

@@ -0,0 +1,5 @@
+pr: 104683
+summary: "Feature: re-structure document ID generation favoring _id inverted index compression"
+area: Logs
+type: enhancement
+issues: []

+ 5 - 1
server/src/internalClusterTest/java/org/elasticsearch/action/bulk/BulkIntegrationIT.java

@@ -104,7 +104,11 @@ public class BulkIntegrationIT extends ESIntegTestCase {
     // allowing the auto-generated timestamp to externally be set would allow making the index inconsistent with duplicate docs
     public void testExternallySetAutoGeneratedTimestamp() {
         IndexRequest indexRequest = new IndexRequest("index1").source(Collections.singletonMap("foo", "baz"));
-        indexRequest.autoGenerateId();
+        if (randomBoolean()) {
+            indexRequest.autoGenerateId();
+        } else {
+            indexRequest.autoGenerateTimeBasedId();
+        }
         if (randomBoolean()) {
             indexRequest.id("test");
         }

+ 23 - 6
server/src/main/java/org/elasticsearch/action/index/IndexRequest.java

@@ -51,6 +51,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
+import java.util.function.Supplier;
 
 import static org.elasticsearch.action.ValidateActions.addValidationError;
 import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_PRIMARY_TERM;
@@ -76,6 +77,9 @@ public class IndexRequest extends ReplicatedWriteRequest<IndexRequest> implement
     private static final long SHALLOW_SIZE = RamUsageEstimator.shallowSizeOfInstance(IndexRequest.class);
     private static final TransportVersion PIPELINES_HAVE_RUN_FIELD_ADDED = TransportVersions.V_8_10_X;
 
+    private static final Supplier<String> ID_GENERATOR = UUIDs::base64UUID;
+    private static final Supplier<String> K_SORTED_TIME_BASED_ID_GENERATOR = UUIDs::base64TimeBasedKOrderedUUID;
+
     /**
      * Max length of the source document to include into string()
      *
@@ -692,10 +696,18 @@ public class IndexRequest extends ReplicatedWriteRequest<IndexRequest> implement
      * request compatible with the append-only optimization.
      */
     public void autoGenerateId() {
-        assert id == null;
-        assert autoGeneratedTimestamp == UNSET_AUTO_GENERATED_TIMESTAMP : "timestamp has already been generated!";
-        assert ifSeqNo == UNASSIGNED_SEQ_NO;
-        assert ifPrimaryTerm == UNASSIGNED_PRIMARY_TERM;
+        assertBeforeGeneratingId();
+        autoGenerateTimestamp();
+        id(ID_GENERATOR.get());
+    }
+
+    public void autoGenerateTimeBasedId() {
+        assertBeforeGeneratingId();
+        autoGenerateTimestamp();
+        id(K_SORTED_TIME_BASED_ID_GENERATOR.get());
+    }
+
+    private void autoGenerateTimestamp() {
         /*
          * Set the auto generated timestamp so the append only optimization
          * can quickly test if this request *must* be unique without reaching
@@ -704,8 +716,13 @@ public class IndexRequest extends ReplicatedWriteRequest<IndexRequest> implement
          * never work before 1970, but that's ok. It's after 1970.
          */
         autoGeneratedTimestamp = Math.max(0, System.currentTimeMillis());
-        String uid = UUIDs.base64UUID();
-        id(uid);
+    }
+
+    private void assertBeforeGeneratingId() {
+        assert id == null;
+        assert autoGeneratedTimestamp == UNSET_AUTO_GENERATED_TIMESTAMP : "timestamp has already been generated!";
+        assert ifSeqNo == UNASSIGNED_SEQ_NO;
+        assert ifPrimaryTerm == UNASSIGNED_PRIMARY_TERM;
     }
 
     /**

+ 11 - 1
server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java

@@ -23,6 +23,8 @@ import org.elasticsearch.common.util.ByteUtils;
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.core.Nullable;
 import org.elasticsearch.features.NodeFeature;
+import org.elasticsearch.index.IndexMode;
+import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.mapper.TimeSeriesRoutingHashFieldMapper;
 import org.elasticsearch.transport.Transports;
@@ -146,11 +148,15 @@ public abstract class IndexRouting {
 
     private abstract static class IdAndRoutingOnly extends IndexRouting {
         private final boolean routingRequired;
+        private final IndexVersion creationVersion;
+        private final IndexMode indexMode;
 
         IdAndRoutingOnly(IndexMetadata metadata) {
             super(metadata);
+            this.creationVersion = metadata.getCreationVersion();
             MappingMetadata mapping = metadata.mapping();
             this.routingRequired = mapping == null ? false : mapping.routingRequired();
+            this.indexMode = metadata.getIndexMode();
         }
 
         protected abstract int shardId(String id, @Nullable String routing);
@@ -160,7 +166,11 @@ public abstract class IndexRouting {
             // generate id if not already provided
             final String id = indexRequest.id();
             if (id == null) {
-                indexRequest.autoGenerateId();
+                if (creationVersion.onOrAfter(IndexVersions.TIME_BASED_K_ORDERED_DOC_ID_BACKPORT) && indexMode == IndexMode.LOGSDB) {
+                    indexRequest.autoGenerateTimeBasedId();
+                } else {
+                    indexRequest.autoGenerateId();
+                }
             } else if (id.isEmpty()) {
                 throw new IllegalArgumentException("if _id is specified it must not be empty");
             }

+ 73 - 0
server/src/main/java/org/elasticsearch/common/TimeBasedKOrderedUUIDGenerator.java

@@ -0,0 +1,73 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.common;
+
+import java.nio.ByteBuffer;
+import java.util.Base64;
+
+/**
+ * Generates a base64-encoded, k-ordered UUID string optimized for compression and efficient indexing.
+ * <p>
+ * This method produces a time-based UUID where slowly changing components like the timestamp appear first,
+ * improving prefix-sharing and compression during indexing. It ensures uniqueness across nodes by incorporating
+ * a timestamp, a MAC address, and a sequence ID.
+ * <p>
+ * <b>Timestamp:</b> Represents the current time in milliseconds, ensuring ordering and uniqueness.
+ * <br>
+ * <b>MAC Address:</b> Ensures uniqueness across different coordinators.
+ * <br>
+ * <b>Sequence ID:</b> Differentiates UUIDs generated within the same millisecond, ensuring uniqueness even at high throughput.
+ * <p>
+ * The result is a compact base64-encoded string, optimized for efficient compression of the _id field in an inverted index.
+ */
+public class TimeBasedKOrderedUUIDGenerator extends TimeBasedUUIDGenerator {
+    private static final Base64.Encoder BASE_64_NO_PADDING = Base64.getEncoder().withoutPadding();
+
+    @Override
+    public String getBase64UUID() {
+        final int sequenceId = this.sequenceNumber.incrementAndGet() & 0x00FF_FFFF;
+
+        // Calculate timestamp to ensure ordering and avoid backward movement in case of time shifts.
+        // Uses AtomicLong to guarantee that timestamp increases even if the system clock moves backward.
+        // If the sequenceId overflows (reaches 0 within the same millisecond), the timestamp is incremented
+        // to ensure strict ordering.
+        long timestamp = this.lastTimestamp.accumulateAndGet(
+            currentTimeMillis(),
+            sequenceId == 0 ? (lastTimestamp, currentTimeMillis) -> Math.max(lastTimestamp, currentTimeMillis) + 1 : Math::max
+        );
+
+        final byte[] uuidBytes = new byte[15];
+        final ByteBuffer buffer = ByteBuffer.wrap(uuidBytes);
+
+        buffer.put((byte) (timestamp >>> 40)); // changes every 35 years
+        buffer.put((byte) (timestamp >>> 32)); // changes every ~50 days
+        buffer.put((byte) (timestamp >>> 24)); // changes every ~4.5h
+        buffer.put((byte) (timestamp >>> 16)); // changes every ~65 secs
+
+        // MAC address of the coordinator might change if there are many coordinators in the cluster
+        // and the indexing api does not necessarily target the same coordinator.
+        byte[] macAddress = macAddress();
+        assert macAddress.length == 6;
+        buffer.put(macAddress, 0, macAddress.length);
+
+        buffer.put((byte) (sequenceId >>> 16));
+
+        // From hereinafter everything is almost like random and does not compress well
+        // due to unlikely prefix-sharing
+        buffer.put((byte) (timestamp >>> 8));
+        buffer.put((byte) (sequenceId >>> 8));
+        buffer.put((byte) timestamp);
+        buffer.put((byte) sequenceId);
+
+        assert buffer.position() == uuidBytes.length;
+
+        return BASE_64_NO_PADDING.encodeToString(uuidBytes);
+    }
+}

+ 2 - 2
server/src/main/java/org/elasticsearch/common/TimeBasedUUIDGenerator.java

@@ -25,10 +25,10 @@ class TimeBasedUUIDGenerator implements UUIDGenerator {
 
     // We only use bottom 3 bytes for the sequence number. Paranoia: init with random int so that if JVM/OS/machine goes down, clock slips
     // backwards, and JVM comes back up, we are less likely to be on the same sequenceNumber at the same time:
-    private final AtomicInteger sequenceNumber = new AtomicInteger(SecureRandomHolder.INSTANCE.nextInt());
+    protected final AtomicInteger sequenceNumber = new AtomicInteger(SecureRandomHolder.INSTANCE.nextInt());
 
     // Used to ensure clock moves forward:
-    private final AtomicLong lastTimestamp = new AtomicLong(0);
+    protected final AtomicLong lastTimestamp = new AtomicLong(0);
 
     private static final byte[] SECURE_MUNGED_ADDRESS = MacAddressProvider.getSecureMungedAddress();
 

+ 10 - 0
server/src/main/java/org/elasticsearch/common/UUIDs.java

@@ -16,6 +16,8 @@ import java.util.Random;
 public class UUIDs {
 
     private static final RandomBasedUUIDGenerator RANDOM_UUID_GENERATOR = new RandomBasedUUIDGenerator();
+
+    private static final UUIDGenerator TIME_BASED_K_ORDERED_GENERATOR = new TimeBasedKOrderedUUIDGenerator();
     private static final UUIDGenerator TIME_UUID_GENERATOR = new TimeBasedUUIDGenerator();
 
     /**
@@ -33,6 +35,14 @@ public class UUIDs {
         return TIME_UUID_GENERATOR.getBase64UUID();
     }
 
+    public static String base64TimeBasedKOrderedUUID() {
+        return TIME_BASED_K_ORDERED_GENERATOR.getBase64UUID();
+    }
+
+    public static String base64TimeBasedUUID() {
+        return TIME_UUID_GENERATOR.getBase64UUID();
+    }
+
     /**
      * The length of a UUID string generated by {@link #randomBase64UUID} and {@link #randomBase64UUIDSecureString}.
      */

+ 1 - 0
server/src/main/java/org/elasticsearch/index/IndexVersions.java

@@ -120,6 +120,7 @@ public class IndexVersions {
     public static final IndexVersion ENABLE_IGNORE_ABOVE_LOGSDB = def(8_517_00_0, Version.LUCENE_9_12_0);
     public static final IndexVersion ADD_ROLE_MAPPING_CLEANUP_MIGRATION = def(8_518_00_0, Version.LUCENE_9_12_0);
     public static final IndexVersion LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT_BACKPORT = def(8_519_00_0, Version.LUCENE_9_12_0);
+    public static final IndexVersion TIME_BASED_K_ORDERED_DOC_ID_BACKPORT = def(8_520_00_0, Version.LUCENE_9_12_0);
     /*
      * STOP! READ THIS FIRST! No, really,
      *        ____ _____ ___  ____  _        ____  _____    _    ____    _____ _   _ ___ ____    _____ ___ ____  ____ _____ _

+ 6 - 0
server/src/test/java/org/elasticsearch/action/index/IndexRequestTests.java

@@ -128,6 +128,12 @@ public class IndexRequestTests extends ESTestCase {
         assertTrue("expected > 0 but got: " + request.getAutoGeneratedTimestamp(), request.getAutoGeneratedTimestamp() > 0);
     }
 
+    public void testAutoGenerateTimeBasedId() {
+        IndexRequest request = new IndexRequest("index");
+        request.autoGenerateTimeBasedId();
+        assertTrue("expected > 0 but got: " + request.getAutoGeneratedTimestamp(), request.getAutoGeneratedTimestamp() > 0);
+    }
+
     public void testIndexResponse() {
         ShardId shardId = new ShardId(randomAlphaOfLengthBetween(3, 10), randomAlphaOfLengthBetween(3, 10), randomIntBetween(0, 1000));
         String id = randomAlphaOfLengthBetween(3, 10);

+ 89 - 45
server/src/test/java/org/elasticsearch/common/UUIDTests.java

@@ -35,6 +35,7 @@ public class UUIDTests extends ESTestCase {
 
     static UUIDGenerator timeUUIDGen = new TimeBasedUUIDGenerator();
     static UUIDGenerator randomUUIDGen = new RandomBasedUUIDGenerator();
+    static UUIDGenerator kOrderedUUIDGen = new TimeBasedKOrderedUUIDGenerator();
 
     public void testRandomUUID() {
         verifyUUIDSet(100000, randomUUIDGen);
@@ -44,14 +45,49 @@ public class UUIDTests extends ESTestCase {
         verifyUUIDSet(100000, timeUUIDGen);
     }
 
-    public void testThreadedTimeUUID() {
-        testUUIDThreaded(timeUUIDGen);
+    public void testKOrderedUUID() {
+        verifyUUIDSet(100000, kOrderedUUIDGen);
     }
 
     public void testThreadedRandomUUID() {
         testUUIDThreaded(randomUUIDGen);
     }
 
+    public void testThreadedTimeUUID() {
+        testUUIDThreaded(timeUUIDGen);
+    }
+
+    public void testThreadedKOrderedUUID() {
+        testUUIDThreaded(kOrderedUUIDGen);
+    }
+
+    public void testCompression() throws Exception {
+        Logger logger = LogManager.getLogger(UUIDTests.class);
+
+        assertThat(testCompression(timeUUIDGen, 100000, 10000, 3, logger), Matchers.lessThan(14d));
+        assertThat(testCompression(timeUUIDGen, 100000, 1000, 3, logger), Matchers.lessThan(15d));
+        assertThat(testCompression(timeUUIDGen, 100000, 100, 3, logger), Matchers.lessThan(21d));
+
+        assertThat(testCompression(kOrderedUUIDGen, 100000, 10000, 3, logger), Matchers.lessThan(13d));
+        assertThat(testCompression(kOrderedUUIDGen, 100000, 1000, 3, logger), Matchers.lessThan(14d));
+        assertThat(testCompression(kOrderedUUIDGen, 100000, 100, 3, logger), Matchers.lessThan(19d));
+    }
+
+    public void testComparativeCompression() throws Exception {
+        Logger logger = LogManager.getLogger(UUIDTests.class);
+
+        int numDocs = 100000;
+        int docsPerSecond = 1000;
+        int nodes = 3;
+
+        double randomCompression = testCompression(randomUUIDGen, numDocs, docsPerSecond, nodes, logger);
+        double baseCompression = testCompression(timeUUIDGen, numDocs, docsPerSecond, nodes, logger);
+        double kOrderedCompression = testCompression(kOrderedUUIDGen, numDocs, docsPerSecond, nodes, logger);
+
+        assertThat(kOrderedCompression, Matchers.lessThanOrEqualTo(baseCompression));
+        assertThat(kOrderedCompression, Matchers.lessThanOrEqualTo(randomCompression));
+    }
+
     Set<String> verifyUUIDSet(int count, UUIDGenerator uuidSource) {
         HashSet<String> uuidSet = new HashSet<>();
         for (int i = 0; i < count; ++i) {
@@ -109,49 +145,62 @@ public class UUIDTests extends ESTestCase {
         assertEquals(count * uuids, globalSet.size());
     }
 
-    public void testCompression() throws Exception {
-        Logger logger = LogManager.getLogger(UUIDTests.class);
-        // Low number so that the test runs quickly, but the results are more interesting with larger numbers
-        // of indexed documents
-        assertThat(testCompression(100000, 10000, 3, logger), Matchers.lessThan(14d)); // ~12 in practice
-        assertThat(testCompression(100000, 1000, 3, logger), Matchers.lessThan(15d)); // ~13 in practice
-        assertThat(testCompression(100000, 100, 3, logger), Matchers.lessThan(21d)); // ~20 in practice
-    }
-
-    private static double testCompression(int numDocs, int numDocsPerSecond, int numNodes, Logger logger) throws Exception {
-        final double intervalBetweenDocs = 1000. / numDocsPerSecond; // milliseconds
+    private static double testCompression(final UUIDGenerator generator, int numDocs, int numDocsPerSecond, int numNodes, Logger logger)
+        throws Exception {
+        final double intervalBetweenDocs = 1000. / numDocsPerSecond;
         final byte[][] macAddresses = new byte[numNodes][];
         Random r = random();
         for (int i = 0; i < macAddresses.length; ++i) {
             macAddresses[i] = new byte[6];
             random().nextBytes(macAddresses[i]);
         }
-        UUIDGenerator generator = new TimeBasedUUIDGenerator() {
-            double currentTimeMillis = TestUtil.nextLong(random(), 0L, 10000000000L);
 
-            @Override
-            protected long currentTimeMillis() {
-                currentTimeMillis += intervalBetweenDocs * 2 * r.nextDouble();
-                return (long) currentTimeMillis;
+        UUIDGenerator uuidSource = generator;
+        if (generator instanceof TimeBasedUUIDGenerator) {
+            if (generator instanceof TimeBasedKOrderedUUIDGenerator) {
+                uuidSource = new TimeBasedKOrderedUUIDGenerator() {
+                    double currentTimeMillis = TestUtil.nextLong(random(), 0L, 10000000000L);
+
+                    @Override
+                    protected long currentTimeMillis() {
+                        currentTimeMillis += intervalBetweenDocs * 2 * r.nextDouble();
+                        return (long) currentTimeMillis;
+                    }
+
+                    @Override
+                    protected byte[] macAddress() {
+                        return RandomPicks.randomFrom(r, macAddresses);
+                    }
+                };
+            } else {
+                uuidSource = new TimeBasedUUIDGenerator() {
+                    double currentTimeMillis = TestUtil.nextLong(random(), 0L, 10000000000L);
+
+                    @Override
+                    protected long currentTimeMillis() {
+                        currentTimeMillis += intervalBetweenDocs * 2 * r.nextDouble();
+                        return (long) currentTimeMillis;
+                    }
+
+                    @Override
+                    protected byte[] macAddress() {
+                        return RandomPicks.randomFrom(r, macAddresses);
+                    }
+                };
             }
+        }
 
-            @Override
-            protected byte[] macAddress() {
-                return RandomPicks.randomFrom(r, macAddresses);
-            }
-        };
-        // Avoid randomization which will slow down things without improving
-        // the quality of this test
         Directory dir = newFSDirectory(createTempDir());
         IndexWriterConfig config = new IndexWriterConfig().setCodec(Codec.forName(Lucene.LATEST_CODEC))
-            .setMergeScheduler(new SerialMergeScheduler()); // for reproducibility
+            .setMergeScheduler(new SerialMergeScheduler());
+
         IndexWriter w = new IndexWriter(dir, config);
         Document doc = new Document();
         StringField id = new StringField("_id", "", Store.NO);
         doc.add(id);
         long start = System.nanoTime();
         for (int i = 0; i < numDocs; ++i) {
-            id.setStringValue(generator.getBase64UUID());
+            id.setStringValue(uuidSource.getBase64UUID());
             w.addDocument(doc);
         }
         w.forceMerge(1);
@@ -164,30 +213,25 @@ public class UUIDTests extends ESTestCase {
         dir.close();
         double bytesPerDoc = (double) size / numDocs;
         logger.info(
-            numDocs
-                + " docs indexed at "
-                + numDocsPerSecond
-                + " docs/s required "
-                + ByteSizeValue.ofBytes(size)
-                + " bytes of disk space, or "
-                + bytesPerDoc
-                + " bytes per document. Took: "
-                + new TimeValue(time)
-                + "."
+            "{} - {} docs indexed at {} docs/s required {} bytes of disk space, or {} bytes per document. Took: {}.",
+            uuidSource.getClass().getSimpleName(),
+            numDocs,
+            numDocsPerSecond,
+            ByteSizeValue.ofBytes(size),
+            bytesPerDoc,
+            new TimeValue(time)
         );
         return bytesPerDoc;
     }
 
     public void testStringLength() {
         assertEquals(UUIDs.RANDOM_BASED_UUID_STRING_LENGTH, getUnpaddedBase64StringLength(RandomBasedUUIDGenerator.SIZE_IN_BYTES));
-        assertEquals(UUIDs.RANDOM_BASED_UUID_STRING_LENGTH, UUIDs.randomBase64UUID().length());
-        assertEquals(UUIDs.RANDOM_BASED_UUID_STRING_LENGTH, UUIDs.randomBase64UUID(random()).length());
-        try (var secureString = UUIDs.randomBase64UUIDSecureString()) {
-            assertEquals(UUIDs.RANDOM_BASED_UUID_STRING_LENGTH, secureString.toString().length());
-        }
-
         assertEquals(UUIDs.TIME_BASED_UUID_STRING_LENGTH, getUnpaddedBase64StringLength(TimeBasedUUIDGenerator.SIZE_IN_BYTES));
-        assertEquals(UUIDs.TIME_BASED_UUID_STRING_LENGTH, UUIDs.base64UUID().length());
+        assertEquals(UUIDs.TIME_BASED_UUID_STRING_LENGTH, getUnpaddedBase64StringLength(TimeBasedKOrderedUUIDGenerator.SIZE_IN_BYTES));
+
+        assertEquals(UUIDs.RANDOM_BASED_UUID_STRING_LENGTH, randomUUIDGen.getBase64UUID().length());
+        assertEquals(UUIDs.TIME_BASED_UUID_STRING_LENGTH, timeUUIDGen.getBase64UUID().length());
+        assertEquals(UUIDs.TIME_BASED_UUID_STRING_LENGTH, kOrderedUUIDGen.getBase64UUID().length());
     }
 
     private static int getUnpaddedBase64StringLength(int sizeInBytes) {