瀏覽代碼

Ignore disk usage of the frozen indices when balancing (#112241)

This change takes into account ignoreDiskWatermarks (essentially frozen index marker) when calculating the index balancing threshold. Previously this flag was not used possibly causing balancing to exit early, leaving the frozen tear in a less balanced state than desired (having more shards on certain nodes or multiple shards of the same index on the same node). This is especially noticeable not when allocating new cold indices but when balancing a cluster with some number of previously assigned frozen shards.
Ievgen Degtiarenko 1 年之前
父節點
當前提交
9bd49f01dd

+ 5 - 0
server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java

@@ -413,6 +413,11 @@ public class BalancedShardsAllocator implements ShardsAllocator {
 
         private float maxShardSizeBytes(String index) {
             final var indexMetadata = metadata.index(index);
+            if (indexMetadata.ignoreDiskWatermarks()) {
+                // disk watermarks are ignored for partial searchable snapshots
+                // and is equivalent to indexMetadata.isPartialSearchableSnapshot()
+                return 0;
+            }
             var maxShardSizeBytes = indexMetadata.getForecastedShardSizeInBytes().orElse(0L);
             for (int shard = 0; shard < indexMetadata.getNumberOfShards(); shard++) {
                 final var shardId = new ShardId(indexMetadata.getIndex(), shard);

+ 29 - 10
server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java

@@ -116,7 +116,7 @@ public class BalancedShardsAllocatorTests extends ESAllocationTestCase {
         );
 
         var clusterState = applyStartedShardsUntilNoChange(
-            stateWithStartedIndices(
+            createStateWithIndices(
                 anIndex("heavy-index").indexWriteLoadForecast(8.0),
                 anIndex("light-index-1").indexWriteLoadForecast(1.0),
                 anIndex("light-index-2").indexWriteLoadForecast(2.0),
@@ -151,7 +151,7 @@ public class BalancedShardsAllocatorTests extends ESAllocationTestCase {
         var allocationService = createAllocationService(WITH_DISK_BALANCING);
 
         var clusterState = applyStartedShardsUntilNoChange(
-            stateWithStartedIndices(
+            createStateWithIndices(
                 anIndex("heavy-index").shardSizeInBytesForecast(ByteSizeValue.ofGb(8).getBytes()),
                 anIndex("light-index-1").shardSizeInBytesForecast(ByteSizeValue.ofGb(1).getBytes()),
                 anIndex("light-index-2").shardSizeInBytesForecast(ByteSizeValue.ofGb(2).getBytes()),
@@ -196,7 +196,7 @@ public class BalancedShardsAllocatorTests extends ESAllocationTestCase {
         );
 
         var clusterState = applyStartedShardsUntilNoChange(
-            stateWithStartedIndices(
+            createStateWithIndices(
                 anIndex("heavy-index"),
                 anIndex("light-index-1"),
                 anIndex("light-index-2"),
@@ -232,7 +232,7 @@ public class BalancedShardsAllocatorTests extends ESAllocationTestCase {
         );
 
         var clusterState = applyStartedShardsUntilNoChange(
-            stateWithStartedIndices(
+            createStateWithIndices(
                 anIndex("heavy-index"),// size is set in cluster info
                 anIndex("light-index-1").shardSizeInBytesForecast(ByteSizeValue.ofGb(1).getBytes()),
                 anIndex("light-index-2").shardSizeInBytesForecast(ByteSizeValue.ofGb(2).getBytes()),
@@ -268,7 +268,7 @@ public class BalancedShardsAllocatorTests extends ESAllocationTestCase {
             .put(SETTING_IGNORE_DISK_WATERMARKS.getKey(), true);
 
         var clusterState = applyStartedShardsUntilNoChange(
-            stateWithStartedIndices(
+            createStateWithIndices(
                 anIndex("frozen-index-1", partialSearchableSnapshotSettings).shardSizeInBytesForecast(ByteSizeValue.ofGb(1).getBytes()),
                 anIndex("frozen-index-2", partialSearchableSnapshotSettings).shardSizeInBytesForecast(ByteSizeValue.ofGb(1).getBytes()),
                 anIndex("frozen-index-3", partialSearchableSnapshotSettings).shardSizeInBytesForecast(ByteSizeValue.ofGb(1).getBytes()),
@@ -536,13 +536,32 @@ public class BalancedShardsAllocatorTests extends ESAllocationTestCase {
         return IndexMetadata.builder(name).settings(settings);
     }
 
-    private static ClusterState stateWithStartedIndices(IndexMetadata.Builder... indices) {
+    private static ClusterState createStateWithIndices(IndexMetadata.Builder... indexMetadataBuilders) {
         var metadataBuilder = Metadata.builder();
         var routingTableBuilder = RoutingTable.builder(TestShardRoutingRoleStrategies.DEFAULT_ROLE_ONLY);
-        for (var index : indices) {
-            var build = index.build();
-            metadataBuilder.put(build, false);
-            routingTableBuilder.addAsNew(build);
+        if (randomBoolean()) {
+            // allocate all shards from scratch
+            for (var index : indexMetadataBuilders) {
+                var indexMetadata = index.build();
+                metadataBuilder.put(indexMetadata, false);
+                routingTableBuilder.addAsNew(indexMetadata);
+            }
+        } else {
+            // ensure unbalanced cluster cloud be properly balanced
+            // simulates a case when we add a second node and ensure shards could be evenly spread across all available nodes
+            for (var index : indexMetadataBuilders) {
+                var inSyncId = UUIDs.randomBase64UUID();
+                var indexMetadata = index.putInSyncAllocationIds(0, Set.of(inSyncId)).build();
+                metadataBuilder.put(indexMetadata, false);
+                routingTableBuilder.add(
+                    IndexRoutingTable.builder(indexMetadata.getIndex())
+                        .addShard(
+                            shardRoutingBuilder(new ShardId(indexMetadata.getIndex(), 0), "node-1", true, ShardRoutingState.STARTED)
+                                .withAllocationId(AllocationId.newInitializing(inSyncId))
+                                .build()
+                        )
+                );
+            }
         }
 
         return ClusterState.builder(ClusterName.DEFAULT)