Browse Source

Add allocation explain output for THROTTLING shards (#109563)

Normally the THROTTLING status should be temporary; however, we discovered a case where the node was not moving shards for an extended period of time (30 minutes) due to a single throttling shard. This change adds an allocation explanation when there are no relocating shards and the remaining ones are throttled, so that the cause of the throttling can be addressed.
Ievgen Degtiarenko 1 year ago
parent
commit
ea2b33a7bc

+ 5 - 0
docs/changelog/109563.yaml

@@ -0,0 +1,5 @@
+pr: 109563
+summary: Add allocation explain output for THROTTLING shards
+area: Infra/Core
+type: enhancement
+issues: []

+ 23 - 0
server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java

@@ -89,6 +89,25 @@ public class ShutdownShardMigrationStatus implements Writeable, ChunkedToXConten
         );
     }
 
+    public ShutdownShardMigrationStatus(
+        SingleNodeShutdownMetadata.Status status,
+        long startedShards,
+        long relocatingShards,
+        long initializingShards,
+        @Nullable String explanation,
+        @Nullable ShardAllocationDecision allocationDecision
+    ) {
+        this(
+            status,
+            startedShards,
+            relocatingShards,
+            initializingShards,
+            startedShards + relocatingShards + initializingShards,
+            explanation,
+            allocationDecision
+        );
+    }
+
     private ShutdownShardMigrationStatus(
         SingleNodeShutdownMetadata.Status status,
         long startedShards,
@@ -140,6 +159,10 @@ public class ShutdownShardMigrationStatus implements Writeable, ChunkedToXConten
         return status;
     }
 
+    public ShardAllocationDecision getAllocationDecision() {
+        return allocationDecision;
+    }
+
     @Override
     public Iterator<? extends ToXContent> toXContentChunked(ToXContent.Params params) {
         return Iterators.concat(

+ 62 - 24
x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java

@@ -49,7 +49,6 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
-import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
@@ -254,11 +253,20 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
         int totalRemainingShards = relocatingShards + startedShards + initializingShards;
 
         // If there's relocating shards, or no shards on this node, we'll just use the number of shards left to move
-        if (relocatingShards > 0 || totalRemainingShards == 0) {
-            SingleNodeShutdownMetadata.Status shardStatus = totalRemainingShards == 0
-                ? SingleNodeShutdownMetadata.Status.COMPLETE
-                : SingleNodeShutdownMetadata.Status.IN_PROGRESS;
-            return new ShutdownShardMigrationStatus(shardStatus, startedShards, relocatingShards, initializingShards);
+        if (totalRemainingShards == 0) {
+            return new ShutdownShardMigrationStatus(
+                SingleNodeShutdownMetadata.Status.COMPLETE,
+                startedShards,
+                relocatingShards,
+                initializingShards
+            );
+        } else if (relocatingShards > 0) {
+            return new ShutdownShardMigrationStatus(
+                SingleNodeShutdownMetadata.Status.IN_PROGRESS,
+                startedShards,
+                relocatingShards,
+                initializingShards
+            );
         } else if (initializingShards > 0 && relocatingShards == 0 && startedShards == 0) {
             // If there's only initializing shards left, return now with a note that only initializing shards are left
             return new ShutdownShardMigrationStatus(
@@ -270,11 +278,8 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
             );
         }
 
-        // If there's no relocating shards and shards still on this node, we need to figure out why
-        AtomicInteger shardsToIgnoreForFinalStatus = new AtomicInteger(0);
-
-        // Explain shard allocations until we find one that can't move, then stop (as `findFirst` short-circuits)
-        Optional<Tuple<ShardRouting, ShardAllocationDecision>> unmovableShard = currentState.getRoutingNodes()
+        // Get all shard explanations
+        var unmovableShards = currentState.getRoutingNodes()
             .node(nodeId)
             .shardsWithState(ShardRoutingState.STARTED)
             .peek(s -> cancellableTask.ensureNotCancelled())
@@ -285,10 +290,16 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
                     : "shard [" + pair + "] can remain on node [" + nodeId + "], but that node is shutting down";
                 return pair.v2().getMoveDecision().canRemain() == false;
             })
-            // It's okay if some are throttled, they'll move eventually
-            .filter(pair -> pair.v2().getMoveDecision().getAllocationDecision().equals(AllocationDecision.THROTTLED) == false)
             // These shards will move as soon as possible
             .filter(pair -> pair.v2().getMoveDecision().getAllocationDecision().equals(AllocationDecision.YES) == false)
+            .toList();
+
+        // If there's no relocating shards and shards still on this node, we need to figure out why
+        AtomicInteger shardsToIgnoreForFinalStatus = new AtomicInteger(0);
+
+        // Find the first shard that cannot move permanently (i.e. blocked for a reason other than throttling)
+        var unmovableShard = unmovableShards.stream()
+            .filter(pair -> pair.v2().getMoveDecision().getAllocationDecision().equals(AllocationDecision.THROTTLED) == false)
             // If the shard that can't move is on every node in the cluster, we shouldn't be `STALLED` on it.
             .filter(pair -> {
                 final boolean hasShardCopyOnOtherNode = hasShardCopyOnAnotherNode(currentState, pair.v1(), shuttingDownNodes);
@@ -312,6 +323,10 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
             )
             .findFirst();
 
+        var temporarilyUnmovableShards = unmovableShards.stream()
+            .filter(pair -> pair.v2().getMoveDecision().getAllocationDecision().equals(AllocationDecision.THROTTLED))
+            .toList();
+
         if (totalRemainingShards == shardsToIgnoreForFinalStatus.get() && unmovableShard.isEmpty()) {
             return new ShutdownShardMigrationStatus(
                 SingleNodeShutdownMetadata.Status.COMPLETE,
@@ -338,14 +353,38 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
                 ),
                 decision
             );
-        } else {
-            return new ShutdownShardMigrationStatus(
-                SingleNodeShutdownMetadata.Status.IN_PROGRESS,
-                startedShards,
-                relocatingShards,
-                initializingShards
-            );
-        }
+        } else if (relocatingShards == 0
+            && initializingShards == 0
+            && startedShards > 0
+            && temporarilyUnmovableShards.size() == startedShards) {
+                // All remaining started shards are only temporarily blocked (throttled);
+                // report one so that the cause of the throttling can be addressed if it is taking significant time
+                ShardRouting shardRouting = temporarilyUnmovableShards.get(0).v1();
+                ShardAllocationDecision decision = temporarilyUnmovableShards.get(0).v2();
+
+                return new ShutdownShardMigrationStatus(
+                    SingleNodeShutdownMetadata.Status.IN_PROGRESS,
+                    startedShards,
+                    relocatingShards,
+                    initializingShards,
+                    format(
+                        "shard [%s] [%s] of index [%s] is waiting to be moved, see [%s] "
+                            + "for details or use the cluster allocation explain API",
+                        shardRouting.shardId().getId(),
+                        shardRouting.primary() ? "primary" : "replica",
+                        shardRouting.index().getName(),
+                        NODE_ALLOCATION_DECISION_KEY
+                    ),
+                    decision
+                );
+            } else {
+                return new ShutdownShardMigrationStatus(
+                    SingleNodeShutdownMetadata.Status.IN_PROGRESS,
+                    startedShards,
+                    relocatingShards,
+                    initializingShards
+                );
+            }
     }
 
     private static boolean isIlmRestrictingShardMovement(ClusterState currentState, ShardRouting pair) {
@@ -373,9 +412,8 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
 
     private static boolean hasShardCopyOnAnotherNode(ClusterState clusterState, ShardRouting shardRouting, Set<String> shuttingDownNodes) {
         return clusterState.routingTable()
-            .allShards(shardRouting.index().getName())
-            .stream()
-            .filter(sr -> sr.id() == shardRouting.id())
+            .shardRoutingTable(shardRouting.shardId())
+            .allShards()
             .filter(sr -> sr.role().equals(shardRouting.role()))
             // If any shards are both 1) `STARTED` and 2) are not on a node that's shutting down, we have at least one copy
             // of this shard safely on a node that's not shutting down, so we don't want to report `STALLED` because of this shard.

+ 46 - 1
x-pack/plugin/shutdown/src/test/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusActionTests.java

@@ -28,6 +28,7 @@ import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.routing.TestShardRouting;
 import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
+import org.elasticsearch.cluster.routing.allocation.Explanations;
 import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
 import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecider;
@@ -74,6 +75,7 @@ import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;
 import static org.mockito.ArgumentMatchers.anyInt;
 import static org.mockito.Mockito.doAnswer;
@@ -410,7 +412,7 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
             status,
             SingleNodeShutdownMetadata.Status.STALLED,
             1,
-            allOf(containsString(index.getName()), containsString("[2] [primary]"))
+            allOf(containsString(index.getName()), containsString("[2] [primary]"), containsString("cannot move"))
         );
     }
 
@@ -645,6 +647,49 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
         assertShardMigration(status, SingleNodeShutdownMetadata.Status.NOT_STARTED, 0, is("node is not currently part of the cluster"));
     }
 
+    public void testExplainThrottled() {
+        Index index = new Index(randomAlphaOfLength(5), randomAlphaOfLengthBetween(1, 20));
+        IndexMetadata imd = generateIndexMetadata(index, 3, 0);
+        IndexRoutingTable indexRoutingTable = IndexRoutingTable.builder(index)
+            .addShard(TestShardRouting.newShardRouting(new ShardId(index, 0), LIVE_NODE_ID, true, ShardRoutingState.INITIALIZING))
+            .addShard(TestShardRouting.newShardRouting(new ShardId(index, 1), LIVE_NODE_ID, true, ShardRoutingState.INITIALIZING))
+            .addShard(TestShardRouting.newShardRouting(new ShardId(index, 2), SHUTTING_DOWN_NODE_ID, true, ShardRoutingState.STARTED))
+            .build();
+
+        RoutingTable.Builder routingTable = RoutingTable.builder();
+        routingTable.add(indexRoutingTable);
+        ClusterState state = createTestClusterState(routingTable.build(), List.of(imd), SingleNodeShutdownMetadata.Type.REMOVE);
+
+        // LIVE_NODE_ID can not accept the remaining shard as it is temporarily initializing 2 other shards
+        canAllocate.set((r, n, a) -> n.nodeId().equals(LIVE_NODE_ID) ? Decision.THROTTLE : Decision.NO);
+        // And the remain decider simulates NodeShutdownAllocationDecider
+        canRemain.set((r, n, a) -> n.nodeId().equals(SHUTTING_DOWN_NODE_ID) ? Decision.NO : Decision.YES);
+
+        ShutdownShardMigrationStatus status = TransportGetShutdownStatusAction.shardMigrationStatus(
+            new CancellableTask(1, "direct", GetShutdownStatusAction.NAME, "", TaskId.EMPTY_TASK_ID, Map.of()),
+            state,
+            SHUTTING_DOWN_NODE_ID,
+            SingleNodeShutdownMetadata.Type.REMOVE,
+            true,
+            clusterInfoService,
+            snapshotsInfoService,
+            allocationService,
+            allocationDeciders
+        );
+
+        assertShardMigration(
+            status,
+            SingleNodeShutdownMetadata.Status.IN_PROGRESS,
+            1,
+            allOf(containsString(index.getName()), containsString("[2] [primary]"), containsString("is waiting to be moved"))
+        );
+        var explain = status.getAllocationDecision();
+        assertThat(explain, notNullValue());
+        assertThat(explain.getAllocateDecision().isDecisionTaken(), is(false));
+        assertThat(explain.getMoveDecision().isDecisionTaken(), is(true));
+        assertThat(explain.getMoveDecision().getExplanation(), equalTo(Explanations.Move.THROTTLED));
+    }
+
     public void testIlmShrinkingIndexAvoidsStall() {
         LifecycleExecutionState executionState = LifecycleExecutionState.builder()
             .setAction(ShrinkAction.NAME)