Browse Source

Distinguish no shutdowns case in NodeShutdownAllocationDecider (#89851)

Today the `NodeShutdownAllocationDecider` explains its `YES` decision as
`this node is not shutting down` whether there are other node shutdowns
ongoing or not. It'd be useful to distinguish these cases, so this
commit refines the explanation to do so.

Closes #89823
David Turner 3 years ago
parent
commit
984d225ff9

+ 6 - 0
docs/changelog/89851.yaml

@@ -0,0 +1,6 @@
+pr: 89851
+summary: Distinguish no shutdowns case in `NodeShutdownAllocationDecider`
+area: Infra/Node Lifecycle
+type: enhancement
+issues:
+ - 89823

+ 18 - 33
server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDecider.java

@@ -27,32 +27,15 @@ public class NodeShutdownAllocationDecider extends AllocationDecider {
 
     private static final String NAME = "node_shutdown";
 
+    private static final Decision YES_EMPTY_SHUTDOWN_METADATA = Decision.single(Decision.Type.YES, NAME, "no nodes are shutting down");
+    private static final Decision YES_NODE_NOT_SHUTTING_DOWN = Decision.single(Decision.Type.YES, NAME, "this node is not shutting down");
+
     /**
      * Determines if a shard can be allocated to a particular node, based on whether that node is shutting down or not.
      */
     @Override
     public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
-        final SingleNodeShutdownMetadata thisNodeShutdownMetadata = allocation.nodeShutdowns().get(node.nodeId());
-
-        if (thisNodeShutdownMetadata == null) {
-            // There's no shutdown metadata for this node, return yes.
-            return allocation.decision(Decision.YES, NAME, "this node is not currently shutting down");
-        }
-
-        return switch (thisNodeShutdownMetadata.getType()) {
-            case REPLACE, REMOVE -> allocation.decision(
-                Decision.NO,
-                NAME,
-                "node [%s] is preparing to be removed from the cluster",
-                node.nodeId()
-            );
-            case RESTART -> allocation.decision(
-                Decision.YES,
-                NAME,
-                "node [%s] is preparing to restart, but will remain in the cluster",
-                node.nodeId()
-            );
-        };
+        return getDecision(allocation, node.nodeId());
     }
 
     /**
@@ -61,7 +44,7 @@ public class NodeShutdownAllocationDecider extends AllocationDecider {
      */
     @Override
     public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
-        return this.canAllocate(shardRouting, node, allocation);
+        return canAllocate(shardRouting, node, allocation);
     }
 
     /**
@@ -70,26 +53,28 @@ public class NodeShutdownAllocationDecider extends AllocationDecider {
      */
     @Override
     public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNode node, RoutingAllocation allocation) {
-        SingleNodeShutdownMetadata thisNodeShutdownMetadata = allocation.nodeShutdowns().get(node.getId());
+        return getDecision(allocation, node.getId());
+    }
+
+    private static Decision getDecision(RoutingAllocation allocation, String nodeId) {
+        final var nodeShutdowns = allocation.nodeShutdowns();
+        if (nodeShutdowns.isEmpty()) {
+            return YES_EMPTY_SHUTDOWN_METADATA;
+        }
 
+        final SingleNodeShutdownMetadata thisNodeShutdownMetadata = nodeShutdowns.get(nodeId);
         if (thisNodeShutdownMetadata == null) {
-            return allocation.decision(Decision.YES, NAME, "node [%s] is not preparing for removal from the cluster", node.getId());
+            return YES_NODE_NOT_SHUTTING_DOWN;
         }
 
         return switch (thisNodeShutdownMetadata.getType()) {
+            case REPLACE, REMOVE -> allocation.decision(Decision.NO, NAME, "node [%s] is preparing to be removed from the cluster", nodeId);
             case RESTART -> allocation.decision(
                 Decision.YES,
                 NAME,
-                "node [%s] is not preparing for removal from the cluster (is " + "restarting)",
-                node.getId()
-            );
-            case REPLACE, REMOVE -> allocation.decision(
-                Decision.NO,
-                NAME,
-                "node [%s] is preparing for removal from the cluster",
-                node.getId()
+                "node [%s] is preparing to restart, but will remain in the cluster",
+                nodeId
             );
         };
     }
-
 }

+ 25 - 6
server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/NodeShutdownAllocationDeciderTests.java

@@ -48,7 +48,7 @@ public class NodeShutdownAllocationDeciderTests extends ESAllocationTestCase {
         new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "index created")
     );
     private final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
-    private NodeShutdownAllocationDecider decider = new NodeShutdownAllocationDecider();
+    private final NodeShutdownAllocationDecider decider = new NodeShutdownAllocationDecider();
     private final AllocationDeciders allocationDeciders = new AllocationDeciders(
         Arrays.asList(
             decider,
@@ -151,11 +151,11 @@ public class NodeShutdownAllocationDeciderTests extends ESAllocationTestCase {
         assertThat(decision.type(), equalTo(Decision.Type.YES));
         assertThat(
             decision.getExplanation(),
-            equalTo("node [" + DATA_NODE.getId() + "] is not preparing for removal from the cluster (is restarting)")
+            equalTo("node [" + DATA_NODE.getId() + "] is preparing to restart, but will remain in the cluster")
         );
     }
 
-    public void testCanAutoExpandToNodeThatIsNotShuttingDown() {
+    public void testCanAutoExpandToNodeIfNoNodesShuttingDown() {
         ClusterState state = service.reroute(ClusterState.EMPTY_STATE, "initial state");
 
         RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0);
@@ -163,7 +163,22 @@ public class NodeShutdownAllocationDeciderTests extends ESAllocationTestCase {
 
         Decision decision = decider.shouldAutoExpandToNode(indexMetadata, DATA_NODE, allocation);
         assertThat(decision.type(), equalTo(Decision.Type.YES));
-        assertThat(decision.getExplanation(), equalTo("node [" + DATA_NODE.getId() + "] is not preparing for removal from the cluster"));
+        assertThat(decision.getExplanation(), equalTo("no nodes are shutting down"));
+    }
+
+    public void testCanAutoExpandToNodeThatIsNotShuttingDown() {
+        ClusterState state = prepareState(
+            service.reroute(ClusterState.EMPTY_STATE, "initial state"),
+            randomFrom(SingleNodeShutdownMetadata.Type.REMOVE, SingleNodeShutdownMetadata.Type.REPLACE),
+            "other-node-id"
+        );
+
+        RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, state, null, null, 0);
+        allocation.debugDecision(true);
+
+        Decision decision = decider.shouldAutoExpandToNode(indexMetadata, DATA_NODE, allocation);
+        assertThat(decision.type(), equalTo(Decision.Type.YES));
+        assertThat(decision.getExplanation(), equalTo("this node is not shutting down"));
     }
 
     public void testCannotAutoExpandToRemovingNode() {
@@ -176,13 +191,17 @@ public class NodeShutdownAllocationDeciderTests extends ESAllocationTestCase {
 
         Decision decision = decider.shouldAutoExpandToNode(indexMetadata, DATA_NODE, allocation);
         assertThat(decision.type(), equalTo(Decision.Type.NO));
-        assertThat(decision.getExplanation(), equalTo("node [" + DATA_NODE.getId() + "] is preparing for removal from the cluster"));
+        assertThat(decision.getExplanation(), equalTo("node [" + DATA_NODE.getId() + "] is preparing to be removed from the cluster"));
     }
 
     private ClusterState prepareState(ClusterState initialState, SingleNodeShutdownMetadata.Type shutdownType) {
+        return prepareState(initialState, shutdownType, DATA_NODE.getId());
+    }
+
+    private ClusterState prepareState(ClusterState initialState, SingleNodeShutdownMetadata.Type shutdownType, String nodeId) {
         final String targetNodeName = shutdownType == SingleNodeShutdownMetadata.Type.REPLACE ? randomAlphaOfLengthBetween(10, 20) : null;
         final SingleNodeShutdownMetadata nodeShutdownMetadata = SingleNodeShutdownMetadata.builder()
-            .setNodeId(DATA_NODE.getId())
+            .setNodeId(nodeId)
             .setType(shutdownType)
             .setReason(this.getTestName())
             .setStartedAtMillis(1L)