Browse Source

Consider ShardRouting roles when calculating shard copies in shutdown status (#106063)

Ignoring shard roles can cause the shutdown status to be reported as COMPLETE in stateless deployments when the only other copy of a shard is INDEX_ONLY. This can leave search shards unavailable, which is considered RED health in stateless deployments. With this change, the status is reported as STALLED instead, which prevents Elasticsearch from exiting.

Relates ES-7999
Pooya Salehi 1 year ago
parent
commit
393bdbc0f3

+ 5 - 0
docs/changelog/106063.yaml

@@ -0,0 +1,5 @@
+pr: 106063
+summary: Consider `ShardRouting` roles when calculating shard copies in shutdown status
+area: Infra/Node Lifecycle
+type: bug
+issues: []

+ 1 - 0
x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java

@@ -376,6 +376,7 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
             .allShards(shardRouting.index().getName())
             .stream()
             .filter(sr -> sr.id() == shardRouting.id())
+            .filter(sr -> sr.role().equals(shardRouting.role()))
             // If any shards are both 1) `STARTED` and 2) are not on a node that's shutting down, we have at least one copy
             // of this shard safely on a node that's not shutting down, so we don't want to report `STALLED` because of this shard.
             .filter(ShardRouting::started)

+ 45 - 0
x-pack/plugin/shutdown/src/test/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusActionTests.java

@@ -468,6 +468,51 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
         );
     }
 
+    public void testStalledIfShardCopyOnAnotherNodeHasDifferentRole() { // expects STALLED when the only other copy has a different role
+        Index index = new Index(randomIdentifier(), randomUUID());
+        IndexMetadata imd = generateIndexMetadata(index, 3, 0);
+        IndexRoutingTable indexRoutingTable = IndexRoutingTable.builder(index)
+            .addShard( // shard 0 copy on the live node: STARTED, role INDEX_ONLY
+                new TestShardRouting.Builder(new ShardId(index, 0), LIVE_NODE_ID, true, ShardRoutingState.STARTED).withRole(
+                    ShardRouting.Role.INDEX_ONLY
+                ).build()
+            )
+            .addShard( // shard 0 copy on the shutting-down node: STARTED, role SEARCH_ONLY
+                new TestShardRouting.Builder(new ShardId(index, 0), SHUTTING_DOWN_NODE_ID, false, ShardRoutingState.STARTED).withRole(
+                    ShardRouting.Role.SEARCH_ONLY
+                ).build()
+            )
+            .build();
+
+        // Force a decision of NO for all moves and new allocations, simulating a decider that is stuck
+        canAllocate.set((r, n, a) -> Decision.NO);
+        // The remain decider simulates NodeShutdownAllocationDecider: NO on the shutting-down node, YES on every other node
+        canRemain.set((r, n, a) -> n.nodeId().equals(SHUTTING_DOWN_NODE_ID) ? Decision.NO : Decision.YES);
+
+        RoutingTable.Builder routingTable = RoutingTable.builder();
+        routingTable.add(indexRoutingTable);
+        ClusterState state = createTestClusterState(routingTable.build(), List.of(imd), SingleNodeShutdownMetadata.Type.REMOVE);
+
+        ShutdownShardMigrationStatus status = TransportGetShutdownStatusAction.shardMigrationStatus(
+            new CancellableTask(1, "direct", GetShutdownStatusAction.NAME, "", TaskId.EMPTY_TASK_ID, Map.of()),
+            state,
+            SHUTTING_DOWN_NODE_ID,
+            SingleNodeShutdownMetadata.Type.SIGTERM, // NOTE(review): the state above is built with Type.REMOVE; confirm the mismatch is intended
+            true,
+            clusterInfoService,
+            snapshotsInfoService,
+            allocationService,
+            allocationDeciders
+        );
+
+        assertShardMigration(
+            status,
+            SingleNodeShutdownMetadata.Status.STALLED, // the INDEX_ONLY copy on the live node must not count as a copy of the SEARCH_ONLY shard
+            1, // presumably the expected number of shards still to migrate; confirm against assertShardMigration
+            allOf(containsString(index.getName()), containsString("[0] [replica]")) // explanation must name the index and the stuck copy
+        );
+    }
+
     public void testNotStalledIfAllShardsHaveACopyOnAnotherNode() {
         Index index = new Index(randomAlphaOfLength(5), randomAlphaOfLengthBetween(1, 20));
         IndexMetadata imd = generateIndexMetadata(index, 3, 0);