|  | @@ -24,6 +24,7 @@ import org.elasticsearch.cluster.routing.RoutingTable;
 | 
	
		
			
				|  |  |  import org.elasticsearch.cluster.routing.ShardRouting;
 | 
	
		
			
				|  |  |  import org.elasticsearch.cluster.routing.ShardRoutingState;
 | 
	
		
			
				|  |  |  import org.elasticsearch.cluster.routing.TestShardRouting;
 | 
	
		
			
				|  |  | +import org.elasticsearch.cluster.routing.UnassignedInfo;
 | 
	
		
			
				|  |  |  import org.elasticsearch.cluster.routing.allocation.AllocationService;
 | 
	
		
			
				|  |  |  import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 | 
	
		
			
				|  |  |  import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
 | 
	
	
		
			
				|  | @@ -34,12 +35,14 @@ import org.elasticsearch.cluster.routing.allocation.decider.NodeReplacementAlloc
 | 
	
		
			
				|  |  |  import org.elasticsearch.cluster.routing.allocation.decider.NodeShutdownAllocationDecider;
 | 
	
		
			
				|  |  |  import org.elasticsearch.common.settings.Settings;
 | 
	
		
			
				|  |  |  import org.elasticsearch.common.transport.TransportAddress;
 | 
	
		
			
				|  |  | +import org.elasticsearch.gateway.GatewayAllocator;
 | 
	
		
			
				|  |  |  import org.elasticsearch.index.Index;
 | 
	
		
			
				|  |  |  import org.elasticsearch.index.shard.ShardId;
 | 
	
		
			
				|  |  |  import org.elasticsearch.node.Node;
 | 
	
		
			
				|  |  |  import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
 | 
	
		
			
				|  |  |  import org.elasticsearch.snapshots.SnapshotsInfoService;
 | 
	
		
			
				|  |  |  import org.elasticsearch.test.ESTestCase;
 | 
	
		
			
				|  |  | +import org.elasticsearch.test.gateway.TestGatewayAllocator;
 | 
	
		
			
				|  |  |  import org.hamcrest.Matcher;
 | 
	
		
			
				|  |  |  import org.junit.Before;
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -47,6 +50,7 @@ import java.util.Collections;
 | 
	
		
			
				|  |  |  import java.util.HashMap;
 | 
	
		
			
				|  |  |  import java.util.List;
 | 
	
		
			
				|  |  |  import java.util.Map;
 | 
	
		
			
				|  |  | +import java.util.Set;
 | 
	
		
			
				|  |  |  import java.util.concurrent.atomic.AtomicReference;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  import static org.hamcrest.Matchers.allOf;
 | 
	
	
		
			
				|  | @@ -54,6 +58,9 @@ import static org.hamcrest.Matchers.containsString;
 | 
	
		
			
				|  |  |  import static org.hamcrest.Matchers.equalTo;
 | 
	
		
			
				|  |  |  import static org.hamcrest.Matchers.is;
 | 
	
		
			
				|  |  |  import static org.hamcrest.Matchers.nullValue;
 | 
	
		
			
				|  |  | +import static org.mockito.ArgumentMatchers.anyInt;
 | 
	
		
			
				|  |  | +import static org.mockito.Mockito.doAnswer;
 | 
	
		
			
				|  |  | +import static org.mockito.Mockito.spy;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  public class TransportGetShutdownStatusActionTests extends ESTestCase {
 | 
	
		
			
				|  |  |      public static final String SHUTTING_DOWN_NODE_ID = "node1";
 | 
	
	
		
			
				|  | @@ -122,6 +129,7 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
 | 
	
		
			
				|  |  |              clusterInfoService,
 | 
	
		
			
				|  |  |              snapshotsInfoService
 | 
	
		
			
				|  |  |          );
 | 
	
		
			
				|  |  | +        allocationService.setExistingShardsAllocators(Map.of(GatewayAllocator.ALLOCATOR_NAME, new TestGatewayAllocator()));
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      /**
 | 
	
	
		
			
				|  | @@ -349,6 +357,60 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
 | 
	
		
			
				|  |  |          );
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +    /**
 | 
	
		
			
				|  |  | +     * Ensure we can detect stalled migrations when we have unassigned shards that had the shutting down node as their last known
 | 
	
		
			
				|  |  | +     * node id
 | 
	
		
			
				|  |  | +     */
 | 
	
		
			
				|  |  | +    public void testStalledUnassigned() {
 | 
	
		
			
				|  |  | +        Index index = new Index(randomAlphaOfLength(5), randomAlphaOfLengthBetween(1, 20));
 | 
	
		
			
				|  |  | +        IndexMetadata imd = spy(generateIndexMetadata(index, 3, 0));
 | 
	
		
			
				|  |  | +        // make sure the TestGatewayAllocator stays in sync always, avoid flaky tests
 | 
	
		
			
				|  |  | +        doAnswer(i -> {
 | 
	
		
			
				|  |  | +            if ((Integer) i.getArgument(0) < 2) {
 | 
	
		
			
				|  |  | +                return Set.of(LIVE_NODE_ID);
 | 
	
		
			
				|  |  | +            }
 | 
	
		
			
				|  |  | +            return Set.of(SHUTTING_DOWN_NODE_ID);
 | 
	
		
			
				|  |  | +        }).when(imd).inSyncAllocationIds(anyInt());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        var shard0 = TestShardRouting.newShardRouting(new ShardId(index, 0), LIVE_NODE_ID, true, ShardRoutingState.STARTED);
 | 
	
		
			
				|  |  | +        var shard1 = TestShardRouting.newShardRouting(new ShardId(index, 1), LIVE_NODE_ID, true, ShardRoutingState.STARTED);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // we should stall the node if we find an unassigned shard with lastAllocatedNodeId matching the shutting down node
 | 
	
		
			
				|  |  | +        var unassigned = makeUnassignedShard(index, 2, SHUTTING_DOWN_NODE_ID, true);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        assertShardMigration(
 | 
	
		
			
				|  |  | +            getUnassignedShutdownStatus(index, imd, shard0, shard1, unassigned),
 | 
	
		
			
				|  |  | +            SingleNodeShutdownMetadata.Status.STALLED,
 | 
	
		
			
				|  |  | +            1,
 | 
	
		
			
				|  |  | +            allOf(containsString(index.getName()), containsString("[2] [primary]"))
 | 
	
		
			
				|  |  | +        );
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // if the shard is unassigned, but it's not a primary on this node, we shouldn't stall
 | 
	
		
			
				|  |  | +        var shard2 = TestShardRouting.newShardRouting(new ShardId(index, 2), LIVE_NODE_ID, true, ShardRoutingState.STARTED);
 | 
	
		
			
				|  |  | +        var unassignedReplica = makeUnassignedShard(index, 2, SHUTTING_DOWN_NODE_ID, false);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        var s = getUnassignedShutdownStatus(index, imd, shard0, shard1, shard2, unassignedReplica);
 | 
	
		
			
				|  |  | +        assertShardMigration(s, SingleNodeShutdownMetadata.Status.COMPLETE, 0, nullValue());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // check if we correctly count all of the unassigned shards
 | 
	
		
			
				|  |  | +        var unassigned3 = makeUnassignedShard(index, 3, SHUTTING_DOWN_NODE_ID, true);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        assertShardMigration(
 | 
	
		
			
				|  |  | +            getUnassignedShutdownStatus(index, imd, shard0, shard1, unassigned3, unassigned),
 | 
	
		
			
				|  |  | +            SingleNodeShutdownMetadata.Status.STALLED,
 | 
	
		
			
				|  |  | +            2,
 | 
	
		
			
				|  |  | +            allOf(containsString(index.getName()), containsString("[2] [primary]"))
 | 
	
		
			
				|  |  | +        );
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // check if we correctly walk all of the unassigned shards, shard 2 replica, shard 3 primary
 | 
	
		
			
				|  |  | +        assertShardMigration(
 | 
	
		
			
				|  |  | +            getUnassignedShutdownStatus(index, imd, shard0, shard1, shard2, unassignedReplica, unassigned3),
 | 
	
		
			
				|  |  | +            SingleNodeShutdownMetadata.Status.STALLED,
 | 
	
		
			
				|  |  | +            1,
 | 
	
		
			
				|  |  | +            allOf(containsString(index.getName()), containsString("[3] [primary]"))
 | 
	
		
			
				|  |  | +        );
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |      public void testNotStalledIfAllShardsHaveACopyOnAnotherNode() {
 | 
	
		
			
				|  |  |          Index index = new Index(randomAlphaOfLength(5), randomAlphaOfLengthBetween(1, 20));
 | 
	
		
			
				|  |  |          IndexMetadata imd = generateIndexMetadata(index, 3, 0);
 | 
	
	
		
			
				|  | @@ -571,4 +633,62 @@ public class TransportGetShutdownStatusActionTests extends ESTestCase {
 | 
	
		
			
				|  |  |              .routingTable(indexRoutingTable)
 | 
	
		
			
				|  |  |              .build();
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    private UnassignedInfo makeUnassignedInfo(String nodeId) {
 | 
	
		
			
				|  |  | +        return new UnassignedInfo(
 | 
	
		
			
				|  |  | +            UnassignedInfo.Reason.ALLOCATION_FAILED,
 | 
	
		
			
				|  |  | +            "testing",
 | 
	
		
			
				|  |  | +            null,
 | 
	
		
			
				|  |  | +            1,
 | 
	
		
			
				|  |  | +            System.nanoTime(),
 | 
	
		
			
				|  |  | +            System.currentTimeMillis(),
 | 
	
		
			
				|  |  | +            false,
 | 
	
		
			
				|  |  | +            UnassignedInfo.AllocationStatus.NO_ATTEMPT,
 | 
	
		
			
				|  |  | +            Collections.emptySet(),
 | 
	
		
			
				|  |  | +            nodeId
 | 
	
		
			
				|  |  | +        );
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    private ShardRouting makeUnassignedShard(Index index, int shardId, String nodeId, boolean primary) {
 | 
	
		
			
				|  |  | +        var unsignedInfo = makeUnassignedInfo(nodeId);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        return TestShardRouting.newShardRouting(
 | 
	
		
			
				|  |  | +            new ShardId(index, shardId),
 | 
	
		
			
				|  |  | +            null,
 | 
	
		
			
				|  |  | +            null,
 | 
	
		
			
				|  |  | +            primary,
 | 
	
		
			
				|  |  | +            ShardRoutingState.UNASSIGNED,
 | 
	
		
			
				|  |  | +            unsignedInfo
 | 
	
		
			
				|  |  | +        );
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    private ShutdownShardMigrationStatus getUnassignedShutdownStatus(Index index, IndexMetadata imd, ShardRouting... shards) {
 | 
	
		
			
				|  |  | +        var indexRoutingTableBuilder = IndexRoutingTable.builder(index);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        for (var routing : shards) {
 | 
	
		
			
				|  |  | +            indexRoutingTableBuilder.addShard(routing);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        var indexRoutingTable = indexRoutingTableBuilder.build();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        // Force a decision of NO for all moves and new allocations, simulating a decider that's stuck
 | 
	
		
			
				|  |  | +        canAllocate.set((r, n, a) -> Decision.NO);
 | 
	
		
			
				|  |  | +        // And the remain decider simulates NodeShutdownAllocationDecider
 | 
	
		
			
				|  |  | +        canRemain.set((r, n, a) -> n.nodeId().equals(SHUTTING_DOWN_NODE_ID) ? Decision.NO : Decision.YES);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        RoutingTable.Builder routingTable = RoutingTable.builder();
 | 
	
		
			
				|  |  | +        routingTable.add(indexRoutingTable);
 | 
	
		
			
				|  |  | +        ClusterState state = createTestClusterState(routingTable.build(), List.of(imd), SingleNodeShutdownMetadata.Type.REMOVE);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        return TransportGetShutdownStatusAction.shardMigrationStatus(
 | 
	
		
			
				|  |  | +            state,
 | 
	
		
			
				|  |  | +            SHUTTING_DOWN_NODE_ID,
 | 
	
		
			
				|  |  | +            SingleNodeShutdownMetadata.Type.REMOVE,
 | 
	
		
			
				|  |  | +            true,
 | 
	
		
			
				|  |  | +            clusterInfoService,
 | 
	
		
			
				|  |  | +            snapshotsInfoService,
 | 
	
		
			
				|  |  | +            allocationService,
 | 
	
		
			
				|  |  | +            allocationDeciders
 | 
	
		
			
				|  |  | +        );
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  |  }
 |