@@ -52,7 +52,6 @@ import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
 import org.elasticsearch.test.ESIntegTestCase.Scope;
 import org.elasticsearch.test.InternalTestCluster;
 import org.elasticsearch.test.disruption.BlockClusterStateProcessing;
-import org.elasticsearch.test.disruption.SingleNodeDisruption;
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.transport.ConnectTransportException;
 import org.elasticsearch.transport.TransportRequest;
@@ -134,21 +133,7 @@ public class IndicesStoreIntegrationIT extends ESIntegTestCase {
         logger.info("--> move shard from node_1 to node_3, and wait for relocation to finish");
 
         if (randomBoolean()) { // sometimes add cluster-state delay to trigger observers in IndicesStore.ShardActiveRequestHandler
-            SingleNodeDisruption disruption = new BlockClusterStateProcessing(node_3, random());
-            internalCluster().setDisruptionScheme(disruption);
-            MockTransportService transportServiceNode3 = (MockTransportService) internalCluster().getInstance(TransportService.class, node_3);
-            CountDownLatch beginRelocationLatch = new CountDownLatch(1);
-            CountDownLatch endRelocationLatch = new CountDownLatch(1);
-            transportServiceNode3.addTracer(new ReclocationStartEndTracer(logger, beginRelocationLatch, endRelocationLatch));
-            internalCluster().client().admin().cluster().prepareReroute().add(new MoveAllocationCommand("test", 0, node_1, node_3)).get();
-            // wait for relocation to start
-            logger.info("--> waiting for relocation to start");
-            beginRelocationLatch.await();
-            logger.info("--> starting disruption");
-            disruption.startDisrupting();
-            // wait for relocation to finish
-            logger.info("--> waiting for relocation to finish");
-            endRelocationLatch.await();
+            BlockClusterStateProcessing disruption = relocateAndBlockCompletion(logger, "test", 0, node_1, node_3);
             // wait a little so that cluster state observer is registered
             sleep(50);
             logger.info("--> stopping disruption");
@@ -170,6 +155,56 @@ public class IndicesStoreIntegrationIT extends ESIntegTestCase {
 
     }
 
+    /**
+     * Relocates a shard and blocks cluster state processing on the relocation target node, so that the
+     * relocated shard is not activated there until the caller stops the returned disruption.
+     */
+    public static BlockClusterStateProcessing relocateAndBlockCompletion(Logger logger, String index, int shard, String nodeFrom,
+                                                                         String nodeTo) throws InterruptedException {
+        BlockClusterStateProcessing disruption = new BlockClusterStateProcessing(nodeTo, random());
+        internalCluster().setDisruptionScheme(disruption);
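+        // the disruption is only registered here; it does not block anything until startDisrupting() is
+        // called further down, once the relocation has started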
+        MockTransportService transportService = (MockTransportService) internalCluster().getInstance(TransportService.class, nodeTo);
+        ClusterService clusterService = internalCluster().getInstance(ClusterService.class, nodeTo);
+        CountDownLatch beginRelocationLatch = new CountDownLatch(1);
+        CountDownLatch receivedShardExistsRequestLatch = new CountDownLatch(1);
+        // use a tracer on the target node to track relocation start and end
+        transportService.addTracer(new MockTransportService.Tracer() {
+            @Override
+            public void receivedRequest(long requestId, String action) {
+                if (action.equals(PeerRecoveryTargetService.Actions.FILES_INFO)) {
+                    logger.info("received: {}, relocation starts", action);
+                    beginRelocationLatch.countDown();
+                } else if (action.equals(IndicesStore.ACTION_SHARD_EXISTS)) {
+                    // Whenever a node deletes a shard because it was relocated somewhere else, it first
+                    // checks if enough other copies are started somewhere else. The node sends a ShardActiveRequest
+                    // to the other nodes that should have a copy according to cluster state.
+                    receivedShardExistsRequestLatch.countDown();
+                    logger.info("received: {}, relocation done", action);
+                } else if (action.equals(PeerRecoveryTargetService.Actions.WAIT_CLUSTERSTATE)) {
+                    logger.info("received: {}, waiting on cluster state", action);
+                    // ensure that relocation target node is on the same cluster state as relocation source before proceeding with
+                    // this request. If the target does not have the relocating cluster state exposed through ClusterService.state(),
+                    // then waitForClusterState will have to register a ClusterObserver with the ClusterService, which can cause
+                    // a race with the BlockClusterStateProcessing block that is added below.
+                    try {
+                        assertBusy(() -> assertTrue(
+                            clusterService.state().routingTable().index(index).shard(shard).primaryShard().relocating()));
+                    } catch (Exception e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+            }
+        });
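+        // explicitly move the shard from nodeFrom to nodeTo; the tracer registered above observes this relocation
+        // (FILES_INFO marks the start of the recovery on the target, ACTION_SHARD_EXISTS marks its completion)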
+        internalCluster().client().admin().cluster().prepareReroute().add(new MoveAllocationCommand(index, shard, nodeFrom, nodeTo)).get();
+        logger.info("--> waiting for relocation to start");
+        beginRelocationLatch.await();
+        logger.info("--> starting disruption");
+        disruption.startDisrupting();
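+        // from this point on the target node no longer applies cluster state updates, so the relocated shard
+        // will not be marked as started there until the disruption is stopped again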
+        logger.info("--> waiting for relocation to finish");
+        receivedShardExistsRequestLatch.await();
+        logger.info("--> relocation completed (but cluster state processing block still in place)");
+        return disruption;
+    }
+
     /* Test that shard is deleted in case ShardActiveRequest after relocation and next incoming cluster state is an index delete. */
     public void testShardCleanupIfShardDeletionAfterRelocationFailedAndIndexDeleted() throws Exception {
         final String node_1 = internalCluster().startNode();
@@ -449,40 +484,4 @@ public class IndicesStoreIntegrationIT extends ESIntegTestCase {
         awaitBusy(() -> !Files.exists(indexDirectory(server, index)));
         return Files.exists(indexDirectory(server, index));
     }
-
-    /**
-     * This Tracer can be used to signal start and end of a recovery.
-     * This is used to test the following:
-     * Whenever a node deletes a shard because it was relocated somewhere else, it first
-     * checks if enough other copies are started somewhere else. The node sends a ShardActiveRequest
-     * to the other nodes that should have a copy according to cluster state.
-     * The nodes that receive this request check if the shard is in state STARTED in which case they
-     * respond with "true". If they have the shard in POST_RECOVERY they register a cluster state
-     * observer that checks at each update if the shard has moved to STARTED.
-     * To test that this mechanism actually works, this can be triggered by blocking the cluster
-     * state processing when a recover starts and only unblocking it shortly after the node receives
-     * the ShardActiveRequest.
-     */
-    public static class ReclocationStartEndTracer extends MockTransportService.Tracer {
-        private final Logger logger;
-        private final CountDownLatch beginRelocationLatch;
-        private final CountDownLatch receivedShardExistsRequestLatch;
-
-        public ReclocationStartEndTracer(Logger logger, CountDownLatch beginRelocationLatch, CountDownLatch receivedShardExistsRequestLatch) {
-            this.logger = logger;
-            this.beginRelocationLatch = beginRelocationLatch;
-            this.receivedShardExistsRequestLatch = receivedShardExistsRequestLatch;
-        }
-
-        @Override
-        public void receivedRequest(long requestId, String action) {
-            if (action.equals(PeerRecoveryTargetService.Actions.FILES_INFO)) {
-                logger.info("received: {}, relocation starts", action);
-                beginRelocationLatch.countDown();
-            } else if (action.equals(IndicesStore.ACTION_SHARD_EXISTS)) {
-                receivedShardExistsRequestLatch.countDown();
-                logger.info("received: {}, relocation done", action);
-            }
-        }
-    }
 }