Browse Source

Fix testSnapshotCanceledOnRemovedShard (#65096)

This test failed a couple of times recently when cluster state (CS) application on the master was slow.
The failures happen because the data node can block before the master even gets around to applying
the cluster state that contains the started snapshot, which causes all the logic around
failing the shard to run too early.
Also, we shouldn't just fire off the create snapshot request in the background without making
sure that it actually returns. Otherwise, we may leak the in-progress snapshot for a tiny window of time
when it has been physically written to the repo but not yet removed from the cluster state, and so fail the repo
cleanup after the test.
Armin Braun 5 years ago
parent
commit
bc9be520da

+ 3 - 4
server/src/internalClusterTest/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java

@@ -1561,12 +1561,11 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
         String blockedNode = blockNodeWithIndex(repo, index);
 
         logger.info("--> snapshot");
-        clusterAdmin().prepareCreateSnapshot(repo, snapshot)
-            .setWaitForCompletion(false)
-            .execute();
+        final ActionFuture<CreateSnapshotResponse> snapshotFuture = startFullSnapshot(repo, snapshot);
 
         logger.info("--> waiting for block to kick in on node [{}]", blockedNode);
         waitForBlock(blockedNode, repo);
+        awaitNumberOfSnapshotsInProgress(1);
 
         logger.info("--> removing primary shard that is being snapshotted");
         ClusterState clusterState = internalCluster().clusterService(internalCluster().getMasterName()).state();
@@ -1582,7 +1581,7 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
 
         logger.info("--> ensuring snapshot is aborted and the aborted shard was marked as failed");
         awaitNoMoreRunningOperations();
-        SnapshotInfo snapshotInfo = getSnapshot(repo, snapshot);
+        SnapshotInfo snapshotInfo = snapshotFuture.get().getSnapshotInfo();
         assertEquals(1, snapshotInfo.shardFailures().size());
         assertEquals(0, snapshotInfo.shardFailures().get(0).shardId());
         assertThat(snapshotInfo.shardFailures().get(0).reason(), is("aborted"));