@@ -1062,6 +1062,63 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
         awaitNoMoreRunningOperations();
     }

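+    // Tests a master failover while the current master is blocked deleting a stale data blob: the concurrent snapshot
+    // should still complete on the new master, and the stale master should be able to resume deleting blobs once unblocked.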
+    public void testMasterFailoverDuringStaleIndicesCleanup() throws Exception {
+        internalCluster().startMasterOnlyNodes(3);
+        final String dataNode = internalCluster().startDataOnlyNode();
+        final String repoName = "test-repo";
+        createRepository(repoName, "mock");
+        createFullSnapshot(repoName, "empty-snapshot");
+        // use a few more shards to make the master take a little longer to clean up the stale index and to simulate more concurrency
+        // between the snapshot create and delete below
+        createIndexWithContent("index-test", indexSettingsNoReplicas(randomIntBetween(6, 10)).build());
+        final NetworkDisruption networkDisruption = isolateMasterDisruption(NetworkDisruption.DISCONNECT);
+        internalCluster().setDisruptionScheme(networkDisruption);
+
+        final List<String> fullSnapshotsToDelete = createNSnapshots(repoName, randomIntBetween(1, 5));
+        final String masterName = internalCluster().getMasterName();
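+        // block the master on the repository's data blobs so the snapshot delete below gets stuck while cleaning up stale data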
+        blockMasterOnAnyDataFile(repoName);
+        final ActionFuture<AcknowledgedResponse> deleteAllSnapshotsWithIndex = startDeleteSnapshots(
+            repoName,
+            fullSnapshotsToDelete,
+            masterName
+        );
+
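+        // also start a snapshot, going through the data node client since the data node stays on the majority side of the partition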
+        final ActionFuture<CreateSnapshotResponse> snapshotFuture = startFullSnapshotFromDataNode(repoName, "new-full-snapshot");
+        waitForBlock(masterName, repoName);
+        awaitNDeletionsInProgress(1);
+        awaitNumberOfSnapshotsInProgress(1);
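+        // cut off the current master; the remaining three nodes form a stable cluster around a newly elected master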
+        networkDisruption.startDisrupting();
+        ensureStableCluster(3, dataNode);
+        // wait for the snapshot to finish while the isolated master is stuck on deleting a data blob
+        try {
+            snapshotFuture.get();
+        } catch (Exception e) {
+            // ignore exceptions here; the snapshot itself works out fine in all cases but the API call might throw because of the
+            // master failover during the snapshot
+            // TODO: remove this leniency once we fix the API to handle master failover more cleanly
+        }
+        awaitNoMoreRunningOperations(dataNode);
+
+        // now unblock the stale master and have it continue deleting blobs from the repository
+        unblockNode(repoName, masterName);
+
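+        // heal the partition and wait for the old master to rejoin as the fourth node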
+        networkDisruption.stopDisrupting();
+        ensureStableCluster(4);
+        try {
+            deleteAllSnapshotsWithIndex.get();
+        } catch (Exception ignored) {
+            // ignored because the failover here can surface all kinds of errors; for now we just make sure the future completes
+            // in all cases
+            // TODO: remove this leniency once we fix the API to handle master failover more cleanly
+        }
+    }
+
     public void testStatusMultipleSnapshotsMultipleRepos() throws Exception {
         internalCluster().startMasterOnlyNode();
         // We're blocking some of the snapshot threads when we block the first repo below, so we have to make sure we have enough threads
@@ -1975,6 +2032,17 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
             .execute();
     }

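+    // same as startFullSnapshotFromMasterClient but issues the request through a data node client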
+    private ActionFuture<CreateSnapshotResponse> startFullSnapshotFromDataNode(String repoName, String snapshotName) {
+        logger.info("--> creating full snapshot [{}] to repo [{}] from data node client", snapshotName, repoName);
+        return internalCluster().dataNodeClient()
+            .admin()
+            .cluster()
+            .prepareCreateSnapshot(repoName, snapshotName)
+            .setWaitForCompletion(true)
+            .execute();
+    }
+
     private ActionFuture<CreateSnapshotResponse> startFullSnapshotFromMasterClient(String repoName, String snapshotName) {
         logger.info("--> creating full snapshot [{}] to repo [{}] from master client", snapshotName, repoName);
         return internalCluster().masterClient()