
Fix Incorrect Concurrent SnapshotException on Master Failover (#54877)

If we run into an INIT-state snapshot that the current master didn't create, it will be removed from the cluster state anyway.
=> no need to have it block another snapshot from starting.
This has practical relevance: on master failover after snapshot INIT but before START, the create snapshot request is retried by the client (since it's a transport master node action) and would needlessly fail with an unexpected ConcurrentSnapshotExecutionException, which is confusing to the user because the snapshot clearly no longer exists.

This also allowed making two disruption-style tests stricter.
Armin Braun · 5 years ago · commit 93c6d771ee

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java (+5, -1)

@@ -185,7 +185,11 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
                         "cannot snapshot while a repository cleanup is in-progress in [" + repositoryCleanupInProgress + "]");
                 }
                 SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
-                if (snapshots != null && snapshots.entries().isEmpty() == false) {
+                // Fail if there are any concurrently running snapshots. The only exception to this being a snapshot in INIT state from a
+                // previous master that we can simply ignore and remove from the cluster state because we would clean it up from the
+                // cluster state anyway in #applyClusterState.
+                if (snapshots != null && snapshots.entries().stream().anyMatch(entry ->
+                    (entry.state() == State.INIT && initializingSnapshots.contains(entry.snapshot()) == false) == false)) {
                     throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, " a snapshot is already running");
                 }
                 // Store newSnapshot here to be processed in clusterStateProcessed
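
The double negation in the new stream predicate is easiest to read when factored out. A minimal sketch of the equivalent logic, with a hypothetical helper name invented for illustration (initializingSnapshots is the existing SnapshotsService field, a Set<Snapshot> tracking snapshots this master node itself is initializing; other types are as in the surrounding class):

    // Hypothetical refactoring of the inlined predicate above, for clarity only.
    // An in-progress entry blocks a new snapshot unless it is a stale INIT entry
    // left behind by a previous master, which #applyClusterState removes anyway.
    private static boolean blocksNewSnapshots(SnapshotsInProgress.Entry entry,
                                              Set<Snapshot> initializingSnapshots) {
        final boolean staleInitFromPreviousMaster =
            entry.state() == State.INIT && initializingSnapshots.contains(entry.snapshot()) == false;
        return staleInitFromPreviousMaster == false;
    }

With this helper the check would read snapshots.entries().stream().anyMatch(entry -> blocksNewSnapshots(entry, initializingSnapshots)), i.e. a new snapshot is only rejected if some entry other than a stale INIT leftover is actually running.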

server/src/test/java/org/elasticsearch/discovery/SnapshotDisruptionIT.java (+1, -12)

@@ -34,7 +34,6 @@ import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.plugins.Plugin;
-import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
 import org.elasticsearch.snapshots.SnapshotException;
 import org.elasticsearch.snapshots.SnapshotInfo;
 import org.elasticsearch.snapshots.SnapshotMissingException;
@@ -149,17 +148,7 @@ public class SnapshotDisruptionIT extends ESIntegTestCase {
         ensureStableCluster(4, masterNode1);
         logger.info("--> done");
 
-        try {
-            future.get();
-        } catch (Exception ex) {
-            Throwable cause = ex.getCause();
-            if (cause.getCause() instanceof ConcurrentSnapshotExecutionException) {
-                logger.info("--> got exception from race in master operation retries");
-            } else {
-                logger.info("--> got exception from hanged master", ex);
-            }
-        }
-
+        future.get();
         assertAllSnapshotsCompleted();
     }
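
With the lenient catch gone, the test is stricter: the create-snapshot future must now complete normally instead of being allowed to fail with a ConcurrentSnapshotExecutionException from master retries. A hypothetical further assertion (not part of this commit, and assuming the request was issued with waitForCompletion(true) so the response carries a SnapshotInfo) could inspect the result directly:

    // Hypothetical stricter follow-up, sketch only: check the snapshot outcome
    // rather than merely requiring that no exception was thrown.
    final CreateSnapshotResponse response = future.get();
    assertThat(response.getSnapshotInfo().state(), is(SnapshotState.SUCCESS));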
 

server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java (+9, -5)

@@ -716,6 +716,8 @@ public class SnapshotResiliencyTests extends ESTestCase {
         continueOrDie(createRepoAndIndex(repoName, index, shards),
             createIndexResponse -> client().admin().cluster().state(new ClusterStateRequest(), clusterStateResponseStepListener));
 
+        final StepListener<CreateSnapshotResponse> snapshotStartedListener = new StepListener<>();
+
         continueOrDie(clusterStateResponseStepListener, clusterStateResponse -> {
             final ShardRouting shardToRelocate = clusterStateResponse.getState().routingTable().allShards(index).get(0);
             final TestClusterNodes.TestClusterNode currentPrimaryNode = testClusterNodes.nodeById(shardToRelocate.currentNodeId());
@@ -734,11 +736,7 @@ public class SnapshotResiliencyTests extends ESTestCase {
                                 scheduleNow(() -> testClusterNodes.stopNode(masterNode));
                             }
                             testClusterNodes.randomDataNodeSafe().client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
-                                .execute(ActionListener.wrap(() -> {
-                                    createdSnapshot.set(true);
-                                    testClusterNodes.randomDataNodeSafe().client.admin().cluster().deleteSnapshot(
-                                        new DeleteSnapshotRequest(repoName, snapshotName), noopListener());
-                                }));
+                                .execute(snapshotStartedListener);
                             scheduleNow(
                                 () -> testClusterNodes.randomMasterNodeSafe().client.admin().cluster().reroute(
                                     new ClusterRerouteRequest().add(new AllocateEmptyPrimaryAllocationCommand(
@@ -751,6 +749,12 @@ public class SnapshotResiliencyTests extends ESTestCase {
             });
         });
 
+        continueOrDie(snapshotStartedListener, snapshotResponse -> {
+            createdSnapshot.set(true);
+            testClusterNodes.randomDataNodeSafe().client.admin().cluster().deleteSnapshot(
+                new DeleteSnapshotRequest(repoName, snapshotName), noopListener());
+        });
+
         runUntil(() -> testClusterNodes.randomMasterNode().map(master -> {
             if (createdSnapshot.get() == false) {
                 return false;
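
The refactoring above replaces the inline ActionListener.wrap callback with a StepListener, so the follow-up delete runs as an explicit, separately chained step. Notably, ActionListener.wrap(Runnable) runs its body on response and on failure alike, while continueOrDie (the existing helper in this test class) fails the test if the step completes exceptionally, which is part of what makes the test stricter. A minimal sketch of the pattern, with client calls mirroring the hunk above (client, repoName, snapshotName, createdSnapshot and noopListener() are the surrounding test fixtures):

    // Step 1: declare the listener up front so a later step can chain on it.
    final StepListener<CreateSnapshotResponse> snapshotStartedListener = new StepListener<>();

    // Step 2: the async action completes the listener instead of embedding
    // its follow-up logic inline.
    client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
        .execute(snapshotStartedListener);

    // Step 3: the follow-up runs only once the create request has actually
    // returned; continueOrDie fails the test on any exception.
    continueOrDie(snapshotStartedListener, snapshotResponse -> {
        createdSnapshot.set(true);
        client.admin().cluster().deleteSnapshot(
            new DeleteSnapshotRequest(repoName, snapshotName), noopListener());
    });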