Browse Source

Fix Bug Causing Queued Snapshots of Deleted Indices to Never Finalize (#75942)

We have to run the loop checking for completed snapshots if we see an index disappearing.
Otherwise, we never get around to finalizing a queued snapshot stuck after a clone if the
index is deleted during cloning.
Armin Braun 4 years ago
parent
commit
f92ac0e50d

+ 51 - 0
server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java

@@ -1611,6 +1611,57 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
         );
     }
 
+    public void testIndexDeletedWhileSnapshotQueuedAfterClone() throws Exception {
+        final String master = internalCluster().startMasterOnlyNode(LARGE_SNAPSHOT_POOL_SETTINGS);
+        internalCluster().startDataOnlyNode();
+        final String index1 = "index-1";
+        final String index2 = "index-2";
+        createIndexWithContent(index1);
+        createIndexWithContent(index2);
+
+        final String repository = "test-repo";
+        createRepository(repository, "mock");
+
+        final String sourceSnapshot = "source-snapshot";
+        createFullSnapshot(repository, sourceSnapshot);
+
+        final IndexId index1Id = getRepositoryData(repository).resolveIndexId(index1);
+        blockMasterOnShardLevelSnapshotFile(repository, index1Id.getId());
+
+        final String cloneTarget = "target-snapshot";
+        final ActionFuture<AcknowledgedResponse> cloneSnapshot = clusterAdmin().prepareCloneSnapshot(
+            repository,
+            sourceSnapshot,
+            cloneTarget
+        ).setIndices(index1, index2).execute();
+        awaitNumberOfSnapshotsInProgress(1);
+        waitForBlock(master, repository);
+
+        final ActionFuture<CreateSnapshotResponse> snapshot3 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-3")
+            .setIndices(index1, index2)
+            .setWaitForCompletion(true)
+            .setPartial(true)
+            .execute();
+        final ActionFuture<CreateSnapshotResponse> snapshot2 = clusterAdmin().prepareCreateSnapshot(repository, "snapshot-2")
+            .setIndices(index2)
+            .setWaitForCompletion(true)
+            .execute();
+        assertSuccessful(snapshot2);
+        awaitNumberOfSnapshotsInProgress(2);
+        assertFalse(snapshot3.isDone());
+        assertAcked(admin().indices().prepareDelete(index1).get());
+        assertSuccessful(snapshot3);
+        unblockNode(repository, master);
+
+        assertAcked(cloneSnapshot.get());
+        assertAcked(startDeleteSnapshot(repository, cloneTarget).get());
+
+        assertThat(
+            clusterAdmin().prepareSnapshotStatus().setSnapshots("snapshot-2", "snapshot-3").setRepository(repository).get().getSnapshots(),
+            hasSize(2)
+        );
+    }
+
     public void testQueuedAfterFailedShardSnapshot() throws Exception {
         internalCluster().startMasterOnlyNode();
         final String dataNode = internalCluster().startDataOnlyNode();

+ 1 - 0
server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java

@@ -756,6 +756,7 @@ public class SnapshotsInProgress extends AbstractNamedDiffable<Custom> implement
         }
 
         public Index indexByName(String name) {
+            assert isClone() == false : "tried to get routing index for clone entry [" + this + "]";
             return snapshotIndices.get(name);
         }
 

+ 16 - 5
server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

@@ -1229,8 +1229,18 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
                 // this shard snapshot is waiting for a previous snapshot to finish execution for this shard
                 final ShardSnapshotStatus knownFailure = knownFailures.get(shardId);
                 if (knownFailure == null) {
-                    // if no failure is known for the shard we keep waiting
-                    shards.put(shardId, shardStatus);
+                    final IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex());
+                    if (indexShardRoutingTable == null) {
+                        // shard became unassigned while queued so we fail as missing here
+                        assert entry.partial();
+                        snapshotChanged = true;
+                        logger.debug("failing snapshot of shard [{}] because index got deleted", shardId);
+                        shards.put(shardId, ShardSnapshotStatus.MISSING);
+                        knownFailures.put(shardId, ShardSnapshotStatus.MISSING);
+                    } else {
+                        // if no failure is known for the shard we keep waiting
+                        shards.put(shardId, shardStatus);
+                    }
                 } else {
                     // If a failure is known for an execution we waited on for this shard then we fail with the same exception here
                     // as well
@@ -1298,9 +1308,10 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
 
     private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event) {
         for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) {
-            if (entry.state() == State.STARTED) {
+            if (entry.state() == State.STARTED && entry.isClone() == false) {
                 for (ObjectObjectCursor<RepositoryShardId, ShardSnapshotStatus> shardStatus : entry.shardsByRepoShardId()) {
-                    if (shardStatus.value.state() != ShardState.WAITING) {
+                    final ShardState state = shardStatus.value.state();
+                    if (state != ShardState.WAITING && state != ShardState.QUEUED) {
                         continue;
                     }
                     final RepositoryShardId shardId = shardStatus.key;
@@ -1309,7 +1320,7 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
                             .getRoutingTable()
                             .index(entry.indexByName(shardId.indexName()));
                         if (indexShardRoutingTable == null) {
-                            // index got removed concurrently and we have to fail WAITING state shards
+                            // index got removed concurrently and we have to fail WAITING or QUEUED state shards
                             return true;
                         }
                         ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.shardId()).primaryShard();