Przeglądaj źródła

Fix Partial Snapshots Recording Spurious Errors (#69150)

If an index is deleted while a partial snapshot is running the behavior
was not deterministic.
If an index was deleted just as one of its shard snapshots was about to
start then it would be recorded as a shard snapshot failure in the
snapshot result and the snapshot would show up as `PARTIAL`.
If the index delete however happened after the shard had been
snapshotted, then the snapshot would show `SUCCESS`.
In both cases however, the snapshot would contain the exact same data
because the deleted index would become part of the final snapshot.
Also, it was confusing that in the `PARTIAL` case, there would be errors
recorded for shards the indices of which would not be part of the
snapshot.

This commit makes it such that not only are indices filtered from the
list of indices in a snapshot but also from the shard snapshot errors
in a snapshot entry so that the snapshot always shows up as `SUCCESS`
because concurrent index deletes are not a failure but allowed in
partial snapshots.

Closes #69014
Tamara Braun 4 lat temu
rodzic
commit
86de86be32

+ 2 - 2
server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java

@@ -823,8 +823,8 @@ public class ConcurrentSnapshotsIT extends AbstractSnapshotIntegTestCase {
         assertAcked(client().admin().indices().prepareDelete("index-two"));
         unblockNode(repoName, masterNode);
 
-        assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
-        assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.PARTIAL));
+        assertThat(snapshotThree.get().getSnapshotInfo().state(), is(SnapshotState.SUCCESS));
+        assertThat(snapshotFour.get().getSnapshotInfo().state(), is(SnapshotState.SUCCESS));
         assertAcked(deleteFuture.get());
     }
 

+ 25 - 2
server/src/internalClusterTest/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java

@@ -979,11 +979,34 @@ public class DedicatedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTest
 
         awaitNoMoreRunningOperations();
         SnapshotInfo snapshotInfo = getSnapshot(repoName, "test-snap");
-        assertThat(snapshotInfo.state(), equalTo(SnapshotState.PARTIAL));
-        assertThat(snapshotInfo.shardFailures().size(), greaterThan(0));
+        assertThat(snapshotInfo.state(), equalTo(SnapshotState.SUCCESS));
         logger.info("--> done");
     }
 
+    public void testPartialSnapshotsDoNotRecordDeletedShardFailures() throws Exception {
+        internalCluster().startMasterOnlyNodes(1);
+        final String dataNode = internalCluster().startDataOnlyNode();
+        final String repoName = "test-repo";
+        createRepository(repoName, "mock");
+
+        final String firstIndex = "index-one";
+        createIndexWithContent(firstIndex);
+        final String secondIndex = "index-two";
+        createIndexWithContent(secondIndex);
+
+        final String snapshot = "snapshot-one";
+        blockDataNode(repoName, dataNode);
+        final ActionFuture<CreateSnapshotResponse> snapshotResponse = startFullSnapshot(repoName, snapshot, true);
+        waitForBlock(dataNode, repoName);
+
+        assertAcked(client().admin().indices().prepareDelete(firstIndex));
+
+        unblockNode(repoName, dataNode);
+
+        SnapshotInfo snapshotInfo = assertSuccessful(snapshotResponse);
+        assertThat(snapshotInfo.shardFailures(), empty());
+    }
+
     public void testGetReposWithWildcard() {
         internalCluster().startMasterOnlyNode();
         List<RepositoryMetadata> repositoryMetadata = client().admin().cluster().prepareGetRepositories("*").get().repositories();

+ 3 - 2
server/src/internalClusterTest/java/org/elasticsearch/snapshots/SystemIndicesSnapshotIT.java

@@ -815,15 +815,16 @@ public class SystemIndicesSnapshotIT extends AbstractSnapshotIntegTestCase {
         });
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/69014")
     public void testParallelIndexDeleteRemovesFeatureState() throws Exception {
         final String indexToBeDeleted = SystemIndexTestPlugin.SYSTEM_INDEX_NAME;
         final String fullIndexName = AnotherSystemIndexTestPlugin.SYSTEM_INDEX_NAME;
         final String nonsystemIndex = "nonsystem-idx";
 
+        final int nodesInCluster = internalCluster().size();
         // Stop one data node so we only have one data node to start with
         internalCluster().stopNode(dataNodes.get(1));
         dataNodes.remove(1);
+        ensureStableCluster(nodesInCluster - 1);
 
         createRepositoryNoVerify(REPO_NAME, "mock");
 
@@ -865,7 +866,7 @@ public class SystemIndicesSnapshotIT extends AbstractSnapshotIntegTestCase {
         unblockNode(REPO_NAME, dataNodes.get(1));
 
         logger.info("--> Repo unblocked, checking that snapshot finished...");
-        CreateSnapshotResponse createSnapshotResponse = createSnapshotFuture.actionGet();
+        CreateSnapshotResponse createSnapshotResponse = createSnapshotFuture.get();
         logger.info(createSnapshotResponse.toString());
         assertThat(createSnapshotResponse.status(), equalTo(RestStatus.OK));
 

+ 7 - 4
server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

@@ -1254,9 +1254,16 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
             final String failure = entry.failure();
             final Snapshot snapshot = entry.snapshot();
             logger.trace("[{}] finalizing snapshot in repository, state: [{}], failure[{}]", snapshot, entry.state(), failure);
+            final ShardGenerations shardGenerations = buildGenerations(entry, metadata);
+            final List<String> finalIndices = shardGenerations.indices().stream().map(IndexId::getName).collect(Collectors.toList());
+            final Set<String> indexNames = new HashSet<>(finalIndices);
             ArrayList<SnapshotShardFailure> shardFailures = new ArrayList<>();
             for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
                 ShardId shardId = shardStatus.key;
+                if (indexNames.contains(shardId.getIndexName()) == false) {
+                    assert entry.partial() : "only ignoring shard failures for concurrently deleted indices for partial snapshots";
+                    continue;
+                }
                 ShardSnapshotStatus status = shardStatus.value;
                 final ShardState state = status.state();
                 if (state.failed()) {
@@ -1267,7 +1274,6 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
                     assert state == ShardState.SUCCESS;
                 }
             }
-            final ShardGenerations shardGenerations = buildGenerations(entry, metadata);
             final String repository = snapshot.getRepository();
             final StepListener<Metadata> metadataListener = new StepListener<>();
             final Repository repo = repositoriesService.repository(snapshot.getRepository());
@@ -1297,9 +1303,6 @@ public class SnapshotsService extends AbstractLifecycleComponent implements Clus
             }
             metadataListener.whenComplete(meta -> {
                         final Metadata metaForSnapshot = metadataForSnapshot(entry, meta);
-                        final List<String> finalIndices = shardGenerations.indices().stream()
-                            .map(IndexId::getName)
-                            .collect(Collectors.toList());
                         final SnapshotInfo snapshotInfo = new SnapshotInfo(snapshot.getSnapshotId(),
                                 finalIndices,
                                 entry.partial() ? entry.dataStreams().stream()