
Manage retention of failed snapshots in SLM (#47617)

Failed snapshots will eventually build up unless they are deleted. While
failures may not take up much space, they add noise to the list of
snapshots and it's desirable to remove them when they are no longer
useful.

With this change, failed snapshots are deleted using the following
strategy: `FAILED` snapshots are kept until the configured
`expire_after` period has passed, if one is present, and are then
deleted. If the retention policy has no `expire_after` configured, a
failed snapshot is deleted once at least one more recent successful
snapshot from the same policy exists (otherwise it is kept, as it may
still be useful for troubleshooting). Failed snapshots are not counted
towards either `min_count` or `max_count`.
Gordon Brown, 6 years ago
parent
commit e221f8632f
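
To illustrate the rules described above, here is a minimal sketch of how they surface through `SnapshotRetentionConfiguration`, mirroring `testFailuresDeletedIfNoExpiryAndMoreRecentSuccessExists` in the unit tests further down; the `makeInfo`/`makeFailureInfo` helpers are the test helpers shown in that file, not production API:

    // Fragment mirroring SnapshotRetentionConfigurationTests below.
    // No expire_after, min_count = 2, max_count = 5, "now" pinned to 1 ms.
    SnapshotRetentionConfiguration conf =
        new SnapshotRetentionConfiguration(() -> 1, null, 2, 5);

    // Three successful snapshots plus one FAILED snapshot that is older than
    // the newest success.
    SnapshotInfo s1 = makeInfo(1);
    SnapshotInfo s2 = makeInfo(2);
    SnapshotInfo s3 = makeFailureInfo(3);
    SnapshotInfo s4 = makeInfo(4);
    List<SnapshotInfo> snapshots = Arrays.asList(s1, s2, s3, s4);

    Predicate<SnapshotInfo> shouldDelete = conf.getSnapshotDeletionPredicate(snapshots);
    shouldDelete.test(s3); // true:  FAILED, no expire_after, newer success (s4) exists
    shouldDelete.test(s1); // false: successful snapshots stay within the min/max limits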

+ 45 - 19
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/slm/SnapshotRetentionConfiguration.java

@@ -20,6 +20,7 @@ import org.elasticsearch.common.xcontent.ToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.snapshots.SnapshotInfo;
+import org.elasticsearch.snapshots.SnapshotState;
 
 import java.io.IOException;
 import java.util.Comparator;
@@ -29,6 +30,7 @@ import java.util.Set;
 import java.util.function.LongSupplier;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 public class SnapshotRetentionConfiguration implements ToXContentObject, Writeable {
 
@@ -113,33 +115,48 @@ public class SnapshotRetentionConfiguration implements ToXContentObject, Writeab
      * @param allSnapshots a list of all snapshots pertaining to this SLM policy and repository
      */
     public Predicate<SnapshotInfo> getSnapshotDeletionPredicate(final List<SnapshotInfo> allSnapshots) {
-        final int snapCount = allSnapshots.size();
-        List<SnapshotInfo> sortedSnapshots = allSnapshots.stream()
+        final int totalSnapshotCount = allSnapshots.size();
+        final List<SnapshotInfo> sortedSnapshots = allSnapshots.stream()
             .sorted(Comparator.comparingLong(SnapshotInfo::startTime))
             .collect(Collectors.toList());
+        final long successfulSnapshotCount = allSnapshots.stream()
+            .filter(snap -> SnapshotState.SUCCESS.equals(snap.state()))
+            .count();
+        final long newestSuccessfulTimestamp = allSnapshots.stream()
+            .filter(snap -> SnapshotState.SUCCESS.equals(snap.state()))
+            .mapToLong(SnapshotInfo::startTime)
+            .max()
+            .orElse(Long.MIN_VALUE);
 
         return si -> {
             final String snapName = si.snapshotId().getName();
 
-            // First, enforce the maximum count, if the size is over the maximum number of
+            // First, if there's no expire_after and a more recent successful snapshot, we can delete all the failed ones
+            if (this.expireAfter == null && SnapshotState.FAILED.equals(si.state()) && newestSuccessfulTimestamp > si.startTime()) {
+                // There's no expire_after and there's a more recent successful snapshot, delete this failed one
+                logger.trace("[{}]: ELIGIBLE as it is FAILED and there is a more recent successful snapshot", snapName);
+                return true;
+            }
+
+            // Next, enforce the maximum count, if the size is over the maximum number of
             // snapshots, then allow the oldest N (where N is the number over the maximum snapshot
             // count) snapshots to be eligible for deletion
             if (this.maximumSnapshotCount != null) {
-                if (allSnapshots.size() > this.maximumSnapshotCount) {
-                    int snapsToDelete = allSnapshots.size() - this.maximumSnapshotCount;
-                    boolean eligible = sortedSnapshots.stream()
+                if (successfulSnapshotCount > this.maximumSnapshotCount) {
+                    final long snapsToDelete = successfulSnapshotCount - this.maximumSnapshotCount;
+                    final boolean eligible = sortedSnapshots.stream()
                         .limit(snapsToDelete)
                         .anyMatch(s -> s.equals(si));
 
                     if (eligible) {
                         logger.trace("[{}]: ELIGIBLE as it is one of the {} oldest snapshots with " +
-                                "{} total snapshots, over the limit of {} maximum snapshots",
-                            snapName, snapsToDelete, snapCount, this.maximumSnapshotCount);
+                                "{} non-failed snapshots ({} total), over the limit of {} maximum snapshots",
+                            snapName, snapsToDelete, successfulSnapshotCount, totalSnapshotCount, this.maximumSnapshotCount);
                         return true;
                     } else {
                         logger.trace("[{}]: INELIGIBLE as it is not one of the {} oldest snapshots with " +
-                                "{} total snapshots, over the limit of {} maximum snapshots",
-                            snapName, snapsToDelete, snapCount, this.maximumSnapshotCount);
+                                "{} non-failed snapshots ({} total), over the limit of {} maximum snapshots",
+                            snapName, snapsToDelete, successfulSnapshotCount, totalSnapshotCount, this.maximumSnapshotCount);
                         return false;
                     }
                 }
@@ -149,25 +166,34 @@ public class SnapshotRetentionConfiguration implements ToXContentObject, Writeab
             // if we haven't hit the minimum then we need to keep the snapshot regardless of
             // expiration time
             if (this.minimumSnapshotCount != null) {
-                if (allSnapshots.size() <= this.minimumSnapshotCount) {
-                    logger.trace("[{}]: INELIGIBLE as there are {} snapshots and {} minimum snapshots needed",
-                        snapName, snapCount, this.minimumSnapshotCount);
-                    return false;
-                }
+                if (successfulSnapshotCount <= this.minimumSnapshotCount)
+                    if (SnapshotState.FAILED.equals(si.state()) == false) {
+                        logger.trace("[{}]: INELIGIBLE as there are {} non-failed snapshots ({} total) and {} minimum snapshots needed",
+                            snapName, successfulSnapshotCount, totalSnapshotCount, this.minimumSnapshotCount);
+                        return false;
+                    } else {
+                        logger.trace("[{}]: SKIPPING minimum snapshot count check as this snapshot is {} and not counted " +
+                            "towards the minimum snapshot count.", snapName, SnapshotState.FAILED);
+                    }
             }
 
             // Finally, check the expiration time of the snapshot, if it is past, then it is
             // eligible for deletion
             if (this.expireAfter != null) {
-                TimeValue snapshotAge = new TimeValue(nowSupplier.getAsLong() - si.startTime());
+                final TimeValue snapshotAge = new TimeValue(nowSupplier.getAsLong() - si.startTime());
 
                 if (this.minimumSnapshotCount != null) {
-                    int eligibleForExpiration = snapCount - minimumSnapshotCount;
+                    final long eligibleForExpiration = successfulSnapshotCount - minimumSnapshotCount;
 
                     // Only the oldest N snapshots are actually eligible, since if we went below this we
                     // would fall below the configured minimum number of snapshots to keep
-                    Set<SnapshotInfo> snapsEligibleForExpiration = sortedSnapshots.stream()
-                        .limit(eligibleForExpiration)
+                    final Stream<SnapshotInfo> successfulSnapsEligibleForExpiration = sortedSnapshots.stream()
+                        .filter(snap -> SnapshotState.SUCCESS.equals(snap.state()))
+                        .limit(eligibleForExpiration);
+                    final Stream<SnapshotInfo> failedSnaps = sortedSnapshots.stream()
+                        .filter(snap -> SnapshotState.FAILED.equals(snap.state()));
+
+                    final Set<SnapshotInfo> snapsEligibleForExpiration = Stream.concat(successfulSnapsEligibleForExpiration, failedSnaps)
                         .collect(Collectors.toSet());
 
                     if (snapsEligibleForExpiration.contains(si) == false) {
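
The interaction between `expire_after` and `min_count` in the hunk above can be read as: only the oldest (successful − `min_count`) successful snapshots are candidates for expiry, while failed snapshots are always candidates. Below is a rough, self-contained sketch of that selection; the `Snap` record (Java 16+) is a hypothetical stand-in for `SnapshotInfo`, and the limit is clamped at zero for simplicity:

    import java.util.Comparator;
    import java.util.List;
    import java.util.Set;
    import java.util.stream.Collectors;
    import java.util.stream.Stream;

    final class RetentionSketch {
        // Hypothetical stand-in for SnapshotInfo; not an Elasticsearch type.
        record Snap(String name, long startTime, boolean failed) {}

        static Set<Snap> expirationCandidates(List<Snap> all, int minCount) {
            final List<Snap> sorted = all.stream()
                .sorted(Comparator.comparingLong(Snap::startTime))
                .collect(Collectors.toList());
            final long successes = sorted.stream().filter(s -> s.failed() == false).count();
            // Only the oldest (successes - minCount) successful snapshots may expire,
            // so expiring them can never drop the policy below min_count ...
            final Stream<Snap> oldestSuccesses = sorted.stream()
                .filter(s -> s.failed() == false)
                .limit(Math.max(0, successes - minCount));
            // ... while every failed snapshot is a candidate once it is old enough.
            final Stream<Snap> failures = sorted.stream().filter(Snap::failed);
            return Stream.concat(oldestSuccesses, failures).collect(Collectors.toSet());
        }
    }

A snapshot selected this way is still only deleted if its age exceeds `expire_after`, exactly as in the predicate above.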

+ 130 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/slm/SnapshotRetentionConfigurationTests.java

@@ -7,8 +7,10 @@
 package org.elasticsearch.xpack.slm;
 
 import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.snapshots.SnapshotId;
 import org.elasticsearch.snapshots.SnapshotInfo;
+import org.elasticsearch.snapshots.SnapshotShardFailure;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
 import org.elasticsearch.xpack.core.slm.SnapshotRetentionConfiguration;
@@ -100,10 +102,137 @@ public class SnapshotRetentionConfigurationTests extends ESTestCase {
         assertThat(conf.getSnapshotDeletionPredicate(infos).test(s9), equalTo(false));
     }
 
+    public void testFailuresDeletedIfExpired() {
+        SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(
+            () -> TimeValue.timeValueDays(1).millis() + 1,
+            TimeValue.timeValueDays(1), null, null);
+        SnapshotInfo oldInfo = makeFailureInfo(0);
+        assertThat(conf.getSnapshotDeletionPredicate(Collections.singletonList(oldInfo)).test(oldInfo), equalTo(true));
+
+        SnapshotInfo newInfo = makeFailureInfo(1);
+        assertThat(conf.getSnapshotDeletionPredicate(Collections.singletonList(newInfo)).test(newInfo), equalTo(false));
+
+        List<SnapshotInfo> infos = new ArrayList<>();
+        infos.add(newInfo);
+        infos.add(oldInfo);
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(newInfo), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(oldInfo), equalTo(true));
+    }
+
+    public void testFailuresDeletedIfNoExpiryAndMoreRecentSuccessExists() {
+        SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 5);
+        SnapshotInfo s1 = makeInfo(1);
+        SnapshotInfo s2 = makeInfo(2);
+        SnapshotInfo s3 = makeFailureInfo(3);
+        SnapshotInfo s4 = makeInfo(4);
+
+        List<SnapshotInfo> infos = Arrays.asList(s1, s2, s3, s4);
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(true));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
+    }
+
+    public void testFailuresKeptIfNoExpiryAndNoMoreRecentSuccess() {
+        // Also tests that failures are not counted towards the maximum
+        SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 3);
+        SnapshotInfo s1 = makeInfo(1);
+        SnapshotInfo s2 = makeInfo(2);
+        SnapshotInfo s3 = makeInfo(3);
+        SnapshotInfo s4 = makeFailureInfo(4);
+
+        List<SnapshotInfo> infos = Arrays.asList(s1, s2, s3, s4);
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
+    }
+
+    public void testFailuresNotCountedTowardsMaximum() {
+        SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, TimeValue.timeValueDays(1), 2, 2);
+        SnapshotInfo s1 = makeInfo(1);
+        SnapshotInfo s2 = makeFailureInfo(2);
+        SnapshotInfo s3 = makeFailureInfo(3);
+        SnapshotInfo s4 = makeFailureInfo(4);
+        SnapshotInfo s5 = makeInfo(5);
+
+        List<SnapshotInfo> infos = Arrays.asList(s1, s2, s3, s4, s5);
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s5), equalTo(false));
+    }
+
+    public void testFailuresNotCountedTowardsMinimum() {
+        SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> TimeValue.timeValueDays(1).millis() + 1,
+            TimeValue.timeValueDays(1), 2, null);
+        SnapshotInfo oldInfo = makeInfo(0);
+        SnapshotInfo failureInfo = makeFailureInfo(1);
+        SnapshotInfo newInfo = makeInfo(2);
+
+        List<SnapshotInfo> infos = new ArrayList<>();
+        infos.add(newInfo);
+        infos.add(failureInfo);
+        infos.add(oldInfo);
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(newInfo), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(failureInfo), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(oldInfo), equalTo(false));
+
+        conf = new SnapshotRetentionConfiguration(() -> TimeValue.timeValueDays(1).millis() + 2,
+            TimeValue.timeValueDays(1), 1, null);
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(newInfo), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(failureInfo), equalTo(true));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(oldInfo), equalTo(true));
+    }
+
+    public void testMostRecentSuccessfulTimestampIsUsed() {
+        SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 2);
+        SnapshotInfo s1 = makeInfo(1);
+        SnapshotInfo s2 = makeInfo(2);
+        SnapshotInfo s3 = makeFailureInfo(3);
+        SnapshotInfo s4 = makeFailureInfo(4);
+
+        List<SnapshotInfo> infos = Arrays.asList(s1, s2, s3, s4);
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(false));
+        assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
+    }
+
     private SnapshotInfo makeInfo(long startTime) {
         final Map<String, Object> meta = new HashMap<>();
         meta.put(SnapshotLifecyclePolicy.POLICY_ID_METADATA_FIELD, REPO);
+        final int totalShards = between(1,20);
         return new SnapshotInfo(new SnapshotId("snap-" + randomAlphaOfLength(3), "uuid"),
-            Collections.singletonList("foo"), startTime, false, meta);
+            Collections.singletonList("foo"),
+            startTime,
+            null,
+            startTime + between(1,10000),
+            totalShards,
+            new ArrayList<>(),
+            false,
+            meta);
+    }
+
+    private SnapshotInfo makeFailureInfo(long startTime) {
+        final Map<String, Object> meta = new HashMap<>();
+        meta.put(SnapshotLifecyclePolicy.POLICY_ID_METADATA_FIELD, REPO);
+        final int totalShards = between(1,20);
+        final List<SnapshotShardFailure> failures = new ArrayList<>();
+        final int failureCount = between(1,totalShards);
+        for (int i = 0; i < failureCount; i++) {
+            failures.add(new SnapshotShardFailure("nodeId", new ShardId("index-name", "index-uuid", i), "failed"));
+        }
+        assert failureCount == failures.size();
+        return new SnapshotInfo(new SnapshotId("snap-fail-" + randomAlphaOfLength(3), "uuid-fail"),
+            Collections.singletonList("foo-fail"),
+            startTime,
+            "forced-failure",
+            startTime + between(1,10000),
+            totalShards,
+            failures,
+            randomBoolean(),
+            meta);
     }
 }

+ 4 - 3
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotRetentionTask.java

@@ -131,7 +131,7 @@ public class SnapshotRetentionTask implements SchedulerEngine.Listener {
                 // Finally, asynchronously retrieve all the snapshots, deleting them serially,
                 // before updating the cluster state with the new metrics and setting 'running'
                 // back to false
-                getAllSuccessfulSnapshots(repositioriesToFetch, new ActionListener<>() {
+                getAllRetainableSnapshots(repositioriesToFetch, new ActionListener<>() {
                     @Override
                     public void onResponse(Map<String, List<SnapshotInfo>> allSnapshots) {
                         try {
@@ -222,7 +222,7 @@ public class SnapshotRetentionTask implements SchedulerEngine.Listener {
         return eligible;
     }
 
-    void getAllSuccessfulSnapshots(Collection<String> repositories, ActionListener<Map<String, List<SnapshotInfo>>> listener,
+    void getAllRetainableSnapshots(Collection<String> repositories, ActionListener<Map<String, List<SnapshotInfo>>> listener,
                                    Consumer<Exception> errorHandler) {
         if (repositories.isEmpty()) {
             // Skip retrieving anything if there are no repositories to fetch
@@ -236,11 +236,12 @@ public class SnapshotRetentionTask implements SchedulerEngine.Listener {
                 @Override
                 public void onResponse(final GetSnapshotsResponse resp) {
                     Map<String, List<SnapshotInfo>> snapshots = new HashMap<>();
+                    final Set<SnapshotState> retainableStates = Set.of(SnapshotState.SUCCESS, SnapshotState.FAILED);
                     repositories.forEach(repo -> {
                         snapshots.put(repo,
                             // Only return snapshots in a state that SLM retention may act on (SUCCESS or FAILED)
                             resp.getSnapshots(repo).stream()
-                                .filter(info -> info.state() == SnapshotState.SUCCESS)
+                                .filter(info -> retainableStates.contains(info.state()))
                                 .collect(Collectors.toList()));
                     });
                     listener.onResponse(snapshots);
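
With the rename, `getAllRetainableSnapshots` now hands callers both `SUCCESS` and `FAILED` snapshots per repository. A minimal sketch of consuming it, assuming the same imports as `SnapshotRetentionTask` above; the `retentionTask` instance, repository name, and logging here are purely illustrative:

    retentionTask.getAllRetainableSnapshots(List.of("my-repo"), new ActionListener<>() {
        @Override
        public void onResponse(Map<String, List<SnapshotInfo>> snapshotsByRepo) {
            // Each repository now maps to its SUCCESS and FAILED snapshots, so the
            // retention predicate can evaluate failed snapshots for deletion too.
            snapshotsByRepo.forEach((repo, snaps) -> snaps.forEach(info ->
                logger.trace("[{}] fetched [{}] in state [{}]",
                    repo, info.snapshotId().getName(), info.state())));
        }

        @Override
        public void onFailure(Exception e) {
            logger.warn("failed to fetch snapshots for retention", e);
        }
    }, e -> logger.warn("retention snapshot fetch error", e));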

+ 155 - 0
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

@@ -6,11 +6,15 @@
 
 package org.elasticsearch.xpack.slm;
 
+import org.elasticsearch.action.ActionFuture;
+import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
+import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.SnapshotsInProgress;
+import org.elasticsearch.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
@@ -18,7 +22,9 @@ import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.repositories.RepositoriesService;
 import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
+import org.elasticsearch.snapshots.SnapshotInfo;
 import org.elasticsearch.snapshots.SnapshotMissingException;
+import org.elasticsearch.snapshots.SnapshotState;
 import org.elasticsearch.snapshots.mockstore.MockRepository;
 import org.elasticsearch.test.ESIntegTestCase;
 import org.elasticsearch.test.junit.annotations.TestLogging;
@@ -38,8 +44,13 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
 
+import static org.elasticsearch.index.mapper.MapperService.SINGLE_MAPPING_NAME;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.hamcrest.Matchers.anyOf;
+import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 
@@ -231,6 +242,137 @@ public class SLMSnapshotBlockingIntegTests extends ESIntegTestCase {
         }
     }
 
+    public void testBasicFailureRetention() throws Exception {
+        final String indexName = "test-idx";
+        final String policyId = "test-policy";
+        // Setup
+        logger.info("-->  starting two master nodes and two data nodes");
+        internalCluster().startMasterOnlyNodes(2);
+        internalCluster().startDataOnlyNodes(2);
+
+        createAndPopulateIndex(indexName);
+
+        // Create a snapshot repo
+        initializeRepo(REPO);
+
+        createSnapshotPolicy(policyId, "snap", "1 2 3 4 5 ?", REPO, indexName, true,
+            new SnapshotRetentionConfiguration(null, 1, 2));
+
+        // Create a failed snapshot
+        AtomicReference<String> failedSnapshotName = new AtomicReference<>();
+        {
+            logger.info("-->  stopping random data node, which should cause shards to go missing");
+            internalCluster().stopRandomDataNode();
+            assertBusy(() ->
+                    assertEquals(ClusterHealthStatus.RED, client().admin().cluster().prepareHealth().get().getStatus()),
+                30, TimeUnit.SECONDS);
+
+            final String masterNode = blockMasterFromFinalizingSnapshotOnIndexFile(REPO);
+
+            logger.info("-->  start snapshot");
+            ActionFuture<ExecuteSnapshotLifecycleAction.Response> snapshotFuture = client()
+                .execute(ExecuteSnapshotLifecycleAction.INSTANCE, new ExecuteSnapshotLifecycleAction.Request(policyId));
+
+            logger.info("--> waiting for block to kick in on " + masterNode);
+            waitForBlock(masterNode, REPO, TimeValue.timeValueSeconds(60));
+
+            logger.info("-->  stopping master node");
+            internalCluster().stopCurrentMasterNode();
+
+            logger.info("-->  wait until the snapshot is done");
+            failedSnapshotName.set(snapshotFuture.get().getSnapshotName());
+            assertNotNull(failedSnapshotName.get());
+
+            logger.info("-->  verify that snapshot [{}] failed", failedSnapshotName.get());
+            assertBusy(() -> {
+                GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                    .prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
+                SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots(REPO).get(0);
+                assertEquals(SnapshotState.FAILED, snapshotInfo.state());
+            });
+        }
+
+        // Run retention - we'll check the results later to make sure it's had time to run.
+        {
+            logger.info("--> executing SLM retention");
+            assertAcked(client().execute(ExecuteSnapshotRetentionAction.INSTANCE, new ExecuteSnapshotRetentionAction.Request()).get());
+        }
+
+        // Take a successful snapshot
+        AtomicReference<String> successfulSnapshotName = new AtomicReference<>();
+        {
+            logger.info("--> deleting old index [{}], as it is now missing shards", indexName);
+            assertAcked(client().admin().indices().prepareDelete(indexName).get());
+            createAndPopulateIndex(indexName);
+
+            logger.info("--> unblocking snapshots");
+            unblockRepo(REPO);
+            unblockAllDataNodes(REPO);
+
+            logger.info("--> taking new snapshot");
+
+            ActionFuture<ExecuteSnapshotLifecycleAction.Response> snapshotResponse = client()
+                .execute(ExecuteSnapshotLifecycleAction.INSTANCE, new ExecuteSnapshotLifecycleAction.Request(policyId));
+            logger.info("--> waiting for snapshot to complete");
+            successfulSnapshotName.set(snapshotResponse.get().getSnapshotName());
+            assertNotNull(successfulSnapshotName.get());
+            Thread.sleep(TimeValue.timeValueSeconds(10).millis());
+            logger.info("-->  verify that snapshot [{}] succeeded", successfulSnapshotName.get());
+            assertBusy(() -> {
+                GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                    .prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
+                SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots(REPO).get(0);
+                assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
+            });
+        }
+
+        // Check that the failed snapshot from before still exists, now that retention has run
+        {
+            logger.info("-->  verify that snapshot [{}] still exists", failedSnapshotName.get());
+            GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                .prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
+            SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots(REPO).get(0);
+            assertEquals(SnapshotState.FAILED, snapshotInfo.state());
+        }
+
+        // Run retention again and make sure the failure was deleted
+        {
+            logger.info("--> executing SLM retention");
+            assertAcked(client().execute(ExecuteSnapshotRetentionAction.INSTANCE, new ExecuteSnapshotRetentionAction.Request()).get());
+            logger.info("--> waiting for failed snapshot [{}] to be deleted", failedSnapshotName.get());
+            assertBusy(() -> {
+                try {
+                    GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                        .prepareGetSnapshots(REPO).setSnapshots(failedSnapshotName.get()).get();
+                    assertThat(snapshotsStatusResponse.getSnapshots(REPO), empty());
+                } catch (SnapshotMissingException e) {
+                    // This is what we want to happen
+                }
+                logger.info("--> failed snapshot [{}] has been deleted, checking successful snapshot [{}] still exists",
+                    failedSnapshotName.get(), successfulSnapshotName.get());
+                GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                    .prepareGetSnapshots(REPO).setSnapshots(successfulSnapshotName.get()).get();
+                SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots(REPO).get(0);
+                assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
+            });
+        }
+    }
+
+    private void createAndPopulateIndex(String indexName) throws InterruptedException {
+        logger.info("--> creating and populating index [{}]", indexName);
+        assertAcked(prepareCreate(indexName, 0, Settings.builder()
+            .put("number_of_shards", 6).put("number_of_replicas", 0)));
+        ensureGreen();
+
+        final int numdocs = randomIntBetween(50, 100);
+        IndexRequestBuilder[] builders = new IndexRequestBuilder[numdocs];
+        for (int i = 0; i < builders.length; i++) {
+            builders[i] = client().prepareIndex(indexName, SINGLE_MAPPING_NAME, Integer.toString(i)).setSource("field1", "bar " + i);
+        }
+        indexRandom(true, builders);
+        flushAndRefresh();
+    }
+
     private void initializeRepo(String repoName) {
         client().admin().cluster().preparePutRepository(repoName)
             .setType("mock")
@@ -314,4 +456,17 @@ public class SLMSnapshotBlockingIntegTests extends ESIntegTestCase {
             ((MockRepository)repositoriesService.repository(repository)).unblock();
         }
     }
+
+    public void waitForBlock(String node, String repository, TimeValue timeout) throws InterruptedException {
+        long start = System.currentTimeMillis();
+        RepositoriesService repositoriesService = internalCluster().getInstance(RepositoriesService.class, node);
+        MockRepository mockRepository = (MockRepository) repositoriesService.repository(repository);
+        while (System.currentTimeMillis() - start < timeout.millis()) {
+            if (mockRepository.blocked()) {
+                return;
+            }
+            Thread.sleep(100);
+        }
+        fail("Timeout waiting for node [" + node + "] to be blocked");
+    }
 }

+ 1 - 1
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SnapshotRetentionTaskTests.java

@@ -442,7 +442,7 @@ public class SnapshotRetentionTaskTests extends ESTestCase {
         }
 
         @Override
-        void getAllSuccessfulSnapshots(Collection<String> repositories,
+        void getAllRetainableSnapshots(Collection<String> repositories,
                                        ActionListener<Map<String, List<SnapshotInfo>>> listener,
                                        Consumer<Exception> errorHandler) {
             listener.onResponse(this.snapshotRetriever.get());