Browse Source

Fix SLM check for restore in progress (#50868)

* Fix SLM check for restore in progress

This commit fixes the check in SLM where the `RestoreInProgress`
metadata was checked for existence. Rather than check existence we
should instead check the `isEmpty` method. Prior to this, a successful
restore for a repository that used SLM retention would prevent SLM
retention from running in subsequent invocations, due to SLM thinking
that a restore was still running.
Lee Hinman 5 years ago
parent
commit
c83aceb59c

+ 2 - 1
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SnapshotRetentionTask.java

@@ -446,7 +446,7 @@ public class SnapshotRetentionTask implements SchedulerEngine.Listener {
 
         // Cannot delete during a restore
         final RestoreInProgress restoreInProgress = state.custom(RestoreInProgress.TYPE);
-        if (restoreInProgress != null) {
+        if (restoreInProgress != null && restoreInProgress.isEmpty() == false) {
             return false;
         }
 
@@ -480,6 +480,7 @@ public class SnapshotRetentionTask implements SchedulerEngine.Listener {
                     logger.debug("retrying SLM snapshot retention deletion after snapshot operation has completed");
                     reRun.accept(state);
                 } else {
+                    logger.trace("received new cluster state but a snapshot operation is still running");
                     observer.waitForNextChange(this);
                 }
             } catch (Exception e) {

+ 73 - 0
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SLMSnapshotBlockingIntegTests.java

@@ -8,6 +8,9 @@ package org.elasticsearch.xpack.slm;
 
 import org.elasticsearch.action.ActionFuture;
 import org.elasticsearch.action.admin.cluster.snapshots.get.GetSnapshotsResponse;
+import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotAction;
+import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotRequest;
+import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotResponse;
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
 import org.elasticsearch.action.index.IndexRequestBuilder;
@@ -22,6 +25,7 @@ import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.repositories.RepositoriesService;
 import org.elasticsearch.repositories.RepositoryException;
+import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.snapshots.ConcurrentSnapshotExecutionException;
 import org.elasticsearch.snapshots.SnapshotInfo;
 import org.elasticsearch.snapshots.SnapshotMissingException;
@@ -30,6 +34,7 @@ import org.elasticsearch.snapshots.mockstore.MockRepository;
 import org.elasticsearch.test.ESIntegTestCase;
 import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;
 import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
+import org.elasticsearch.xpack.core.slm.SnapshotInvocationRecord;
 import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
 import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicyItem;
 import org.elasticsearch.xpack.core.slm.SnapshotRetentionConfiguration;
@@ -385,6 +390,74 @@ public class SLMSnapshotBlockingIntegTests extends ESIntegTestCase {
         }
     }
 
+    public void testSLMRetentionAfterRestore() throws Exception {
+        final String indexName = "test";
+        final String policyName = "test-policy";
+        int docCount = 20;
+        for (int i = 0; i < docCount; i++) {
+            index(indexName, i + "", Collections.singletonMap("foo", "bar"));
+        }
+
+        // Create a snapshot repo
+        initializeRepo(REPO);
+
+        logger.info("--> creating policy {}", policyName);
+        createSnapshotPolicy(policyName, "snap", NEVER_EXECUTE_CRON_SCHEDULE, REPO, indexName, true, false,
+            new SnapshotRetentionConfiguration(TimeValue.ZERO, null, null));
+
+        logger.info("--> executing snapshot lifecycle");
+        final String snapshotName = executePolicy(policyName);
+
+        // Check that the executed snapshot shows up in the SLM output
+        assertBusy(() -> {
+            GetSnapshotLifecycleAction.Response getResp =
+                client().execute(GetSnapshotLifecycleAction.INSTANCE, new GetSnapshotLifecycleAction.Request(policyName)).get();
+            logger.info("--> checking for in progress snapshot...");
+
+            assertThat(getResp.getPolicies().size(), greaterThan(0));
+            SnapshotLifecyclePolicyItem item = getResp.getPolicies().get(0);
+            SnapshotInvocationRecord lastSuccess = item.getLastSuccess();
+            assertNotNull(lastSuccess);
+            assertThat(lastSuccess.getSnapshotName(), equalTo(snapshotName));
+        });
+
+        logger.info("--> restoring index");
+        RestoreSnapshotRequest restoreReq = new RestoreSnapshotRequest(REPO, snapshotName);
+        restoreReq.indices(indexName);
+        restoreReq.renamePattern("(.+)");
+        restoreReq.renameReplacement("restored_$1");
+        restoreReq.waitForCompletion(true);
+        RestoreSnapshotResponse resp = client().execute(RestoreSnapshotAction.INSTANCE, restoreReq).get();
+        assertThat(resp.status(), equalTo(RestStatus.OK));
+
+        logger.info("--> executing SLM retention");
+        assertAcked(client().execute(ExecuteSnapshotRetentionAction.INSTANCE, new ExecuteSnapshotRetentionAction.Request()).get());
+        logger.info("--> waiting for {} snapshot to be deleted", snapshotName);
+        assertBusy(() -> {
+            try {
+                try {
+                    GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster()
+                        .prepareGetSnapshots(REPO).setSnapshots(snapshotName).get();
+                    assertThat(snapshotsStatusResponse.getSnapshots(REPO), empty());
+                } catch (SnapshotMissingException e) {
+                    // This is what we want to happen
+                }
+                logger.info("--> snapshot [{}] has been deleted", snapshotName);
+            } catch (RepositoryException re) {
+                // Concurrent status calls and write operations may lead to failures in determining the current repository generation
+                // TODO: Remove this hack once tracking the current repository generation has been made consistent
+                throw new AssertionError(re);
+            }
+        });
+
+        // Cancel/delete the snapshot
+        try {
+            client().admin().cluster().prepareDeleteSnapshot(REPO, snapshotName).get();
+        } catch (SnapshotMissingException e) {
+            // ignore
+        }
+    }
+
     private SnapshotsStatusResponse getSnapshotStatus(String snapshotName) {
         try {
             return client().admin().cluster().prepareSnapshotStatus(REPO).setSnapshots(snapshotName).get();

+ 13 - 4
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SnapshotRetentionTaskTests.java

@@ -67,6 +67,7 @@ import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.not;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
 
 public class SnapshotRetentionTaskTests extends ESTestCase {
 
@@ -363,6 +364,14 @@ public class SnapshotRetentionTaskTests extends ESTestCase {
             .build();
 
         assertThat(SnapshotRetentionTask.okayToDeleteSnapshots(state), equalTo(false));
+
+        restoreInProgress = mock(RestoreInProgress.class);
+        when(restoreInProgress.isEmpty()).thenReturn(true);
+        state = ClusterState.builder(new ClusterName("cluster"))
+            .putCustom(RestoreInProgress.TYPE, restoreInProgress)
+            .build();
+
+        assertThat(SnapshotRetentionTask.okayToDeleteSnapshots(state), equalTo(true));
     }
 
     public void testSkipWhileStopping() throws Exception {
@@ -420,10 +429,10 @@ public class SnapshotRetentionTaskTests extends ESTestCase {
             final String repoId = "repo";
             SnapshotLifecyclePolicy policy = new SnapshotLifecyclePolicy(policyId, "snap", "1 * * * * ?",
                 repoId, null, new SnapshotRetentionConfiguration(TimeValue.timeValueDays(30), null, null));
-    
+
             ClusterState state = createState(mode, policy);
             ClusterServiceUtils.setState(clusterService, state);
-    
+
             AtomicBoolean retentionWasRun = new AtomicBoolean(false);
             MockSnapshotRetentionTask task = new MockSnapshotRetentionTask(noOpClient, clusterService,
                 new SnapshotLifecycleTaskTests.VerifyingHistoryStore(noOpClient, ZoneOffset.UTC, (historyItem) -> {
@@ -436,10 +445,10 @@ public class SnapshotRetentionTaskTests extends ESTestCase {
                 (deletionPolicyId, repo, snapId, slmStats, listener) -> {
                 },
                 System::nanoTime);
-    
+
             long time = System.currentTimeMillis();
             task.triggered(new SchedulerEngine.Event(SnapshotRetentionService.SLM_RETENTION_MANUAL_JOB_ID, time, time));
-    
+
             assertTrue("retention should be run manually even if SLM is disabled", retentionWasRun.get());
         } finally {
             threadPool.shutdownNow();