@@ -46,6 +46,7 @@ import org.elasticsearch.action.support.ActiveShardCount;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ClusterStateUpdateTask;
+import org.elasticsearch.cluster.RestoreInProgress;
 import org.elasticsearch.cluster.SnapshotsInProgress;
 import org.elasticsearch.cluster.SnapshotsInProgress.Entry;
 import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
@@ -55,6 +56,10 @@ import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.metadata.MappingMetaData;
 import org.elasticsearch.cluster.metadata.MetaDataIndexStateService;
 import org.elasticsearch.cluster.routing.IndexRoutingTable;
+import org.elasticsearch.cluster.routing.RecoverySource;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.ShardRoutingState;
+import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Strings;
@@ -97,14 +102,15 @@ import java.util.Map;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
+import java.util.function.Consumer;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
 import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
+import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
 import static org.elasticsearch.index.IndexSettings.INDEX_REFRESH_INTERVAL_SETTING;
-import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
 import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAliasesExist;
@@ -117,9 +123,11 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertInde
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
 import static org.hamcrest.Matchers.allOf;
+import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.lessThan;
 import static org.hamcrest.Matchers.not;
@@ -824,6 +832,8 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
         prepareCreate("test-idx").setSettings(Settings.builder().put("index.allocation.max_retries", Integer.MAX_VALUE)).get();
         ensureGreen();
 
+        final NumShards numShards = getNumShards("test-idx");
+
         logger.info("--> indexing some data");
         for (int i = 0; i < 100; i++) {
             index("test-idx", "doc", Integer.toString(i), "foo", "bar" + i);
@@ -848,14 +858,31 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
         logger.info("--> delete index");
         cluster().wipeIndices("test-idx");
         logger.info("--> restore index after deletion");
-        RestoreSnapshotResponse restoreSnapshotResponse = client.admin().cluster().prepareRestoreSnapshot("test-repo", "test-snap").setWaitForCompletion(true).execute().actionGet();
-        assertThat(restoreSnapshotResponse.getRestoreInfo().totalShards(), greaterThan(0));
-        SearchResponse countResponse = client.prepareSearch("test-idx").setSize(0).get();
-        assertThat(countResponse.getHits().getTotalHits(), equalTo(100L));
+        final RestoreSnapshotResponse restoreResponse = client.admin().cluster().prepareRestoreSnapshot("test-repo", "test-snap")
+            .setWaitForCompletion(true)
+            .get();
+
         logger.info("--> total number of simulated failures during restore: [{}]", getFailureCount("test-repo"));
+        final RestoreInfo restoreInfo = restoreResponse.getRestoreInfo();
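+        // a snapshot restore only recovers primary shards, so the restore info should account for numPrimaries shards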
+        assertThat(restoreInfo.totalShards(), equalTo(numShards.numPrimaries));
+
+        if (restoreInfo.successfulShards() == restoreInfo.totalShards()) {
+            // All shards were restored, we must find the exact number of hits
+            assertHitCount(client.prepareSearch("test-idx").setSize(0).get(), 100L);
+        } else {
+            // One or more shards failed to be restored. This can happen when there is
+            // only 1 data node: a shard failed because of the random IO exceptions
+            // during restore and then we don't allow the shard to be assigned on the
+            // same node again during the same reroute operation. Then another reroute
+            // operation is scheduled, but the RestoreInProgressAllocationDecider will
+            // block the shard to be assigned again because it failed during restore.
+            final ClusterStateResponse clusterStateResponse = client.admin().cluster().prepareState().get();
+            assertEquals(1, clusterStateResponse.getState().getNodes().getDataNodes().size());
+            assertEquals(restoreInfo.failedShards(),
+                clusterStateResponse.getState().getRoutingTable().shardsWithState(ShardRoutingState.UNASSIGNED).size());
+        }
     }
 
-    @TestLogging("org.elasticsearch.cluster.routing:TRACE,org.elasticsearch.snapshots:TRACE")
     public void testDataFileCorruptionDuringRestore() throws Exception {
         Path repositoryLocation = randomRepoPath();
         Client client = client();
@@ -907,6 +934,155 @@ public class SharedClusterSnapshotRestoreIT extends AbstractSnapshotIntegTestCas
         cluster().wipeIndices("test-idx");
     }
 
+    /**
+     * Test that restoring a snapshot whose files can't be downloaded at all does not
+     * get stuck or hang indefinitely.
+     */
+    public void testUnrestorableFilesDuringRestore() throws Exception {
+        final String indexName = "unrestorable-files";
+        final int maxRetries = randomIntBetween(1, 10);
+
+        Settings createIndexSettings = Settings.builder().put(SETTING_ALLOCATION_MAX_RETRY.getKey(), maxRetries).build();
+
+        Settings repositorySettings = Settings.builder()
+            .put("random", randomAlphaOfLength(10))
+            .put("max_failure_number", 10000000L)
+            // No Lucene corruptions: we want to test retries
+            .put("use_lucene_corruption", false)
+            // Restoring a file will never complete
+            .put("random_data_file_io_exception_rate", 1.0)
+            .build();
+
+        Consumer<UnassignedInfo> checkUnassignedInfo = unassignedInfo -> {
+            assertThat(unassignedInfo.getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
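+            // allocation is expected to have failed either maxRetries times (retries exhausted) or, presumably, only once if the shard is not retried after the failed restore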
+            assertThat(unassignedInfo.getNumFailedAllocations(), anyOf(equalTo(maxRetries), equalTo(1)));
+        };
+
+        unrestorableUseCase(indexName, createIndexSettings, repositorySettings, Settings.EMPTY, checkUnassignedInfo, () -> {});
+    }
+
+    /**
+     * Test that restoring an index with shard allocation filtering settings that prevent
+     * its allocation does not hang indefinitely.
+     */
+    public void testUnrestorableIndexDuringRestore() throws Exception {
+        final String indexName = "unrestorable-index";
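+        // restrict allocation to a random node name that should not match any node, so the restored shards cannot be assigned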
+        Settings restoreIndexSettings = Settings.builder().put("index.routing.allocation.include._name", randomAlphaOfLength(5)).build();
+
+        Consumer<UnassignedInfo> checkUnassignedInfo = unassignedInfo -> {
+            assertThat(unassignedInfo.getReason(), equalTo(UnassignedInfo.Reason.NEW_INDEX_RESTORED));
+        };
+
+        Runnable fixupAction = () -> {
+            // remove the shard allocation filtering settings and use the Reroute API to retry the failed shards
+            assertAcked(client().admin().indices().prepareUpdateSettings(indexName)
+                .setSettings(Settings.builder()
+                    .putNull("index.routing.allocation.include._name")
+                    .build()));
+            assertAcked(client().admin().cluster().prepareReroute().setRetryFailed(true));
+        };
+
+        unrestorableUseCase(indexName, Settings.EMPTY, Settings.EMPTY, restoreIndexSettings, checkUnassignedInfo, fixupAction);
+    }
+
+    /** Execute the unrestorable test use case **/
+    private void unrestorableUseCase(final String indexName,
+                                     final Settings createIndexSettings,
+                                     final Settings repositorySettings,
+                                     final Settings restoreIndexSettings,
+                                     final Consumer<UnassignedInfo> checkUnassignedInfo,
+                                     final Runnable fixUpAction) throws Exception {
+        // create a test repository
+        final Path repositoryLocation = randomRepoPath();
+        assertAcked(client().admin().cluster().preparePutRepository("test-repo")
+            .setType("fs")
+            .setSettings(Settings.builder().put("location", repositoryLocation)));
+        // create a test index
+        assertAcked(prepareCreate(indexName, Settings.builder().put(createIndexSettings)));
+
+        // index some documents
+        final int nbDocs = scaledRandomIntBetween(10, 100);
+        for (int i = 0; i < nbDocs; i++) {
+            index(indexName, "doc", Integer.toString(i), "foo", "bar" + i);
+        }
+        flushAndRefresh(indexName);
+        assertThat(client().prepareSearch(indexName).setSize(0).get().getHits().getTotalHits(), equalTo((long) nbDocs));
+
+        // create a snapshot
+        final NumShards numShards = getNumShards(indexName);
+        CreateSnapshotResponse snapshotResponse = client().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap")
+            .setWaitForCompletion(true)
+            .setIndices(indexName)
+            .get();
+
+        assertThat(snapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.SUCCESS));
+        assertThat(snapshotResponse.getSnapshotInfo().successfulShards(), equalTo(numShards.numPrimaries));
+        assertThat(snapshotResponse.getSnapshotInfo().failedShards(), equalTo(0));
+
+        // delete the test index
+        assertAcked(client().admin().indices().prepareDelete(indexName));
+
+        // update the test repository
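+        // switch the repository to the "mock" type so that the provided repositorySettings can simulate failures during the restore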
+        assertAcked(client().admin().cluster().preparePutRepository("test-repo")
+            .setType("mock")
+            .setSettings(Settings.builder()
+                .put("location", repositoryLocation)
+                .put(repositorySettings)
+                .build()));
+
+        // attempt to restore the snapshot with the given settings
+        RestoreSnapshotResponse restoreResponse = client().admin().cluster().prepareRestoreSnapshot("test-repo", "test-snap")
+            .setIndices(indexName)
+            .setIndexSettings(restoreIndexSettings)
+            .setWaitForCompletion(true)
+            .get();
+
+        // check that all shards failed during restore
+        assertThat(restoreResponse.getRestoreInfo().totalShards(), equalTo(numShards.numPrimaries));
+        assertThat(restoreResponse.getRestoreInfo().successfulShards(), equalTo(0));
+
+        ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().setCustoms(true).setRoutingTable(true).get();
+
+        // check that there is no restore in progress
+        RestoreInProgress restoreInProgress = clusterStateResponse.getState().custom(RestoreInProgress.TYPE);
+        assertNotNull("RestoreInProgress must not be null", restoreInProgress);
+        assertThat("RestoreInProgress must be empty", restoreInProgress.entries(), hasSize(0));
+
+        // check that the shards have been created but are not assigned
+        assertThat(clusterStateResponse.getState().getRoutingTable().allShards(indexName), hasSize(numShards.totalNumShards));
+
+        // check that every primary shard is unassigned
+        for (ShardRouting shard : clusterStateResponse.getState().getRoutingTable().allShards(indexName)) {
+            if (shard.primary()) {
+                assertThat(shard.state(), equalTo(ShardRoutingState.UNASSIGNED));
+                assertThat(shard.recoverySource().getType(), equalTo(RecoverySource.Type.SNAPSHOT));
+                assertThat(shard.unassignedInfo().getLastAllocationStatus(), equalTo(UnassignedInfo.AllocationStatus.DECIDERS_NO));
+                checkUnassignedInfo.accept(shard.unassignedInfo());
+            }
+        }
+
+        // update the test repository in order to make it work
+        assertAcked(client().admin().cluster().preparePutRepository("test-repo")
+            .setType("fs")
+            .setSettings(Settings.builder().put("location", repositoryLocation)));
+
+        // execute action to eventually fix the situation
+        fixUpAction.run();
+
+        // delete the index and restore again
+        assertAcked(client().admin().indices().prepareDelete(indexName));
+
+        restoreResponse = client().admin().cluster().prepareRestoreSnapshot("test-repo", "test-snap").setWaitForCompletion(true).get();
+        assertThat(restoreResponse.getRestoreInfo().totalShards(), equalTo(numShards.numPrimaries));
+        assertThat(restoreResponse.getRestoreInfo().successfulShards(), equalTo(numShards.numPrimaries));
+
+        // Wait for the shards to be assigned
+        ensureGreen(indexName);
+        refresh(indexName);
+
+        assertThat(client().prepareSearch(indexName).setSize(0).get().getHits().getTotalHits(), equalTo((long) nbDocs));
+    }
+
     public void testDeletionOfFailingToRecoverIndexShouldStopRestore() throws Exception {
         Path repositoryLocation = randomRepoPath();
         Client client = client();