@@ -24,6 +24,7 @@ import com.google.common.collect.ImmutableMap;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.Version;
 import org.elasticsearch.action.search.ShardSearchFailure;
 import org.elasticsearch.action.support.IndicesOptions;
 import org.elasticsearch.cluster.*;
@@ -33,6 +34,8 @@ import org.elasticsearch.cluster.metadata.SnapshotMetaData.State;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.IndexRoutingTable;
+import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
+import org.elasticsearch.cluster.routing.RoutingTable;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.component.AbstractLifecycleComponent;
@@ -63,7 +66,6 @@ import java.util.concurrent.locks.ReentrantLock;

 import static com.google.common.collect.Lists.newArrayList;
 import static com.google.common.collect.Maps.newHashMap;
-import static com.google.common.collect.Maps.newHashMapWithExpectedSize;
 import static com.google.common.collect.Sets.newHashSet;

 /**
@@ -503,6 +505,9 @@ public class SnapshotsService extends AbstractLifecycleComponent<SnapshotsServic
                 if (event.nodesRemoved()) {
                     processSnapshotsOnRemovedNodes(event);
                 }
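+                // The routing table changed - shards that a snapshot is waiting for may have started or become unassigned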
+                if (event.routingTableChanged()) {
+                    processStartedShards(event);
+                }
             }
             SnapshotMetaData prev = event.previousState().metaData().custom(SnapshotMetaData.TYPE);
             SnapshotMetaData curr = event.state().metaData().custom(SnapshotMetaData.TYPE);
@@ -605,6 +610,112 @@ public class SnapshotsService extends AbstractLifecycleComponent<SnapshotsServic
             }
         }

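+    /**
+     * Checks whether any shards that running snapshots are waiting for have started or become unassigned
+     * and, if so, submits a cluster state update task that moves those shards out of the WAITING state.
+     */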
+    private void processStartedShards(ClusterChangedEvent event) {
+        if (waitingShardsStartedOrUnassigned(event)) {
+            clusterService.submitStateUpdateTask("update snapshot state after shards started", new ClusterStateUpdateTask() {
+                @Override
+                public ClusterState execute(ClusterState currentState) throws Exception {
+                    MetaData metaData = currentState.metaData();
+                    RoutingTable routingTable = currentState.routingTable();
+                    MetaData.Builder mdBuilder = MetaData.builder(currentState.metaData());
+                    SnapshotMetaData snapshots = metaData.custom(SnapshotMetaData.TYPE);
+                    if (snapshots != null) {
+                        boolean changed = false;
+                        ArrayList<SnapshotMetaData.Entry> entries = newArrayList();
+                        for (final SnapshotMetaData.Entry snapshot : snapshots.entries()) {
+                            SnapshotMetaData.Entry updatedSnapshot = snapshot;
+                            if (snapshot.state() == State.STARTED) {
+                                ImmutableMap<ShardId, ShardSnapshotStatus> shards = processWaitingShards(snapshot.shards(), routingTable);
+                                if (shards != null) {
+                                    changed = true;
+                                    if (!snapshot.state().completed() && completed(shards.values())) {
+                                        updatedSnapshot = new SnapshotMetaData.Entry(snapshot.snapshotId(), snapshot.includeGlobalState(), State.SUCCESS, snapshot.indices(), shards);
+                                        endSnapshot(updatedSnapshot);
+                                    } else {
+                                        updatedSnapshot = new SnapshotMetaData.Entry(snapshot.snapshotId(), snapshot.includeGlobalState(), snapshot.state(), snapshot.indices(), shards);
+                                    }
+                                }
+                                entries.add(updatedSnapshot);
+                            }
+                        }
+                        if (changed) {
+                            snapshots = new SnapshotMetaData(entries.toArray(new SnapshotMetaData.Entry[entries.size()]));
+                            mdBuilder.putCustom(SnapshotMetaData.TYPE, snapshots);
+                            return ClusterState.builder(currentState).metaData(mdBuilder).build();
+                        }
+                    }
+                    return currentState;
+                }
+
+                @Override
+                public void onFailure(String source, Throwable t) {
+                    logger.warn("failed to update snapshot state after shards started from [{}] ", t, source);
+                }
+            });
+        }
+    }
+
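+    /**
+     * Resolves shards that are in the WAITING state: shards whose primaries have started are switched to
+     * snapshotting on the node that now holds the primary, shards whose primaries are still initializing or
+     * relocating keep waiting, and shards whose primaries became unassigned are failed.
+     *
+     * @return an updated shard map, or null if no shard status changed
+     */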
+    private ImmutableMap<ShardId, ShardSnapshotStatus> processWaitingShards(ImmutableMap<ShardId, ShardSnapshotStatus> snapshotShards, RoutingTable routingTable) {
+        boolean snapshotChanged = false;
+        ImmutableMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableMap.builder();
+        for (ImmutableMap.Entry<ShardId, ShardSnapshotStatus> shardEntry : snapshotShards.entrySet()) {
+            ShardSnapshotStatus shardStatus = shardEntry.getValue();
+            if (shardStatus.state() == State.WAITING) {
+                ShardId shardId = shardEntry.getKey();
+                IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex());
+                if (indexShardRoutingTable != null) {
+                    IndexShardRoutingTable shardRouting = indexShardRoutingTable.shard(shardId.id());
+                    if (shardRouting != null && shardRouting.primaryShard() != null) {
+                        if (shardRouting.primaryShard().started()) {
+                            // Shard that we were waiting for has started on a node, let's process it
+                            snapshotChanged = true;
+                            logger.trace("starting shard that we were waiting for [{}] on node [{}]", shardEntry.getKey(), shardStatus.nodeId());
+                            shards.put(shardEntry.getKey(), new ShardSnapshotStatus(shardRouting.primaryShard().currentNodeId()));
+                            continue;
+                        } else if (shardRouting.primaryShard().initializing() || shardRouting.primaryShard().relocating()) {
+                            // Shard that we were waiting for hasn't started yet or is still relocating - will continue to wait
+                            shards.put(shardEntry);
+                            continue;
+                        }
+                    }
+                }
+                // Shard that we were waiting for went into unassigned state or disappeared - giving up
+                snapshotChanged = true;
+                logger.warn("failing snapshot of shard [{}] on unassigned shard [{}]", shardEntry.getKey(), shardStatus.nodeId());
+                shards.put(shardEntry.getKey(), new ShardSnapshotStatus(shardStatus.nodeId(), State.FAILED, "shard is unassigned"));
+            } else {
+                shards.put(shardEntry);
+            }
+        }
+        if (snapshotChanged) {
+            return shards.build();
+        } else {
+            return null;
+        }
+    }
+
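+    /**
+     * Returns true if any shard that a running snapshot is waiting for has either started or become
+     * unassigned as part of this cluster state change.
+     */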
+    private boolean waitingShardsStartedOrUnassigned(ClusterChangedEvent event) {
+        SnapshotMetaData curr = event.state().metaData().custom(SnapshotMetaData.TYPE);
+        if (curr != null) {
+            for (SnapshotMetaData.Entry entry : curr.entries()) {
+                if (entry.state() == State.STARTED && !entry.waitingIndices().isEmpty()) {
+                    for (String index : entry.waitingIndices().keySet()) {
+                        if (event.indexRoutingTableChanged(index)) {
+                            IndexRoutingTable indexShardRoutingTable = event.state().getRoutingTable().index(index);
+                            for (ShardId shardId : entry.waitingIndices().get(index)) {
+                                ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.id()).primaryShard();
+                                if (shardRouting != null && (shardRouting.started() || shardRouting.unassigned())) {
+                                    return true;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        return false;
+    }
+
     private boolean removedNodesCleanupNeeded(ClusterChangedEvent event) {
         // Check if we just became the master
         boolean newMaster = !event.previousState().nodes().localNodeMaster();
@@ -646,44 +757,51 @@ public class SnapshotsService extends AbstractLifecycleComponent<SnapshotsServic

         // For now we will be mostly dealing with a single snapshot at a time but might have multiple simultaneously running
         // snapshots in the future
-        HashMap<SnapshotId, SnapshotShards> newSnapshots = null;
+        Map<SnapshotId, Map<ShardId, IndexShardSnapshotStatus>> newSnapshots = newHashMap();
         // Now go through all snapshots and update existing or create missing
         final String localNodeId = clusterService.localNode().id();
         for (SnapshotMetaData.Entry entry : snapshotMetaData.entries()) {
-            HashMap<ShardId, IndexShardSnapshotStatus> startedShards = null;
-            for (Map.Entry<ShardId, SnapshotMetaData.ShardSnapshotStatus> shard : entry.shards().entrySet()) {
-                // Check if we have new shards to start processing on
-                if (localNodeId.equals(shard.getValue().nodeId())) {
-                    if (entry.state() == State.STARTED) {
-                        if (startedShards == null) {
-                            startedShards = newHashMap();
-                        }
-                        startedShards.put(shard.getKey(), new IndexShardSnapshotStatus());
-                    } else if (entry.state() == State.ABORTED) {
-                        SnapshotShards snapshotShards = shardSnapshots.get(entry.snapshotId());
-                        if (snapshotShards != null) {
-                            IndexShardSnapshotStatus snapshotStatus = snapshotShards.shards.get(shard.getKey());
-                            if (snapshotStatus != null) {
-                                snapshotStatus.abort();
-                            }
+            if (entry.state() == State.STARTED) {
+                Map<ShardId, IndexShardSnapshotStatus> startedShards = newHashMap();
+                SnapshotShards snapshotShards = shardSnapshots.get(entry.snapshotId());
+                for (Map.Entry<ShardId, SnapshotMetaData.ShardSnapshotStatus> shard : entry.shards().entrySet()) {
+                    // Add all new shards to start processing on
+                    if (localNodeId.equals(shard.getValue().nodeId())) {
+                        if (shard.getValue().state() == State.INIT && (snapshotShards == null || !snapshotShards.shards.containsKey(shard.getKey()))) {
+                            logger.trace("[{}] - Adding shard to the queue", shard.getKey());
+                            startedShards.put(shard.getKey(), new IndexShardSnapshotStatus());
                         }
                     }
                 }
-            }
-            if (startedShards != null) {
-                if (!survivors.containsKey(entry.snapshotId())) {
-                    if (newSnapshots == null) {
-                        newSnapshots = newHashMapWithExpectedSize(2);
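+                // Remember the newly started shards and merge them with shards of this snapshot already running on this node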
+                if (!startedShards.isEmpty()) {
+                    newSnapshots.put(entry.snapshotId(), startedShards);
+                    if (snapshotShards != null) {
+                        // We already saw this snapshot but we need to add more started shards
+                        ImmutableMap.Builder<ShardId, IndexShardSnapshotStatus> shards = ImmutableMap.builder();
+                        // Put all shards that were already running on this node
+                        shards.putAll(snapshotShards.shards);
+                        // Put all newly started shards
+                        shards.putAll(startedShards);
+                        survivors.put(entry.snapshotId(), new SnapshotShards(shards.build()));
+                    } else {
+                        // Brand new snapshot that we haven't seen before
+                        survivors.put(entry.snapshotId(), new SnapshotShards(ImmutableMap.copyOf(startedShards)));
+                    }
+                }
+            } else if (entry.state() == State.ABORTED) {
+                // Abort all running shards for this snapshot
+                SnapshotShards snapshotShards = shardSnapshots.get(entry.snapshotId());
+                if (snapshotShards != null) {
+                    for (Map.Entry<ShardId, SnapshotMetaData.ShardSnapshotStatus> shard : entry.shards().entrySet()) {
+                        IndexShardSnapshotStatus snapshotStatus = snapshotShards.shards.get(shard.getKey());
+                        if (snapshotStatus != null) {
+                            snapshotStatus.abort();
+                        }
                    }
-                    newSnapshots.put(entry.snapshotId(), new SnapshotShards(ImmutableMap.copyOf(startedShards)));
                }
            }
        }

-        if (newSnapshots != null) {
-            survivors.putAll(newSnapshots);
-        }
-
         // Update the list of snapshots that we saw and tried to start
         // If startup of these shards fails later, we don't want to try starting these shards again
         shutdownLock.lock();
@@ -697,10 +815,10 @@ public class SnapshotsService extends AbstractLifecycleComponent<SnapshotsServic
             shutdownLock.unlock();
         }

-        // We have new snapshots to process -
-        if (newSnapshots != null) {
-            for (final Map.Entry<SnapshotId, SnapshotShards> entry : newSnapshots.entrySet()) {
-                for (final Map.Entry<ShardId, IndexShardSnapshotStatus> shardEntry : entry.getValue().shards.entrySet()) {
+        // We have new shards to start
+        if (!newSnapshots.isEmpty()) {
+            for (final Map.Entry<SnapshotId, Map<ShardId, IndexShardSnapshotStatus>> entry : newSnapshots.entrySet()) {
+                for (final Map.Entry<ShardId, IndexShardSnapshotStatus> shardEntry : entry.getValue().entrySet()) {
                     try {
                         final IndexShardSnapshotAndRestoreService shardSnapshotService = indicesService.indexServiceSafe(shardEntry.getKey().getIndex()).shardInjectorSafe(shardEntry.getKey().id())
                                 .getInstance(IndexShardSnapshotAndRestoreService.class);
@@ -1089,6 +1207,9 @@ public class SnapshotsService extends AbstractLifecycleComponent<SnapshotsServic
                     ShardRouting primary = indexRoutingTable.shard(i).primaryShard();
                     if (primary == null || !primary.assignedToNode()) {
                         builder.put(shardId, new SnapshotMetaData.ShardSnapshotStatus(null, State.MISSING, "primary shard is not allocated"));
+                    } else if (clusterState.getNodes().smallestVersion().onOrAfter(Version.V_1_2_0) && (primary.relocating() || primary.initializing())) {
+                        // The WAITING state was introduced in v1.2.0 - don't use it if there are nodes with an older version in the cluster
+                        builder.put(shardId, new SnapshotMetaData.ShardSnapshotStatus(primary.currentNodeId(), State.WAITING));
                     } else if (!primary.started()) {
                         builder.put(shardId, new SnapshotMetaData.ShardSnapshotStatus(primary.currentNodeId(), State.MISSING, "primary shard hasn't been started yet"));
                     } else {