|
@@ -36,7 +36,6 @@ import org.elasticsearch.cluster.ClusterService;
|
|
|
import org.elasticsearch.cluster.ClusterState;
|
|
|
import org.elasticsearch.cluster.ClusterStateObserver;
|
|
|
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
|
|
|
-import org.elasticsearch.cluster.action.shard.NoOpShardStateActionListener;
|
|
|
import org.elasticsearch.cluster.action.shard.ShardStateAction;
|
|
|
import org.elasticsearch.cluster.block.ClusterBlockException;
|
|
|
import org.elasticsearch.cluster.block.ClusterBlockLevel;
|
|
@@ -81,6 +80,8 @@ import java.util.function.Supplier;
|
|
|
*/
|
|
|
public abstract class TransportReplicationAction<Request extends ReplicationRequest, ReplicaRequest extends ReplicationRequest, Response extends ActionWriteResponse> extends TransportAction<Request, Response> {
|
|
|
|
|
|
+ public static final String SHARD_FAILURE_TIMEOUT = "action.support.replication.shard.failure_timeout";
|
|
|
+
|
|
|
protected final TransportService transportService;
|
|
|
protected final ClusterService clusterService;
|
|
|
protected final IndicesService indicesService;
|
|
@@ -88,6 +89,7 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
protected final WriteConsistencyLevel defaultWriteConsistencyLevel;
|
|
|
protected final TransportRequestOptions transportOptions;
|
|
|
protected final MappingUpdatedAction mappingUpdatedAction;
|
|
|
+ private final TimeValue shardFailedTimeout;
|
|
|
|
|
|
final String transportReplicaAction;
|
|
|
final String executor;
|
|
@@ -117,6 +119,8 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
this.transportOptions = transportOptions();
|
|
|
|
|
|
this.defaultWriteConsistencyLevel = WriteConsistencyLevel.fromString(settings.get("action.write_consistency", "quorum"));
|
|
|
+ // TODO: set a default timeout
|
|
|
+ shardFailedTimeout = settings.getAsTime(SHARD_FAILURE_TIMEOUT, null);
|
|
|
}
|
|
|
|
|
|
@Override
|
|
@@ -351,7 +355,6 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
private final AtomicBoolean finished = new AtomicBoolean(false);
|
|
|
private volatile Releasable indexShardReference;
|
|
|
|
|
|
-
|
|
|
PrimaryPhase(Request request, ActionListener<Response> listener) {
|
|
|
this.internalRequest = new InternalRequest(request);
|
|
|
this.listener = listener;
|
|
@@ -578,7 +581,7 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
PrimaryOperationRequest por = new PrimaryOperationRequest(primary.id(), internalRequest.concreteIndex(), internalRequest.request());
|
|
|
Tuple<Response, ReplicaRequest> primaryResponse = shardOperationOnPrimary(observer.observedState(), por);
|
|
|
logger.trace("operation completed on primary [{}]", primary);
|
|
|
- replicationPhase = new ReplicationPhase(shardsIt, primaryResponse.v2(), primaryResponse.v1(), observer, primary, internalRequest, listener, indexShardReference);
|
|
|
+ replicationPhase = new ReplicationPhase(shardsIt, primaryResponse.v2(), primaryResponse.v1(), observer, primary, internalRequest, listener, indexShardReference, shardFailedTimeout);
|
|
|
} catch (Throwable e) {
|
|
|
// shard has not been allocated yet, retry it here
|
|
|
if (retryPrimaryException(e)) {
|
|
@@ -687,7 +690,7 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
/**
|
|
|
* inner class is responsible for send the requests to all replica shards and manage the responses
|
|
|
*/
|
|
|
- final class ReplicationPhase extends AbstractRunnable implements ShardStateAction.Listener {
|
|
|
+ final class ReplicationPhase extends AbstractRunnable {
|
|
|
|
|
|
private final ReplicaRequest replicaRequest;
|
|
|
private final Response finalResponse;
|
|
@@ -702,6 +705,7 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
private final int totalShards;
|
|
|
private final ClusterStateObserver observer;
|
|
|
private final Releasable indexShardReference;
|
|
|
+ private final TimeValue shardFailedTimeout;
|
|
|
|
|
|
/**
|
|
|
* the constructor doesn't take any action, just calculates state. Call {@link #run()} to start
|
|
@@ -709,7 +713,8 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
*/
|
|
|
public ReplicationPhase(ShardIterator originalShardIt, ReplicaRequest replicaRequest, Response finalResponse,
|
|
|
ClusterStateObserver observer, ShardRouting originalPrimaryShard,
|
|
|
- InternalRequest internalRequest, ActionListener<Response> listener, Releasable indexShardReference) {
|
|
|
+ InternalRequest internalRequest, ActionListener<Response> listener, Releasable indexShardReference,
|
|
|
+ TimeValue shardFailedTimeout) {
|
|
|
this.replicaRequest = replicaRequest;
|
|
|
this.listener = listener;
|
|
|
this.finalResponse = finalResponse;
|
|
@@ -717,6 +722,7 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
this.observer = observer;
|
|
|
indexMetaData = observer.observedState().metaData().index(internalRequest.concreteIndex());
|
|
|
this.indexShardReference = indexShardReference;
|
|
|
+ this.shardFailedTimeout = shardFailedTimeout;
|
|
|
|
|
|
ShardRouting shard;
|
|
|
// we double check on the state, if it got changed we need to make sure we take the latest one cause
|
|
@@ -822,16 +828,6 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
forceFinishAsFailed(t);
|
|
|
}
|
|
|
|
|
|
- @Override
|
|
|
- public void onShardFailedNoMaster() {
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- public void onShardFailedFailure(DiscoveryNode master, TransportException e) {
|
|
|
-
|
|
|
- }
|
|
|
-
|
|
|
/**
|
|
|
* start sending current requests to replicas
|
|
|
*/
|
|
@@ -893,14 +889,14 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
|
|
|
@Override
|
|
|
public void handleException(TransportException exp) {
|
|
|
- onReplicaFailure(nodeId, exp);
|
|
|
logger.trace("[{}] transport failure during replica request [{}] ", exp, node, replicaRequest);
|
|
|
- if (ignoreReplicaException(exp) == false) {
|
|
|
+ if (ignoreReplicaException(exp)) {
|
|
|
+ onReplicaFailure(nodeId, exp);
|
|
|
+ } else {
|
|
|
logger.warn("{} failed to perform {} on node {}", exp, shardIt.shardId(), actionName, node);
|
|
|
- shardStateAction.shardFailed(shard, indexMetaData.getIndexUUID(), "failed to perform " + actionName + " on replica on node " + node, exp, ReplicationPhase.this);
|
|
|
+ shardStateAction.shardFailed(shard, indexMetaData.getIndexUUID(), "failed to perform " + actionName + " on replica on node " + node, exp, shardFailedTimeout, new ReplicationFailedShardStateListener(nodeId, exp));
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
});
|
|
|
} else {
|
|
|
try {
|
|
@@ -989,6 +985,33 @@ public abstract class TransportReplicationAction<Request extends ReplicationRequ
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ public class ReplicationFailedShardStateListener implements ShardStateAction.Listener {
|
|
|
+ private final String nodeId;
|
|
|
+ private Throwable failure;
|
|
|
+
|
|
|
+ public ReplicationFailedShardStateListener(String nodeId, Throwable failure) {
|
|
|
+ this.nodeId = nodeId;
|
|
|
+ this.failure = failure;
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void onSuccess() {
|
|
|
+ onReplicaFailure(nodeId, failure);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void onShardFailedNoMaster() {
|
|
|
+ onReplicaFailure(nodeId, failure);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void onShardFailedFailure(DiscoveryNode master, TransportException e) {
|
|
|
+ if (e instanceof ReceiveTimeoutTransportException) {
|
|
|
+ logger.trace("timeout sending shard failure to master [{}]", e, master);
|
|
|
+ }
|
|
|
+ onReplicaFailure(nodeId, failure);
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/**
|