|
@@ -34,6 +34,7 @@ import org.elasticsearch.common.xcontent.ToXContent;
|
|
|
import org.elasticsearch.common.xcontent.XContentBuilder;
|
|
|
|
|
|
import java.io.IOException;
|
|
|
+import java.util.concurrent.TimeUnit;
|
|
|
|
|
|
/**
|
|
|
* Holds additional information as to why the shard is in unassigned state.
|
|
@@ -103,21 +104,24 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
}
|
|
|
|
|
|
private final Reason reason;
|
|
|
- private final long timestamp;
|
|
|
+ private final long unassignedTimeMillis; // used for display and log messages, in milliseconds
|
|
|
+ private final long unassignedTimeNanos; // in nanoseconds, used to calculate delay for delayed shard allocation
|
|
|
+ private long lastComputedLeftDelayNanos = 0l; // how long to delay shard allocation, not serialized (always positive, 0 means no delay)
|
|
|
private final String message;
|
|
|
private final Throwable failure;
|
|
|
|
|
|
public UnassignedInfo(Reason reason, String message) {
|
|
|
- this(reason, System.currentTimeMillis(), message, null);
|
|
|
+ this(reason, System.currentTimeMillis(), System.nanoTime(), message, null);
|
|
|
}
|
|
|
|
|
|
public UnassignedInfo(Reason reason, @Nullable String message, @Nullable Throwable failure) {
|
|
|
- this(reason, System.currentTimeMillis(), message, failure);
|
|
|
+ this(reason, System.currentTimeMillis(), System.nanoTime(), message, failure);
|
|
|
}
|
|
|
|
|
|
- private UnassignedInfo(Reason reason, long timestamp, String message, Throwable failure) {
|
|
|
+ private UnassignedInfo(Reason reason, long unassignedTimeMillis, long timestampNanos, String message, Throwable failure) {
|
|
|
this.reason = reason;
|
|
|
- this.timestamp = timestamp;
|
|
|
+ this.unassignedTimeMillis = unassignedTimeMillis;
|
|
|
+ this.unassignedTimeNanos = timestampNanos;
|
|
|
this.message = message;
|
|
|
this.failure = failure;
|
|
|
assert !(message == null && failure != null) : "provide a message if a failure exception is provided";
|
|
@@ -125,14 +129,18 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
|
|
|
UnassignedInfo(StreamInput in) throws IOException {
|
|
|
this.reason = Reason.values()[(int) in.readByte()];
|
|
|
- this.timestamp = in.readLong();
|
|
|
+ this.unassignedTimeMillis = in.readLong();
|
|
|
+ // As System.nanoTime() cannot be compared across different JVMs, reset it to now.
|
|
|
+ // This means that in master failover situations, elapsed delay time is forgotten.
|
|
|
+ this.unassignedTimeNanos = System.nanoTime();
|
|
|
this.message = in.readOptionalString();
|
|
|
this.failure = in.readThrowable();
|
|
|
}
|
|
|
|
|
|
public void writeTo(StreamOutput out) throws IOException {
|
|
|
out.writeByte((byte) reason.ordinal());
|
|
|
- out.writeLong(timestamp);
|
|
|
+ out.writeLong(unassignedTimeMillis);
|
|
|
+ // Do not serialize unassignedTimeNanos as System.nanoTime() cannot be compared across different JVMs
|
|
|
out.writeOptionalString(message);
|
|
|
out.writeThrowable(failure);
|
|
|
}
|
|
@@ -149,13 +157,20 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * The timestamp in milliseconds since epoch. Note, we use timestamp here since
|
|
|
- * we want to make sure its preserved across node serializations. Extra care need
|
|
|
- * to be made if its used to calculate diff (handle negative values) in case of
|
|
|
- * time drift.
|
|
|
+ * The timestamp in milliseconds when the shard became unassigned, based on System.currentTimeMillis().
|
|
|
+ * Note, we use timestamp here since we want to make sure its preserved across node serializations.
|
|
|
*/
|
|
|
- public long getTimestampInMillis() {
|
|
|
- return this.timestamp;
|
|
|
+ public long getUnassignedTimeInMillis() {
|
|
|
+ return this.unassignedTimeMillis;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * The timestamp in nanoseconds when the shard became unassigned, based on System.nanoTime().
|
|
|
+ * Used to calculate the delay for delayed shard allocation.
|
|
|
+ * ONLY EXPOSED FOR TESTS!
|
|
|
+ */
|
|
|
+ public long getUnassignedTimeInNanos() {
|
|
|
+ return this.unassignedTimeNanos;
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -186,7 +201,7 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * The allocation delay value associated with the index (defaulting to node settings if not set).
|
|
|
+ * The allocation delay value in milliseconds associated with the index (defaulting to node settings if not set).
|
|
|
*/
|
|
|
public long getAllocationDelayTimeoutSetting(Settings settings, Settings indexSettings) {
|
|
|
if (reason != Reason.NODE_LEFT) {
|
|
@@ -197,31 +212,40 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * The time in millisecond until this unassigned shard can be reassigned.
|
|
|
+ * The delay in nanoseconds until this unassigned shard can be reassigned. This value is cached and might be slightly out-of-date.
|
|
|
+ * See also the {@link #updateDelay(long, Settings, Settings)} method.
|
|
|
*/
|
|
|
- public long getDelayAllocationExpirationIn(long unassignedShardsAllocatedTimestamp, Settings settings, Settings indexSettings) {
|
|
|
- long delayTimeout = getAllocationDelayTimeoutSetting(settings, indexSettings);
|
|
|
- if (delayTimeout == 0) {
|
|
|
- return 0;
|
|
|
- }
|
|
|
- long delta = unassignedShardsAllocatedTimestamp - timestamp;
|
|
|
- // account for time drift, treat it as no timeout
|
|
|
- if (delta < 0) {
|
|
|
- return 0;
|
|
|
- }
|
|
|
- return delayTimeout - delta;
|
|
|
+ public long getLastComputedLeftDelayNanos() {
|
|
|
+ return lastComputedLeftDelayNanos;
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Updates delay left based on current time (in nanoseconds) and index/node settings.
|
|
|
+ * Should only be called from ReplicaShardAllocator.
|
|
|
+ * @return updated delay in nanoseconds
|
|
|
+ */
|
|
|
+ public long updateDelay(long nanoTimeNow, Settings settings, Settings indexSettings) {
|
|
|
+ long delayTimeoutMillis = getAllocationDelayTimeoutSetting(settings, indexSettings);
|
|
|
+ final long newComputedLeftDelayNanos;
|
|
|
+ if (delayTimeoutMillis == 0l) {
|
|
|
+ newComputedLeftDelayNanos = 0l;
|
|
|
+ } else {
|
|
|
+ assert nanoTimeNow >= unassignedTimeNanos;
|
|
|
+ long delayTimeoutNanos = TimeUnit.NANOSECONDS.convert(delayTimeoutMillis, TimeUnit.MILLISECONDS);
|
|
|
+ newComputedLeftDelayNanos = Math.max(0l, delayTimeoutNanos - (nanoTimeNow - unassignedTimeNanos));
|
|
|
+ }
|
|
|
+ lastComputedLeftDelayNanos = newComputedLeftDelayNanos;
|
|
|
+ return newComputedLeftDelayNanos;
|
|
|
+ }
|
|
|
|
|
|
/**
|
|
|
* Returns the number of shards that are unassigned and currently being delayed.
|
|
|
*/
|
|
|
- public static int getNumberOfDelayedUnassigned(long unassignedShardsAllocatedTimestamp, Settings settings, ClusterState state) {
|
|
|
+ public static int getNumberOfDelayedUnassigned(ClusterState state) {
|
|
|
int count = 0;
|
|
|
for (ShardRouting shard : state.routingTable().shardsWithState(ShardRoutingState.UNASSIGNED)) {
|
|
|
if (shard.primary() == false) {
|
|
|
- IndexMetaData indexMetaData = state.metaData().index(shard.getIndex());
|
|
|
- long delay = shard.unassignedInfo().getDelayAllocationExpirationIn(unassignedShardsAllocatedTimestamp, settings, indexMetaData.getSettings());
|
|
|
+ long delay = shard.unassignedInfo().getLastComputedLeftDelayNanos();
|
|
|
if (delay > 0) {
|
|
|
count++;
|
|
|
}
|
|
@@ -231,15 +255,16 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Finds the smallest delay expiration setting of an unassigned shard. Returns 0 if there are none.
|
|
|
+ * Finds the smallest delay expiration setting in milliseconds of all unassigned shards that are still delayed. Returns 0 if there are none.
|
|
|
*/
|
|
|
public static long findSmallestDelayedAllocationSetting(Settings settings, ClusterState state) {
|
|
|
long nextDelaySetting = Long.MAX_VALUE;
|
|
|
for (ShardRouting shard : state.routingTable().shardsWithState(ShardRoutingState.UNASSIGNED)) {
|
|
|
if (shard.primary() == false) {
|
|
|
IndexMetaData indexMetaData = state.metaData().index(shard.getIndex());
|
|
|
+ long leftDelayNanos = shard.unassignedInfo().getLastComputedLeftDelayNanos();
|
|
|
long delayTimeoutSetting = shard.unassignedInfo().getAllocationDelayTimeoutSetting(settings, indexMetaData.getSettings());
|
|
|
- if (delayTimeoutSetting > 0 && delayTimeoutSetting < nextDelaySetting) {
|
|
|
+ if (leftDelayNanos > 0 && delayTimeoutSetting > 0 && delayTimeoutSetting < nextDelaySetting) {
|
|
|
nextDelaySetting = delayTimeoutSetting;
|
|
|
}
|
|
|
}
|
|
@@ -249,14 +274,13 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
|
|
|
|
|
|
/**
|
|
|
- * Finds the next (closest) delay expiration of an unassigned shard. Returns 0 if there are none.
|
|
|
+ * Finds the next (closest) delay expiration of an unassigned shard in nanoseconds. Returns 0 if there are none.
|
|
|
*/
|
|
|
- public static long findNextDelayedAllocationIn(long unassignedShardsAllocatedTimestamp, Settings settings, ClusterState state) {
|
|
|
+ public static long findNextDelayedAllocationIn(ClusterState state) {
|
|
|
long nextDelay = Long.MAX_VALUE;
|
|
|
for (ShardRouting shard : state.routingTable().shardsWithState(ShardRoutingState.UNASSIGNED)) {
|
|
|
if (shard.primary() == false) {
|
|
|
- IndexMetaData indexMetaData = state.metaData().index(shard.getIndex());
|
|
|
- long nextShardDelay = shard.unassignedInfo().getDelayAllocationExpirationIn(unassignedShardsAllocatedTimestamp, settings, indexMetaData.getSettings());
|
|
|
+ long nextShardDelay = shard.unassignedInfo().getLastComputedLeftDelayNanos();
|
|
|
if (nextShardDelay > 0 && nextShardDelay < nextDelay) {
|
|
|
nextDelay = nextShardDelay;
|
|
|
}
|
|
@@ -268,7 +292,7 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
public String shortSummary() {
|
|
|
StringBuilder sb = new StringBuilder();
|
|
|
sb.append("[reason=").append(reason).append("]");
|
|
|
- sb.append(", at[").append(DATE_TIME_FORMATTER.printer().print(timestamp)).append("]");
|
|
|
+ sb.append(", at[").append(DATE_TIME_FORMATTER.printer().print(unassignedTimeMillis)).append("]");
|
|
|
String details = getDetails();
|
|
|
if (details != null) {
|
|
|
sb.append(", details[").append(details).append("]");
|
|
@@ -285,7 +309,7 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
|
|
builder.startObject("unassigned_info");
|
|
|
builder.field("reason", reason);
|
|
|
- builder.field("at", DATE_TIME_FORMATTER.printer().print(timestamp));
|
|
|
+ builder.field("at", DATE_TIME_FORMATTER.printer().print(unassignedTimeMillis));
|
|
|
String details = getDetails();
|
|
|
if (details != null) {
|
|
|
builder.field("details", details);
|
|
@@ -301,7 +325,7 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
|
|
|
UnassignedInfo that = (UnassignedInfo) o;
|
|
|
|
|
|
- if (timestamp != that.timestamp) return false;
|
|
|
+ if (unassignedTimeMillis != that.unassignedTimeMillis) return false;
|
|
|
if (reason != that.reason) return false;
|
|
|
if (message != null ? !message.equals(that.message) : that.message != null) return false;
|
|
|
return !(failure != null ? !failure.equals(that.failure) : that.failure != null);
|
|
@@ -311,7 +335,7 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
|
|
|
@Override
|
|
|
public int hashCode() {
|
|
|
int result = reason != null ? reason.hashCode() : 0;
|
|
|
- result = 31 * result + Long.hashCode(timestamp);
|
|
|
+ result = 31 * result + Long.hashCode(unassignedTimeMillis);
|
|
|
result = 31 * result + (message != null ? message.hashCode() : 0);
|
|
|
result = 31 * result + (failure != null ? failure.hashCode() : 0);
|
|
|
return result;
|