Kaynağa Gözat

Improve Node Shutdown Observability (#78727)

* Increase node shutdown logging level when stalled

* Add allocation explanation to STALLED response

* Adjust logging levels per review

* Include all SingleNodeShutdownMetadata fields in logs

* Depluralize `node_shutdown_decisions` per review

* Repluralize node_allocation_decisions when we're actually reading from the Allocation Explain API
Gordon Brown 4 yıl önce
ebeveyn
işleme
65d2b0909f

+ 35 - 3
server/src/main/java/org/elasticsearch/cluster/metadata/ShutdownShardMigrationStatus.java

@@ -8,6 +8,8 @@
 
 package org.elasticsearch.cluster.metadata;
 
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -20,25 +22,42 @@ import java.io.IOException;
 import java.util.Objects;
 
 public class ShutdownShardMigrationStatus implements Writeable, ToXContentObject {
+    private static final Version ALLOCATION_DECISION_ADDED_VERSION = Version.V_8_0_0;
 
     private final SingleNodeShutdownMetadata.Status status;
     private final long shardsRemaining;
     @Nullable private final String explanation;
+    @Nullable private final ShardAllocationDecision allocationDecision;
 
     public ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status status, long shardsRemaining) {
-        this(status, shardsRemaining, null);
+        this(status, shardsRemaining, null, null);
     }
 
     public ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status status, long shardsRemaining, @Nullable String explanation) {
+        this(status, shardsRemaining, explanation, null);
+    }
+
+    public ShutdownShardMigrationStatus(
+        SingleNodeShutdownMetadata.Status status,
+        long shardsRemaining,
+        @Nullable String explanation,
+        @Nullable ShardAllocationDecision allocationDecision
+    ) {
         this.status = Objects.requireNonNull(status, "status must not be null");
         this.shardsRemaining = shardsRemaining;
         this.explanation = explanation;
+        this.allocationDecision = allocationDecision;
     }
 
     public ShutdownShardMigrationStatus(StreamInput in) throws IOException {
         this.status = in.readEnum(SingleNodeShutdownMetadata.Status.class);
         this.shardsRemaining = in.readLong();
         this.explanation = in.readOptionalString();
+        if (in.getVersion().onOrAfter(ALLOCATION_DECISION_ADDED_VERSION)) {
+            this.allocationDecision = in.readOptionalWriteable(ShardAllocationDecision::new);
+        } else {
+            this.allocationDecision = null;
+        }
     }
 
     public long getShardsRemaining() {
@@ -61,6 +80,13 @@ public class ShutdownShardMigrationStatus implements Writeable, ToXContentObject
         if (Objects.nonNull(explanation)) {
             builder.field("explanation", explanation);
         }
+        if (Objects.nonNull(allocationDecision)) {
+            builder.startObject("node_allocation_decision");
+            {
+                allocationDecision.toXContent(builder, params);
+            }
+            builder.endObject();
+        }
         builder.endObject();
         return builder;
     }
@@ -70,6 +96,9 @@ public class ShutdownShardMigrationStatus implements Writeable, ToXContentObject
         out.writeEnum(status);
         out.writeLong(shardsRemaining);
         out.writeOptionalString(explanation);
+        if (out.getVersion().onOrAfter(ALLOCATION_DECISION_ADDED_VERSION)) {
+            out.writeOptionalWriteable(allocationDecision);
+        }
     }
 
     @Override
@@ -77,12 +106,15 @@ public class ShutdownShardMigrationStatus implements Writeable, ToXContentObject
         if (this == o) return true;
         if ((o instanceof ShutdownShardMigrationStatus) == false) return false;
         ShutdownShardMigrationStatus that = (ShutdownShardMigrationStatus) o;
-        return shardsRemaining == that.shardsRemaining && status == that.status && Objects.equals(explanation, that.explanation);
+        return shardsRemaining == that.shardsRemaining
+            && status == that.status
+            && Objects.equals(explanation, that.explanation)
+            && Objects.equals(allocationDecision, that.allocationDecision);
     }
 
     @Override
     public int hashCode() {
-        return Objects.hash(status, shardsRemaining, explanation);
+        return Objects.hash(status, shardsRemaining, explanation, allocationDecision);
     }
 
     @Override

+ 28 - 0
server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java

@@ -256,6 +256,34 @@ public class SingleNodeShutdownMetadata extends AbstractDiffable<SingleNodeShutd
         );
     }
 
+    @Override public String toString() {
+        final StringBuilder stringBuilder = new StringBuilder();
+        stringBuilder
+            .append("{")
+            .append("nodeId=[")
+            .append(nodeId)
+            .append(']')
+            .append(", type=[")
+            .append(type)
+            .append("], reason=[")
+            .append(reason)
+            .append(']');
+        if (allocationDelay != null) {
+            stringBuilder
+                .append(", allocationDelay=[")
+                .append(allocationDelay)
+                .append("]");
+        }
+        if (targetNodeName != null) {
+            stringBuilder
+                .append(", targetNodeName=[")
+                .append(targetNodeName)
+                .append("]");
+        }
+        stringBuilder.append("}");
+        return stringBuilder.toString();
+    }
+
     public static Builder builder() {
         return new Builder();
     }

+ 2 - 0
x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportDeleteShutdownNodeAction.java

@@ -71,6 +71,8 @@ public class TransportDeleteShutdownNodeAction extends AcknowledgedTransportMast
             public ClusterState execute(ClusterState currentState) throws Exception {
                 NodesShutdownMetadata currentShutdownMetadata = currentState.metadata().custom(NodesShutdownMetadata.TYPE);
 
+                logger.info("removing shutdown record for node [{}]", request.getNodeId());
+
                 return ClusterState.builder(currentState)
                     .metadata(
                         Metadata.builder(currentState.metadata())

+ 12 - 12
x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java

@@ -262,17 +262,15 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
                 return hasShardCopyOnOtherNode == false;
             })
             .peek(pair -> {
-                if (logger.isTraceEnabled()) { // don't serialize the decision unless we have to
-                    logger.trace(
-                        "node [{}] shutdown of type [{}] stalled: found shard [{}][{}] from index [{}] with negative decision: [{}]",
-                        nodeId,
-                        shutdownType,
-                        pair.v1().getId(),
-                        pair.v1().primary() ? "primary" : "replica",
-                        pair.v1().shardId().getIndexName(),
-                        Strings.toString(pair.v2())
-                    );
-                }
+                logger.debug(
+                    "node [{}] shutdown of type [{}] stalled: found shard [{}][{}] from index [{}] with negative decision: [{}]",
+                    nodeId,
+                    shutdownType,
+                    pair.v1().getId(),
+                    pair.v1().primary() ? "primary" : "replica",
+                    pair.v1().shardId().getIndexName(),
+                    Strings.toString(pair.v2())
+                );
             })
             .findFirst();
 
@@ -287,6 +285,7 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
         } else if (unmovableShard.isPresent()) {
             // We found a shard that can't be moved, so shard relocation is stalled. Blame the unmovable shard.
             ShardRouting shardRouting = unmovableShard.get().v1();
+            ShardAllocationDecision decision = unmovableShard.get().v2();
 
             return new ShutdownShardMigrationStatus(
                 SingleNodeShutdownMetadata.Status.STALLED,
@@ -296,7 +295,8 @@ public class TransportGetShutdownStatusAction extends TransportMasterNodeAction<
                     shardRouting.shardId().getId(),
                     shardRouting.primary() ? "primary" : "replica",
                     shardRouting.index().getName()
-                ).getFormattedMessage()
+                ).getFormattedMessage(),
+                decision
             );
         } else {
             return new ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status.IN_PROGRESS, totalRemainingShards);

+ 8 - 13
x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java

@@ -69,19 +69,6 @@ public class TransportPutShutdownNodeAction extends AcknowledgedTransportMasterN
                     currentShutdownMetadata = new NodesShutdownMetadata(new HashMap<>());
                 }
 
-                // Verify that there's not already a shutdown metadata for this node
-                SingleNodeShutdownMetadata existingRecord = currentShutdownMetadata.getAllNodeMetadataMap().get(request.getNodeId());
-                if (existingRecord != null) {
-                    logger.info(
-                        "updating existing shutdown record for node [{}] of type [{}] with reason [{}] with new type [{}] and reason [{}]",
-                        existingRecord.getNodeId(),
-                        existingRecord.getType(),
-                        existingRecord.getReason(),
-                        request.getType(),
-                        request.getReason()
-                    );
-                }
-
                 final boolean nodeSeen = currentState.getNodes().nodeExists(request.getNodeId());
 
                 SingleNodeShutdownMetadata newNodeMetadata = SingleNodeShutdownMetadata.builder()
@@ -94,6 +81,14 @@ public class TransportPutShutdownNodeAction extends AcknowledgedTransportMasterN
                     .setTargetNodeName(request.getTargetNodeName())
                     .build();
 
+                // Log whether this request replaces an existing shutdown record for this node or creates a new one
+                SingleNodeShutdownMetadata existingRecord = currentShutdownMetadata.getAllNodeMetadataMap().get(request.getNodeId());
+                if (existingRecord != null) {
+                    logger.info("updating existing shutdown record {} with new record {}", existingRecord, newNodeMetadata);
+                } else {
+                    logger.info("creating shutdown record {}", newNodeMetadata);
+                }
+
                 return ClusterState.builder(currentState)
                     .metadata(
                         Metadata.builder(currentState.metadata())