|
|
@@ -20,9 +20,12 @@ import org.elasticsearch.core.TimeValue;
|
|
|
|
|
|
import java.io.PrintWriter;
|
|
|
import java.io.StringWriter;
|
|
|
+import java.util.Collection;
|
|
|
+import java.util.HashSet;
|
|
|
import java.util.List;
|
|
|
import java.util.Locale;
|
|
|
import java.util.Objects;
|
|
|
+import java.util.Set;
|
|
|
import java.util.concurrent.TimeUnit;
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
@@ -40,6 +43,7 @@ import java.util.stream.Collectors;
|
|
|
*/
|
|
|
public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
private final ClusterService clusterService;
|
|
|
+ private final Coordinator coordinator;
|
|
|
private final MasterHistoryService masterHistoryService;
|
|
|
/**
|
|
|
* This is the amount of time we use to make the initial decision -- have we seen a master node in the very recent past?
|
|
|
@@ -88,8 +92,13 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
Setting.Property.NodeScope
|
|
|
);
|
|
|
|
|
|
- public CoordinationDiagnosticsService(ClusterService clusterService, MasterHistoryService masterHistoryService) {
|
|
|
+ public CoordinationDiagnosticsService(
|
|
|
+ ClusterService clusterService,
|
|
|
+ Coordinator coordinator,
|
|
|
+ MasterHistoryService masterHistoryService
|
|
|
+ ) {
|
|
|
this.clusterService = clusterService;
|
|
|
+ this.coordinator = coordinator;
|
|
|
this.masterHistoryService = masterHistoryService;
|
|
|
this.nodeHasMasterLookupTimeframe = NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.get(clusterService.getSettings());
|
|
|
this.unacceptableNullTransitions = NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.get(clusterService.getSettings());
|
|
|
@@ -156,7 +165,7 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
masterChanges,
|
|
|
localMasterHistory.getMaxHistoryAge()
|
|
|
);
|
|
|
- CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory);
|
|
|
+ CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null);
|
|
|
return new CoordinationDiagnosticsResult(coordinationDiagnosticsStatus, summary, details);
|
|
|
}
|
|
|
|
|
|
@@ -172,13 +181,17 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
* @return An empty CoordinationDiagnosticsDetails if explain is false, otherwise a CoordinationDiagnosticsDetails containing only
|
|
|
* "current_master" and "recent_masters"
|
|
|
*/
|
|
|
- private CoordinationDiagnosticsDetails getDetails(boolean explain, MasterHistory localMasterHistory) {
|
|
|
+ private CoordinationDiagnosticsDetails getDetails(
|
|
|
+ boolean explain,
|
|
|
+ MasterHistory localMasterHistory,
|
|
|
+ @Nullable String clusterFormationMessage
|
|
|
+ ) {
|
|
|
if (explain == false) {
|
|
|
return CoordinationDiagnosticsDetails.EMPTY;
|
|
|
}
|
|
|
DiscoveryNode masterNode = localMasterHistory.getMostRecentMaster();
|
|
|
List<DiscoveryNode> recentNonNullMasters = localMasterHistory.getNodes().stream().filter(Objects::nonNull).toList();
|
|
|
- return new CoordinationDiagnosticsDetails(masterNode, recentNonNullMasters);
|
|
|
+ return new CoordinationDiagnosticsDetails(masterNode, recentNonNullMasters, null, null, clusterFormationMessage);
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
@@ -267,7 +280,7 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
private CoordinationDiagnosticsResult getMasterIsStableResult(boolean explain, MasterHistory localMasterHistory) {
|
|
|
String summary = "The cluster has a stable master node";
|
|
|
logger.trace("The cluster has a stable master node");
|
|
|
- CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory);
|
|
|
+ CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null);
|
|
|
return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.GREEN, summary, details);
|
|
|
}
|
|
|
|
|
|
@@ -278,21 +291,95 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
* @return The CoordinationDiagnosticsResult for the given localMasterHistory
|
|
|
*/
|
|
|
private CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecently(MasterHistory localMasterHistory, boolean explain) {
|
|
|
- // NOTE: The logic in this method will be implemented in a future PR
|
|
|
- String summary = "No master has been observed recently";
|
|
|
- CoordinationDiagnosticsDetails details = CoordinationDiagnosticsDetails.EMPTY;
|
|
|
+ Collection<DiscoveryNode> masterEligibleNodes = getMasterEligibleNodes();
|
|
|
+ final CoordinationDiagnosticsResult result;
|
|
|
+ boolean leaderHasBeenElected = coordinator.getPeerFinder().getLeader().isPresent();
|
|
|
+ if (masterEligibleNodes.isEmpty() && leaderHasBeenElected == false) {
|
|
|
+ result = getResultOnNoMasterEligibleNodes(localMasterHistory, explain);
|
|
|
+ } else if (leaderHasBeenElected) {
|
|
|
+ DiscoveryNode currentMaster = coordinator.getPeerFinder().getLeader().get();
|
|
|
+ result = getResultOnCannotJoinLeader(localMasterHistory, currentMaster, explain);
|
|
|
+ } else {
|
|
|
+ // NOTE: The logic in this block will be implemented in a future PR
|
|
|
+ result = new CoordinationDiagnosticsResult(
|
|
|
+ CoordinationDiagnosticsStatus.RED,
|
|
|
+ "No master has been observed recently",
|
|
|
+ CoordinationDiagnosticsDetails.EMPTY
|
|
|
+ );
|
|
|
+ }
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Creates a CoordinationDiagnosticsResult in the case that there has been no master in the last few seconds, there is no elected
|
|
|
+ * master known, and there are no master eligible nodes. The status will be RED, and the details (if explain is true) will contain
|
|
|
+ * the list of any masters seen previously and a description of known problems from this node's Coordinator.
|
|
|
+ * @param localMasterHistory Used to pull recent master nodes for the details if explain is true
|
|
|
+ * @param explain If true, details are returned
|
|
|
+ * @return A CoordinationDiagnosticsResult with a RED status
|
|
|
+ */
|
|
|
+ private CoordinationDiagnosticsResult getResultOnNoMasterEligibleNodes(MasterHistory localMasterHistory, boolean explain) {
|
|
|
+ String summary = "No master eligible nodes found in the cluster";
|
|
|
+ CoordinationDiagnosticsDetails details = getDetails(
|
|
|
+ explain,
|
|
|
+ localMasterHistory,
|
|
|
+ coordinator.getClusterFormationState().getDescription()
|
|
|
+ );
|
|
|
+ return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.RED, summary, details);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Creates a CoordinationDiagnosticsResult in the case that there has been no master in the last few seconds in this node's cluster
|
|
|
+ * state, but PeerFinder reports that there is an elected master. The assumption is that this node is having a problem joining the
|
|
|
+ * elected master. The status will be RED, and the details (if explain is true) will contain the list of any masters seen previously
|
|
|
+ * and a description of known problems from this node's Coordinator.
|
|
|
+ * @param localMasterHistory Used to pull recent master nodes for the details if explain is true
|
|
|
+ * @param currentMaster The node that PeerFinder reports as the elected master
|
|
|
+ * @param explain If true, details are returned
|
|
|
+ * @return A CoordinationDiagnosticsResult with a RED status
|
|
|
+ */
|
|
|
+ private CoordinationDiagnosticsResult getResultOnCannotJoinLeader(
|
|
|
+ MasterHistory localMasterHistory,
|
|
|
+ DiscoveryNode currentMaster,
|
|
|
+ boolean explain
|
|
|
+ ) {
|
|
|
+ String summary = String.format(
|
|
|
+ Locale.ROOT,
|
|
|
+ "%s has been elected master, but the node being queried, %s, is unable to join it",
|
|
|
+ currentMaster,
|
|
|
+ clusterService.localNode()
|
|
|
+ );
|
|
|
+ CoordinationDiagnosticsDetails details = getDetails(
|
|
|
+ explain,
|
|
|
+ localMasterHistory,
|
|
|
+ coordinator.getClusterFormationState().getDescription()
|
|
|
+ );
|
|
|
return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.RED, summary, details);
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Returns the master eligible nodes as found in this node's Coordinator, plus the local node if it is master eligible.
|
|
|
+ * @return All known master eligible nodes in this cluster
|
|
|
+ */
|
|
|
+ private Collection<DiscoveryNode> getMasterEligibleNodes() {
|
|
|
+ Set<DiscoveryNode> masterEligibleNodes = new HashSet<>();
|
|
|
+ coordinator.getFoundPeers().forEach(node -> {
|
|
|
+ if (node.isMasterNode()) {
|
|
|
+ masterEligibleNodes.add(node);
|
|
|
+ }
|
|
|
+ });
|
|
|
+ // Coordinator does not report the local node, so add it:
|
|
|
+ if (clusterService.localNode().isMasterNode()) {
|
|
|
+ masterEligibleNodes.add(clusterService.localNode());
|
|
|
+ }
|
|
|
+ return masterEligibleNodes;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* This returns true if this node has seen a master node within the last few seconds
|
|
|
* @return true if this node has seen a master node within the last few seconds, false otherwise
|
|
|
*/
|
|
|
private boolean hasSeenMasterInHasMasterLookupTimeframe() {
|
|
|
- // If there is currently a master, there's no point in looking at the history:
|
|
|
- if (clusterService.state().nodes().getMasterNode() != null) {
|
|
|
- return true;
|
|
|
- }
|
|
|
return masterHistoryService.getLocalMasterHistory().hasSeenMasterInLastNSeconds((int) nodeHasMasterLookupTimeframe.seconds());
|
|
|
}
|
|
|
|
|
|
@@ -337,16 +424,17 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
public record CoordinationDiagnosticsDetails(
|
|
|
DiscoveryNode currentMaster,
|
|
|
List<DiscoveryNode> recentMasters,
|
|
|
- String remoteExceptionMessage,
|
|
|
- String remoteExceptionStackTrace
|
|
|
+ @Nullable String remoteExceptionMessage,
|
|
|
+ @Nullable String remoteExceptionStackTrace,
|
|
|
+ @Nullable String clusterFormationDescription
|
|
|
) {
|
|
|
|
|
|
public CoordinationDiagnosticsDetails(DiscoveryNode currentMaster, List<DiscoveryNode> recentMasters) {
|
|
|
- this(currentMaster, recentMasters, null, null);
|
|
|
+ this(currentMaster, recentMasters, null, null, null);
|
|
|
}
|
|
|
|
|
|
public CoordinationDiagnosticsDetails(DiscoveryNode currentMaster, Exception remoteException) {
|
|
|
- this(currentMaster, null, remoteException == null ? null : remoteException.getMessage(), getStackTrace(remoteException));
|
|
|
+ this(currentMaster, null, remoteException == null ? null : remoteException.getMessage(), getStackTrace(remoteException), null);
|
|
|
}
|
|
|
|
|
|
private static String getStackTrace(Exception e) {
|
|
|
@@ -358,6 +446,6 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
|
|
|
return stringWriter.toString();
|
|
|
}
|
|
|
|
|
|
- public static final CoordinationDiagnosticsDetails EMPTY = new CoordinationDiagnosticsDetails(null, null, null, null);
|
|
|
+ public static final CoordinationDiagnosticsDetails EMPTY = new CoordinationDiagnosticsDetails(null, null, null, null, null);
|
|
|
}
|
|
|
}
|