Преглед изворни кода

Adding logic to master_is_stable indicator to check for discovery problems (#88020)

Keith Massey пре 3 година
родитељ
комит
41d7280295

+ 5 - 0
docs/changelog/88020.yaml

@@ -0,0 +1,5 @@
+pr: 88020
+summary: Adding logic to `master_is_stable` indicator to check for discovery problems
+area: Health
+type: enhancement
+issues: []

+ 54 - 1
server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java

@@ -46,6 +46,7 @@ import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.ToXContentObject;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.json.JsonXContent;
+import org.junit.Before;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -69,9 +70,14 @@ import static org.hamcrest.Matchers.equalTo;
  * Tests relating to the loss of the master, but which work with the default fault detection settings which are rather lenient and will
  * not detect a master failure too quickly.
  */
-@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
+@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false)
 public class StableMasterDisruptionIT extends ESIntegTestCase {
 
+    /**
+     * Runs before every test method (JUnit {@code @Before}) and sets the internal test cluster's bootstrap master node index to 0.
+     * NOTE(review): presumably required because this class is annotated with {@code autoManageMasterNodes = false}, which would
+     * leave cluster bootstrapping to the test itself -- confirm against the test framework.
+     */
+    @Before
+    private void setBootstrapMasterNodeIndex() {
+        internalCluster().setBootstrapMasterNodeIndex(0);
+    }
+
     @Override
     protected Collection<Class<? extends Plugin>> nodePlugins() {
         return Collections.singletonList(MockTransportService.TestPlugin.class);
@@ -560,4 +566,51 @@ public class StableMasterDisruptionIT extends ESIntegTestCase {
             "has been elected master, but the node being queried"
         );
     }
+
+    /**
+     * Starts three master-only nodes and two data-only nodes, partitions the two non-elected master-eligible nodes from each other,
+     * stops the elected master, and then asserts via assertMasterStability that each remaining master-eligible node reports a RED
+     * master stability status with the "unable to form a quorum" message.
+     */
+    public void testNoQuorum() throws Exception {
+        /*
+         * In this test we have three master-eligible nodes and two data nodes. We make it so that the two non-active master-eligible
+         * nodes cannot communicate with each other, and then we stop the active master node. Now there is no quorum, so a new master
+         * cannot be elected. We set the master lookup timeframe very low on all nodes, so when we run the master stability check on
+         * each of the remaining master-eligible nodes, it sees that there has been no master recently and, because there is no
+         * quorum, it returns a RED status.
+         */
+        // All nodes are started with exactly the same settings, so build them once instead of repeating the builder chain.
+        final Settings nodeSettings = Settings.builder()
+            .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
+            .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
+            .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
+            .put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO)
+            .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS))
+            .build();
+        final List<String> masterNodes = internalCluster().startMasterOnlyNodes(3, nodeSettings);
+        final List<String> dataNodes = internalCluster().startDataOnlyNodes(2, nodeSettings);
+        ensureStableCluster(5);
+        String firstMasterNode = internalCluster().getMasterName();
+        List<String> nonActiveMasterNodes = masterNodes.stream().filter(nodeName -> firstMasterNode.equals(nodeName) == false).toList();
+        // Each side of the partition gets one non-elected master-eligible node and one data node.
+        NetworkDisruption networkDisconnect = new NetworkDisruption(
+            new NetworkDisruption.TwoPartitions(
+                Set.of(nonActiveMasterNodes.get(0), dataNodes.get(0)),
+                Set.of(nonActiveMasterNodes.get(1), dataNodes.get(1))
+            ),
+            NetworkDisruption.UNRESPONSIVE
+        );
+
+        internalCluster().clearDisruptionScheme();
+        setDisruptionScheme(networkDisconnect);
+        networkDisconnect.startDisrupting();
+        // Stop the elected master only after the partition is in place, so the two survivors cannot reach each other to elect a new one.
+        internalCluster().stopNode(firstMasterNode);
+        for (String nonActiveMasterNode : nonActiveMasterNodes) {
+            assertMasterStability(internalCluster().client(nonActiveMasterNode), HealthStatus.RED, "unable to form a quorum");
+        }
+    }
 }

+ 236 - 46
server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java

@@ -38,9 +38,11 @@ import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.StringWriter;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
@@ -93,7 +95,8 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
      * This field holds the results of the tasks in the clusterFormationInfoTasks field above. The field is accessed (reads/writes) from
      * multiple threads, but the reference itself is only ever changed on the cluster change event thread.
      */
-    private volatile ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses = null;
+    // Non-private for testing
+    volatile ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses = null;
 
     private static final Logger logger = LogManager.getLogger(CoordinationDiagnosticsService.class);
 
@@ -203,7 +206,7 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
             masterChanges,
             localMasterHistory.getMaxHistoryAge()
         );
-        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null);
+        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null, null);
         return new CoordinationDiagnosticsResult(coordinationDiagnosticsStatus, summary, details);
     }
 
@@ -219,17 +222,18 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
      * @return An empty CoordinationDiagnosticsDetails if explain is false, otherwise a CoordinationDiagnosticsDetails containing only
      * "current_master" and "recent_masters"
      */
-    private CoordinationDiagnosticsDetails getDetails(
+    private static CoordinationDiagnosticsDetails getDetails(
         boolean explain,
         MasterHistory localMasterHistory,
-        @Nullable String clusterFormationMessage
+        @Nullable Exception remoteException,
+        @Nullable Map<String, String> clusterFormationMessages
     ) {
         if (explain == false) {
             return CoordinationDiagnosticsDetails.EMPTY;
         }
         DiscoveryNode masterNode = localMasterHistory.getMostRecentMaster();
         List<DiscoveryNode> recentNonNullMasters = localMasterHistory.getNodes().stream().filter(Objects::nonNull).toList();
-        return new CoordinationDiagnosticsDetails(masterNode, recentNonNullMasters, null, null, clusterFormationMessage);
+        return new CoordinationDiagnosticsDetails(masterNode, recentNonNullMasters, remoteException, clusterFormationMessages);
     }
 
     /**
@@ -280,11 +284,7 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
                 localMasterHistory.getNodes().stream().filter(Objects::nonNull).collect(Collectors.toSet()),
                 localMasterHistory.getMaxHistoryAge()
             );
-            final CoordinationDiagnosticsDetails details = getDetailsOnMasterHasFlappedNull(
-                explain,
-                localMasterHistory,
-                remoteHistoryException
-            );
+            final CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, remoteHistoryException, null);
             return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.YELLOW, summary, details);
         } else {
             logger.trace("This node thinks the master is unstable, but the master node {} thinks it is stable", master);
@@ -292,25 +292,6 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
         }
     }
 
-    /**
-     * Returns the details for the calculateOnMasterHasFlappedNull method. This method populates the CoordinationDiagnosticsDetails
-     * with the currentMaster, and optionally the remoteExceptionMessage and remoteExceptionStackTrace.
-     * @param explain If false, nothing is calculated and CoordinationDiagnosticsDetails.EMPTY is returned
-     * @param localMasterHistory The localMasterHistory
-     * @param remoteHistoryException An exception that was found when retrieving the remote master history. Can be null
-     * @return The CoordinationDiagnosticsDetails
-     */
-    private CoordinationDiagnosticsDetails getDetailsOnMasterHasFlappedNull(
-        boolean explain,
-        MasterHistory localMasterHistory,
-        @Nullable Exception remoteHistoryException
-    ) {
-        if (explain == false) {
-            return CoordinationDiagnosticsDetails.EMPTY;
-        }
-        return new CoordinationDiagnosticsDetails(localMasterHistory.getMostRecentMaster(), remoteHistoryException);
-    }
-
     /**
      * Returns a CoordinationDiagnosticsResult for the case when the master is seen as stable
      * @return A CoordinationDiagnosticsResult for the case when the master is seen as stable (GREEN status, no impacts or details)
@@ -318,7 +299,7 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
     private CoordinationDiagnosticsResult getMasterIsStableResult(boolean explain, MasterHistory localMasterHistory) {
         String summary = "The cluster has a stable master node";
         logger.trace("The cluster has a stable master node");
-        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null);
+        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null, null);
         return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.GREEN, summary, details);
     }
 
@@ -331,23 +312,205 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
     private CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecently(MasterHistory localMasterHistory, boolean explain) {
         Collection<DiscoveryNode> masterEligibleNodes = getMasterEligibleNodes();
         final CoordinationDiagnosticsResult result;
-        boolean leaderHasBeenElected = coordinator.getPeerFinder().getLeader().isPresent();
-        if (masterEligibleNodes.isEmpty() && leaderHasBeenElected == false) {
+        boolean clusterHasLeader = coordinator.getPeerFinder().getLeader().isPresent();
+        boolean noLeaderAndNoMasters = clusterHasLeader == false && masterEligibleNodes.isEmpty();
+        boolean isLocalNodeMasterEligible = clusterService.localNode().isMasterNode();
+        if (noLeaderAndNoMasters) {
             result = getResultOnNoMasterEligibleNodes(localMasterHistory, explain);
-        } else if (leaderHasBeenElected) {
+        } else if (clusterHasLeader) {
             DiscoveryNode currentMaster = coordinator.getPeerFinder().getLeader().get();
             result = getResultOnCannotJoinLeader(localMasterHistory, currentMaster, explain);
-        } else {
+        } else if (isLocalNodeMasterEligible == false) { // none is elected master and we aren't master eligible
             // NOTE: The logic in this block will be implemented in a future PR
             result = new CoordinationDiagnosticsResult(
                 CoordinationDiagnosticsStatus.RED,
                 "No master has been observed recently",
                 CoordinationDiagnosticsDetails.EMPTY
             );
+        } else { // none is elected master and we are master eligible
+            result = diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible(
+                localMasterHistory,
+                masterEligibleNodes,
+                coordinator,
+                clusterFormationResponses,
+                nodeHasMasterLookupTimeframe,
+                explain
+            );
+        }
+        return result;
+    }
+
+    /**
+     * This method handles the case when we have not had an elected master node recently, and we are on a master-eligible node. In this
+     * case we look at the cluster formation information from all master-eligible nodes, trying to understand if we have a discovery
+     * problem, a problem forming a quorum, or something else.
+     * @param localMasterHistory The master history, as seen from this node
+     * @param masterEligibleNodes The known master eligible nodes in the cluster
+     * @param coordinator The Coordinator for this node
+     * @param clusterFormationResponses A map that contains the cluster formation information (or exception encountered while requesting
+     *                                  it) from each master eligible node in the cluster
+     * @param nodeHasMasterLookupTimeframe The value of health.master_history.has_master_lookup_timeframe
+     * @param explain If true, details are returned
+     * @return A CoordinationDiagnosticsResult with a RED status
+     */
+    static CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible(
+        MasterHistory localMasterHistory,
+        Collection<DiscoveryNode> masterEligibleNodes,
+        Coordinator coordinator,
+        ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses,
+        TimeValue nodeHasMasterLookupTimeframe,
+        boolean explain
+
+    ) {
+        final CoordinationDiagnosticsResult result;
+        /*
+         * We want to make sure that the same elements are in this set every time we loop through it. We don't care if values are added
+         * while we're copying it, which is why this is not synchronized. We only care that once we have a copy it is not changed.
+         */
+        final Map<DiscoveryNode, ClusterFormationStateOrException> nodeToClusterFormationResponses = Map.copyOf(clusterFormationResponses);
+        for (Map.Entry<DiscoveryNode, ClusterFormationStateOrException> entry : nodeToClusterFormationResponses.entrySet()) {
+            Exception remoteException = entry.getValue().exception();
+            if (remoteException != null) {
+                return new CoordinationDiagnosticsResult(
+                    CoordinationDiagnosticsStatus.RED,
+                    String.format(
+                        Locale.ROOT,
+                        "No master node observed in the last %s, and an exception occurred while reaching out " + "to %s for diagnosis",
+                        nodeHasMasterLookupTimeframe,
+                        entry.getKey().getName()
+                    ),
+                    getDetails(
+                        explain,
+                        localMasterHistory,
+                        remoteException,
+                        Map.of(coordinator.getLocalNode().getId(), coordinator.getClusterFormationState().getDescription())
+                    )
+                );
+            }
+        }
+        Map<DiscoveryNode, ClusterFormationFailureHelper.ClusterFormationState> nodeClusterFormationStateMap =
+            nodeToClusterFormationResponses.entrySet()
+                .stream()
+                .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().clusterFormationState()));
+        Map<String, String> nodeIdToClusterFormationDescription = nodeClusterFormationStateMap.entrySet()
+            .stream()
+            .collect(Collectors.toMap(entry -> entry.getKey().getId(), entry -> entry.getValue().getDescription()));
+        if (anyNodeInClusterReportsDiscoveryProblems(masterEligibleNodes, nodeClusterFormationStateMap)) {
+            result = new CoordinationDiagnosticsResult(
+                CoordinationDiagnosticsStatus.RED,
+                String.format(
+                    Locale.ROOT,
+                    "No master node observed in the last %s, and some master eligible nodes are unable to discover other master "
+                        + "eligible nodes",
+                    nodeHasMasterLookupTimeframe
+                ),
+                getDetails(explain, localMasterHistory, null, nodeIdToClusterFormationDescription)
+            );
+        } else {
+            if (anyNodeInClusterReportsQuorumProblems(nodeClusterFormationStateMap)) {
+                result = new CoordinationDiagnosticsResult(
+                    CoordinationDiagnosticsStatus.RED,
+                    String.format(
+                        Locale.ROOT,
+                        "No master node observed in the last %s, and the master eligible nodes are unable to form a quorum",
+                        nodeHasMasterLookupTimeframe
+                    ),
+                    getDetails(explain, localMasterHistory, null, nodeIdToClusterFormationDescription)
+                );
+            } else {
+                result = new CoordinationDiagnosticsResult(
+                    CoordinationDiagnosticsStatus.RED,
+                    String.format(
+                        Locale.ROOT,
+                        "No master node observed in the last %s, and the cause has not been determined.",
+                        nodeHasMasterLookupTimeframe
+                    ),
+                    getDetails(explain, localMasterHistory, null, nodeIdToClusterFormationDescription)
+                );
+            }
         }
         return result;
     }
 
+    /**
+     * This method checks whether each master eligible node has discovered each of the other master eligible nodes. For the sake of this
+     * method, a discovery problem is when the foundPeers of any ClusterFormationState on any node we have that information for does not
+     * contain all of the nodes in masterEligibleNodes. When problems are found, a summary of which node cannot discover which other
+     * nodes is logged at debug level.
+     * @param masterEligibleNodes The collection of all master eligible nodes
+     * @param nodeToClusterFormationStateMap A map of each master node to its ClusterFormationState
+     * @return true if there are discovery problems, false otherwise
+     */
+    static boolean anyNodeInClusterReportsDiscoveryProblems(
+        Collection<DiscoveryNode> masterEligibleNodes,
+        Map<DiscoveryNode, ClusterFormationFailureHelper.ClusterFormationState> nodeToClusterFormationStateMap
+    ) {
+        Map<DiscoveryNode, Collection<DiscoveryNode>> nodesNotDiscoveredMap = new HashMap<>();
+        nodeToClusterFormationStateMap.forEach((reportingNode, clusterFormationState) -> {
+            // Copy into a HashSet so that the membership checks below are cheap
+            Set<DiscoveryNode> foundPeersOnNode = new HashSet<>(clusterFormationState.foundPeers());
+            Collection<DiscoveryNode> nodesNotDiscovered = masterEligibleNodes.stream()
+                .filter(node -> foundPeersOnNode.contains(node) == false)
+                .toList();
+            // A non-empty list means that this node is missing at least one master eligible node
+            if (nodesNotDiscovered.isEmpty() == false) {
+                nodesNotDiscoveredMap.put(reportingNode, nodesNotDiscovered);
+            }
+        });
+        if (nodesNotDiscoveredMap.isEmpty()) {
+            return false;
+        }
+        String nodeDiscoveryProblemsMessage = nodesNotDiscoveredMap.entrySet()
+            .stream()
+            .map(
+                entry -> String.format(
+                    Locale.ROOT,
+                    "%s cannot discover [%s]",
+                    entry.getKey().getName(),
+                    entry.getValue().stream().map(DiscoveryNode::getName).collect(Collectors.joining(", "))
+                )
+            )
+            .collect(Collectors.joining("; "));
+        logger.debug("The following nodes report discovery problems: {}", nodeDiscoveryProblemsMessage);
+        return true;
+    }
+
+    /**
+     * This method checks that each master eligible node in the quorum thinks that it can form a quorum. If there are nodes that report a
+     * problem forming a quorum, this method returns true. This method determines whether a node thinks that a quorum can be formed by
+     * checking the value of that node's ClusterFormationState.hasDiscoveredQuorum field.
+     * @param nodeToClusterFormationStateMap A map of each master node to its ClusterFormationState
+     * @return True if any nodes in nodeToClusterFormationStateMap report a problem forming a quorum, false otherwise.
+     */
+    static boolean anyNodeInClusterReportsQuorumProblems(
+        Map<DiscoveryNode, ClusterFormationFailureHelper.ClusterFormationState> nodeToClusterFormationStateMap
+    ) {
+        Map<DiscoveryNode, String> quorumProblems = new HashMap<>();
+        for (Map.Entry<DiscoveryNode, ClusterFormationFailureHelper.ClusterFormationState> entry : nodeToClusterFormationStateMap
+            .entrySet()) {
+            ClusterFormationFailureHelper.ClusterFormationState clusterFormationState = entry.getValue();
+            if (clusterFormationState.hasDiscoveredQuorum() == false) {
+                quorumProblems.put(entry.getKey(), clusterFormationState.getDescription());
+            }
+        }
+        if (quorumProblems.isEmpty()) {
+            return false;
+        } else {
+            String quorumProblemsMessage = quorumProblems.entrySet()
+                .stream()
+                .map(
+                    entry -> String.format(
+                        Locale.ROOT,
+                        "%s reports that a quorum " + "cannot be formed: [%s]",
+                        entry.getKey().getName(),
+                        entry.getValue()
+                    )
+                )
+                .collect(Collectors.joining("; "));
+            logger.debug("Some master eligible nodes report that a quorum cannot be formed: {}", quorumProblemsMessage);
+            return true;
+        }
+    }
+
     /**
      * Creates a CoordinationDiagnosticsResult in the case that there has been no master in the last few seconds, there is no elected
      * master known, and there are no master eligible nodes. The status will be RED, and the details (if explain is true) will contain
@@ -361,7 +524,8 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
         CoordinationDiagnosticsDetails details = getDetails(
             explain,
             localMasterHistory,
-            coordinator.getClusterFormationState().getDescription()
+            null,
+            Map.of(coordinator.getLocalNode().getId(), coordinator.getClusterFormationState().getDescription())
         );
         return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.RED, summary, details);
     }
@@ -390,7 +554,8 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
         CoordinationDiagnosticsDetails details = getDetails(
             explain,
             localMasterHistory,
-            coordinator.getClusterFormationState().getDescription()
+            null,
+            Map.of(coordinator.getLocalNode().getId(), coordinator.getClusterFormationState().getDescription())
         );
         return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.RED, summary, details);
     }
@@ -647,19 +812,31 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
         List<DiscoveryNode> recentMasters,
         @Nullable String remoteExceptionMessage,
         @Nullable String remoteExceptionStackTrace,
-        @Nullable String clusterFormationDescription
+        @Nullable Map<String, String> nodeToClusterFormationDescriptionMap
     ) implements Writeable {
-
-        public CoordinationDiagnosticsDetails(DiscoveryNode currentMaster, List<DiscoveryNode> recentMasters) {
-            this(currentMaster, recentMasters, null, null, null);
-        }
-
-        public CoordinationDiagnosticsDetails(DiscoveryNode currentMaster, Exception remoteException) {
-            this(currentMaster, null, remoteException == null ? null : remoteException.getMessage(), getStackTrace(remoteException), null);
+        public CoordinationDiagnosticsDetails(
+            DiscoveryNode currentMaster,
+            List<DiscoveryNode> recentMasters,
+            Exception remoteException,
+            Map<String, String> nodeToClusterFormationDescriptionMap
+        ) {
+            this(
+                currentMaster,
+                recentMasters,
+                remoteException == null ? null : remoteException.getMessage(),
+                getStackTrace(remoteException),
+                nodeToClusterFormationDescriptionMap
+            );
         }
 
         public CoordinationDiagnosticsDetails(StreamInput in) throws IOException {
-            this(readCurrentMaster(in), readRecentMasters(in), in.readOptionalString(), in.readOptionalString(), in.readOptionalString());
+            this(
+                readCurrentMaster(in),
+                readRecentMasters(in),
+                in.readOptionalString(),
+                in.readOptionalString(),
+                readClusterFormationStates(in)
+            );
         }
 
         private static DiscoveryNode readCurrentMaster(StreamInput in) throws IOException {
@@ -684,6 +861,14 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
             return recentMasters;
         }
 
+        /**
+         * Reads the optional map of node id to cluster formation description in the format produced by writeTo: a boolean presence
+         * flag, followed by the map itself when the flag is true.
+         * @param in The stream to read from
+         * @return The map that was written, or null if the presence flag is false (writeTo only writes false for a null map)
+         * @throws IOException If reading from the stream fails
+         */
+        @Nullable
+        private static Map<String, String> readClusterFormationStates(StreamInput in) throws IOException {
+            if (in.readBoolean() == false) {
+                // Return null rather than an empty map so that a null map survives a serialization round trip unchanged
+                return null;
+            }
+            return in.readMap(StreamInput::readString, StreamInput::readString);
+        }
+
         private static String getStackTrace(Exception e) {
             if (e == null) {
                 return null;
@@ -711,7 +896,12 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
             }
             out.writeOptionalString(remoteExceptionMessage);
             out.writeOptionalString(remoteExceptionStackTrace);
-            out.writeOptionalString(clusterFormationDescription);
+            if (nodeToClusterFormationDescriptionMap == null) {
+                out.writeBoolean(false);
+            } else {
+                out.writeBoolean(true);
+                out.writeMap(nodeToClusterFormationDescriptionMap, StreamOutput::writeString, StreamOutput::writeString);
+            }
         }
 
     }

+ 2 - 5
server/src/main/java/org/elasticsearch/cluster/coordination/StableMasterHealthIndicatorService.java

@@ -154,11 +154,8 @@ public class StableMasterHealthIndicatorService implements HealthIndicatorServic
                     builder.field("stack_trace", coordinationDiagnosticsDetails.remoteExceptionStackTrace());
                 });
             }
-            if (coordinationDiagnosticsDetails.clusterFormationDescription() != null) {
-                builder.object(
-                    CLUSTER_FORMATION,
-                    xContentBuilder -> { builder.field("description", coordinationDiagnosticsDetails.clusterFormationDescription()); }
-                );
+            if (coordinationDiagnosticsDetails.nodeToClusterFormationDescriptionMap() != null) {
+                builder.field(CLUSTER_FORMATION, coordinationDiagnosticsDetails.nodeToClusterFormationDescriptionMap());
             }
             return builder.endObject();
         };

+ 2 - 1
server/src/test/java/org/elasticsearch/action/admin/cluster/coordination/CoordinationDiagnosticsActionTests.java

@@ -16,6 +16,7 @@ import org.elasticsearch.test.EqualsHashCodeTestUtils;
 
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.UUID;
 
 import static org.elasticsearch.cluster.coordination.CoordinationDiagnosticsService.CoordinationDiagnosticsDetails;
@@ -46,7 +47,7 @@ public class CoordinationDiagnosticsActionTests extends ESTestCase {
             List.of(node1, node2),
             randomAlphaOfLengthBetween(0, 30),
             randomAlphaOfLengthBetween(0, 30),
-            randomAlphaOfLengthBetween(0, 30)
+            Map.of(randomAlphaOfLength(20), randomAlphaOfLengthBetween(0, 30))
         );
         CoordinationDiagnosticsResult result = new CoordinationDiagnosticsResult(
             randomFrom(CoordinationDiagnosticsStatus.values()),

+ 323 - 10
server/src/test/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceTests.java

@@ -18,7 +18,10 @@ import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.RoutingTable;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.collect.ImmutableOpenMap;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.core.Tuple;
 import org.elasticsearch.monitor.StatusInfo;
 import org.elasticsearch.test.EqualsHashCodeTestUtils;
 import org.elasticsearch.threadpool.Scheduler;
@@ -29,12 +32,16 @@ import org.junit.Before;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
 
 import static org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.EXTREME_DELAY_VARIABILITY;
 import static org.elasticsearch.cluster.coordination.CoordinationDiagnosticsService.ClusterFormationStateOrException;
@@ -44,6 +51,7 @@ import static org.hamcrest.Matchers.emptyOrNullString;
 import static org.hamcrest.Matchers.endsWith;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.startsWith;
@@ -370,25 +378,114 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
         }
     }
 
-    public void testRedForNoMaster() {
+    public void testRedForNoMasterNoQuorum() {
+        /*
+         * This test brings up a 4-node cluster (3 master-eligible, one not). After the cluster stabilizes, one node is elected master.
+         * Then we disconnect all master nodes from all other nodes and allow the cluster to simulate running for a little longer. At
+         * that point, the non-master node thinks that there are no master nodes. The two master eligible nodes that were not initially
+         * elected master think that no quorum can be formed. Depending on timing, the node that had originally been elected master
+         * either thinks that it is still master, or that no quorum can be formed.
+         *
+         * NOTE(review): the node counts above look approximate. The Cluster constructor is given 4, one more non-master node is added
+         * explicitly, and the assertion on redNonMasterCount below is written in terms of 5 nodes in total -- confirm.
+         *
+         * The diagnoseMasterStability(true) result of every node is tallied into one of three counters (red non-master, red master,
+         * green master), and the counters are checked once all nodes have been examined. The trailing loop keeps running the cluster
+         * for as long as any node's deliverBlackholedRequests() returns true.
+         */
         try (Cluster cluster = new Cluster(4, false, Settings.EMPTY)) {
             // The allNodesMasterEligible=false passed to the Cluster constructor does not guarantee a non-master node in the cluster:
             createAndAddNonMasterNode(cluster);
             cluster.runRandomly();
             cluster.stabilise();
+            int masterNodeCount = 0;
             for (Cluster.ClusterNode node : cluster.clusterNodes) {
                 if (node.getLocalNode().isMasterNode()) {
                     node.disconnect();
+                    masterNodeCount++;
                 }
             }
+            int redNonMasterCount = 0;
+            int redMasterCount = 0;
+            int greenMasterCount = 0;
             cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
             for (Cluster.ClusterNode node : cluster.clusterNodes) {
                 CoordinationDiagnosticsService.CoordinationDiagnosticsResult healthIndicatorResult = node.coordinationDiagnosticsService
                     .diagnoseMasterStability(true);
                 if (node.getLocalNode().isMasterNode() == false) {
                     assertThat(healthIndicatorResult.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+                    assertThat(healthIndicatorResult.summary(), containsString("No master eligible nodes found in the cluster"));
+                    redNonMasterCount++;
+                } else if (healthIndicatorResult.status().equals(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED)) {
+                    assertThat(healthIndicatorResult.summary(), containsString("unable to form a quorum"));
+                    redMasterCount++;
+                } else {
+                    assertThat(healthIndicatorResult.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.GREEN));
+                    greenMasterCount++;
                 }
             }
+            // Non-master nodes see only themselves and think that there are no master nodes:
+            assertThat(redNonMasterCount, equalTo(5 - masterNodeCount));
+            // The original master node sees only itself, but might think it is still master:
+            assertThat(greenMasterCount, lessThanOrEqualTo(1));
+            // The other master nodes only see themselves and cannot form a quorum (and sometimes the original master already sees this):
+            assertThat(redMasterCount, greaterThanOrEqualTo(masterNodeCount - 1));
+
+            while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) {
+                logger.debug("--> stabilising again after delivering blackholed requests");
+                cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            }
+        }
+    }
+
+    public void testRedForDiscoveryProblems() {
+        /*
+         * In this test we bring up a 4-node cluster (3 master-eligible) and allow it to stabilize. Then we disconnect all master nodes.
+         * This actually causes a quorum failure (see testRedForNoMasterNoQuorum()). The reason for this is so that we get into the state
+         * where we have not had a master node recently. Then we swap out the clusterFormationResponses so that the reason appears to be
+         * a discovery problem instead of a quorum problem.
+         */
+        try (Cluster cluster = new Cluster(3, true, Settings.EMPTY)) {
+            // The Cluster constructor is passed allNodesMasterEligible=true here, so we add a non-master node to the cluster explicitly:
+            createAndAddNonMasterNode(cluster);
+            cluster.runRandomly();
+            cluster.stabilise();
+            int masterNodeCount = 0;
+            ConcurrentHashMap<DiscoveryNode, CoordinationDiagnosticsService.ClusterFormationStateOrException> clusterFormationStates =
+                new ConcurrentHashMap<>();
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                if (node.getLocalNode().isMasterNode()) {
+                    clusterFormationStates.put(
+                        node.getLocalNode(),
+                        new CoordinationDiagnosticsService.ClusterFormationStateOrException(getClusterFormationState(false, true))
+                    );
+                    node.disconnect();
+                    masterNodeCount++;
+                }
+            }
+            int redNonMasterCount = 0;
+            int redMasterCount = 0;
+            int greenMasterCount = 0;
+            cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                if (node.getLocalNode().isMasterNode()) {
+                    // This is artificially forcing a discovery problem:
+                    node.coordinationDiagnosticsService.clusterFormationResponses = clusterFormationStates;
+                }
+                CoordinationDiagnosticsService.CoordinationDiagnosticsResult healthIndicatorResult = node.coordinationDiagnosticsService
+                    .diagnoseMasterStability(true);
+                if (node.getLocalNode().isMasterNode() == false) {
+                    assertThat(healthIndicatorResult.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+                    assertThat(healthIndicatorResult.summary(), containsString("No master eligible nodes found in the cluster"));
+                    redNonMasterCount++;
+                } else if (healthIndicatorResult.status().equals(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED)) {
+                    assertThat(healthIndicatorResult.summary(), containsString("unable to discover other master eligible nodes"));
+                    redMasterCount++;
+                } else {
+                    assertThat(healthIndicatorResult.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.GREEN));
+                    greenMasterCount++;
+                }
+            }
+            // Non-master nodes see only themselves and think that there are no master nodes:
+            assertThat(redNonMasterCount, equalTo(1));
+            // The original master node sees only itself, but might think it is still master:
+            assertThat(greenMasterCount, lessThanOrEqualTo(1));
+            // The other master nodes report the injected discovery problem (and sometimes the original master already sees this):
+            assertThat(redMasterCount, greaterThanOrEqualTo(masterNodeCount - 1));
+
             while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) {
                 logger.debug("--> stabilising again after delivering blackholed requests");
                 cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
@@ -501,7 +598,11 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                     assertThat(recentMaster.getName(), notNullValue());
                     assertThat(recentMaster.getId(), not(emptyOrNullString()));
                 }
-                assertThat(result.details().clusterFormationDescription(), startsWith("master not discovered yet"));
+                assertThat(result.details().nodeToClusterFormationDescriptionMap().size(), equalTo(1));
+                assertThat(
+                    result.details().nodeToClusterFormationDescriptionMap().values().iterator().next(),
+                    startsWith("master not" + " discovered")
+                );
             }
             cluster.clusterNodes.addAll(removedClusterNodes);
             while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) {
@@ -545,7 +646,11 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                         assertThat(recentMaster.getName(), notNullValue());
                         assertThat(recentMaster.getId(), not(emptyOrNullString()));
                     }
-                    assertThat(result.details().clusterFormationDescription(), startsWith("master not discovered"));
+                    assertThat(result.details().nodeToClusterFormationDescriptionMap().size(), equalTo(1));
+                    assertThat(
+                        result.details().nodeToClusterFormationDescriptionMap().values().iterator().next(),
+                        startsWith("master not" + " discovered")
+                    );
                     // This restores the PeerFinder so that the test cleanup doesn't fail:
                     node.coordinator.getPeerFinder().activate(lastAcceptedNodes);
                 }
@@ -583,7 +688,11 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                         assertThat(recentMaster.getName(), notNullValue());
                         assertThat(recentMaster.getId(), not(emptyOrNullString()));
                     }
-                    assertThat(result.details().clusterFormationDescription(), startsWith("master not discovered"));
+                    assertThat(result.details().nodeToClusterFormationDescriptionMap().size(), equalTo(1));
+                    assertThat(
+                        result.details().nodeToClusterFormationDescriptionMap().values().iterator().next(),
+                        startsWith("master not" + " discovered")
+                    );
                 }
             }
             while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) {
@@ -593,6 +702,210 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
         }
     }
 
+    public void testDiagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible() {
+        MasterHistory localMasterHistory = mock(MasterHistory.class);
+        Collection<DiscoveryNode> masterEligibleNodes = List.of(node1, node2, node3);
+        boolean explain = true;
+
+        /*
+         * This method calls diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible() four times, each time with a different set of
+         * clusterFormationResponses, and checks the status and summary of each result. The mocked Coordinator reports node1 as the
+         * local node in every call.
+         *
+         * In this test, we have an exception rather than the ClusterFormationState for one of the nodes, so we expect the status to
+         * be RED and the summary to indicate that there was an exception.
+         */
+        ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(
+            node1,
+            new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), randomBoolean()))
+        );
+        clusterFormationResponses.put(
+            node2,
+            new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), randomBoolean()))
+        );
+        clusterFormationResponses.put(node3, new ClusterFormationStateOrException(new RuntimeException()));
+        TimeValue nodeHasMasterLookupTimeframe = new TimeValue(1, TimeUnit.MILLISECONDS);
+        Coordinator coordinator = mock(Coordinator.class);
+        ClusterFormationFailureHelper.ClusterFormationState localClusterFormationState = getClusterFormationState(true, true);
+        when(coordinator.getClusterFormationState()).thenReturn(localClusterFormationState);
+        when(coordinator.getLocalNode()).thenReturn(node1);
+        CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = CoordinationDiagnosticsService
+            .diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible(
+                localMasterHistory,
+                masterEligibleNodes,
+                coordinator,
+                clusterFormationResponses,
+                nodeHasMasterLookupTimeframe,
+                explain
+            );
+        assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+        assertThat(result.summary(), containsString(" an exception occurred while reaching out to "));
+
+        /*
+         * In this test, the ClusterFormationStates of all of the nodes appear fine. Since we only run this method when there has been
+         * no master recently, we expect the status to be RED and the summary to indicate that we don't know what the problem is.
+         */
+        clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(node1, new ClusterFormationStateOrException(getClusterFormationState(true, true)));
+        clusterFormationResponses.put(node2, new ClusterFormationStateOrException(getClusterFormationState(true, true)));
+        clusterFormationResponses.put(node3, new ClusterFormationStateOrException(getClusterFormationState(true, true)));
+        result = CoordinationDiagnosticsService.diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible(
+            localMasterHistory,
+            masterEligibleNodes,
+            coordinator,
+            clusterFormationResponses,
+            nodeHasMasterLookupTimeframe,
+            explain
+        );
+        assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+        assertThat(result.summary(), containsString(" the cause has not been determined"));
+
+        /*
+         * In this test, the ClusterFormationState of one of the nodes reports that it cannot form a quorum, so we expect the
+         * status to be RED and the summary to indicate that we are unable to form a quorum.
+         */
+        clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(node1, new ClusterFormationStateOrException(getClusterFormationState(true, true)));
+        clusterFormationResponses.put(node2, new ClusterFormationStateOrException(getClusterFormationState(true, true)));
+        clusterFormationResponses.put(node3, new ClusterFormationStateOrException(getClusterFormationState(true, false)));
+        result = CoordinationDiagnosticsService.diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible(
+            localMasterHistory,
+            masterEligibleNodes,
+            coordinator,
+            clusterFormationResponses,
+            nodeHasMasterLookupTimeframe,
+            explain
+        );
+        assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+        assertThat(result.summary(), containsString(" the master eligible nodes are unable to form a quorum"));
+
+        /*
+         * In this test, the ClusterFormationState of one of the nodes reports that its foundPeers does not include all known
+         * master eligible nodes, so we expect the status to be RED and the summary to indicate that we are unable to discover all nodes.
+         * Note that node3 also reports that it has not discovered a quorum here, and the expected summary is still the discovery one.
+         */
+        clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(node1, new ClusterFormationStateOrException(getClusterFormationState(true, true)));
+        clusterFormationResponses.put(node2, new ClusterFormationStateOrException(getClusterFormationState(false, true)));
+        clusterFormationResponses.put(node3, new ClusterFormationStateOrException(getClusterFormationState(true, false)));
+        result = CoordinationDiagnosticsService.diagnoseOnHaveNotSeenMasterRecentlyAndWeAreMasterEligible(
+            localMasterHistory,
+            masterEligibleNodes,
+            coordinator,
+            clusterFormationResponses,
+            nodeHasMasterLookupTimeframe,
+            explain
+        );
+        assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+        assertThat(result.summary(), containsString(" some master eligible nodes are unable to discover other master eligible nodes"));
+    }
+
+    public void testAnyNodeInClusterReportsDiscoveryProblems() {
+        Collection<DiscoveryNode> masterEligibleNodes = List.of(node1, node2, node3);
+        /*
+         * This method checks anyNodeInClusterReportsDiscoveryProblems() in two scenarios. Before each call, the
+         * ClusterFormationStateOrException values are unwrapped into a map from node to ClusterFormationState, which is what the
+         * method under test is given.
+         *
+         * In this test, the foundPeers of the ClusterFormationStates of all nodes contain all known master eligible nodes, so we expect
+         * anyNodeInClusterReportsDiscoveryProblems() to be false since there are no discovery problems.
+         */
+        ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(node1, new ClusterFormationStateOrException(getClusterFormationState(true, randomBoolean())));
+        clusterFormationResponses.put(node2, new ClusterFormationStateOrException(getClusterFormationState(true, randomBoolean())));
+        clusterFormationResponses.put(node3, new ClusterFormationStateOrException(getClusterFormationState(true, randomBoolean())));
+        Map<DiscoveryNode, ClusterFormationFailureHelper.ClusterFormationState> nodeClusterFormationStateMap = clusterFormationResponses
+            .entrySet()
+            .stream()
+            .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().clusterFormationState()));
+        assertFalse(
+            CoordinationDiagnosticsService.anyNodeInClusterReportsDiscoveryProblems(masterEligibleNodes, nodeClusterFormationStateMap)
+        );
+
+        /*
+         * In this test, the foundPeers of at least one ClusterFormationState does _not_ include all known master eligible nodes, so we
+         * expect anyNodeInClusterReportsDiscoveryProblems() to be true. Only node2 is forced to have the problem; the other two nodes
+         * get random values.
+         */
+        clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(
+            node1,
+            new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), randomBoolean()))
+        );
+        clusterFormationResponses.put(node2, new ClusterFormationStateOrException(getClusterFormationState(false, randomBoolean())));
+        clusterFormationResponses.put(
+            node3,
+            new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), randomBoolean()))
+        );
+        nodeClusterFormationStateMap = clusterFormationResponses.entrySet()
+            .stream()
+            .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().clusterFormationState()));
+        assertTrue(
+            CoordinationDiagnosticsService.anyNodeInClusterReportsDiscoveryProblems(masterEligibleNodes, nodeClusterFormationStateMap)
+        );
+    }
+
+    public void testAnyNodeInClusterReportsQuorumProblems() {
+        /*
+         * This method checks anyNodeInClusterReportsQuorumProblems() in two scenarios. Before each call, the
+         * ClusterFormationStateOrException values are unwrapped into a map from node to ClusterFormationState, which is what the
+         * method under test is given.
+         *
+         * In this test the hasDiscoveredQuorum field of all ClusterFormationStates is true, so we expect
+         * anyNodeInClusterReportsQuorumProblems() to return false.
+         */
+        ConcurrentMap<DiscoveryNode, ClusterFormationStateOrException> clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(node1, new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), true)));
+        clusterFormationResponses.put(node2, new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), true)));
+        clusterFormationResponses.put(node3, new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), true)));
+        Map<DiscoveryNode, ClusterFormationFailureHelper.ClusterFormationState> nodeClusterFormationStateMap = clusterFormationResponses
+            .entrySet()
+            .stream()
+            .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().clusterFormationState()));
+        assertFalse(CoordinationDiagnosticsService.anyNodeInClusterReportsQuorumProblems(nodeClusterFormationStateMap));
+
+        /*
+         * In this test the hasDiscoveredQuorum field of at least one ClusterFormationState is false, so we expect
+         * anyNodeInClusterReportsQuorumProblems() to return true. Only node2 is forced to have the problem; the other two nodes get
+         * random values.
+         */
+        clusterFormationResponses = new ConcurrentHashMap<>();
+        clusterFormationResponses.put(
+            node1,
+            new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), randomBoolean()))
+        );
+        clusterFormationResponses.put(node2, new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), false)));
+        clusterFormationResponses.put(
+            node3,
+            new ClusterFormationStateOrException(getClusterFormationState(randomBoolean(), randomBoolean()))
+        );
+        nodeClusterFormationStateMap = clusterFormationResponses.entrySet()
+            .stream()
+            .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().clusterFormationState()));
+        assertTrue(CoordinationDiagnosticsService.anyNodeInClusterReportsQuorumProblems(nodeClusterFormationStateMap));
+    }
+
+    /*
+     * Creates a new ClusterFormationState for testing. If hasDiscoveredAllNodes is false, then the foundPeers on the returned
+     * ClusterFormationState will be missing at least one known master eligible node. The hasDiscoveredQuorum field in the returned
+     * ClusterFormationState is set to the hasDiscoveredQuorum argument of this method.
+     *
+     * The known master eligible nodes are always node1, node2, and node3, and the local node is whichever of them findAny() happens
+     * to return. Both voting configurations are built from an empty set, the two Collections.emptyList() arguments are empty, and
+     * the remaining constructor arguments (the initial master nodes setting, the long values, and the StatusInfo) are random.
+     * When hasDiscoveredAllNodes is false, the found peers are a random subset of the three nodes whose size is drawn with
+     * randomInt(size - 1) (presumably 0 to size - 1 inclusive -- confirm against the test framework).
+     */
+    private ClusterFormationFailureHelper.ClusterFormationState getClusterFormationState(
+        boolean hasDiscoveredAllNodes,
+        boolean hasDiscoveredQuorum
+    ) {
+        Map<String, DiscoveryNode> masterEligibleNodesMap = Map.of("node1", node1, "node2", node2, "node3", node3);
+        List<String> initialMasterNodesSetting = Arrays.stream(generateRandomStringArray(7, 30, false, false)).toList();
+        DiscoveryNode localNode = masterEligibleNodesMap.values().stream().findAny().get();
+        ImmutableOpenMap.Builder<String, DiscoveryNode> masterEligibleNodesBuilder = new ImmutableOpenMap.Builder<>();
+        masterEligibleNodesMap.forEach(masterEligibleNodesBuilder::put);
+        ImmutableOpenMap<String, DiscoveryNode> masterEligibleNodes = masterEligibleNodesBuilder.build();
+        List<DiscoveryNode> allMasterEligibleNodes = List.of(node1, node2, node3);
+        return new ClusterFormationFailureHelper.ClusterFormationState(
+            initialMasterNodesSetting,
+            localNode,
+            masterEligibleNodes,
+            randomLong(),
+            randomLong(),
+            new CoordinationMetadata.VotingConfiguration(Collections.emptySet()),
+            new CoordinationMetadata.VotingConfiguration(Collections.emptySet()),
+            Collections.emptyList(),
+            hasDiscoveredAllNodes
+                ? allMasterEligibleNodes
+                : randomSubsetOf(randomInt(allMasterEligibleNodes.size() - 1), allMasterEligibleNodes),
+            randomLong(),
+            hasDiscoveredQuorum,
+            new StatusInfo(randomFrom(StatusInfo.Status.HEALTHY, StatusInfo.Status.UNHEALTHY), randomAlphaOfLength(20)),
+            Collections.emptyList()
+        );
+    }
+
     public void testBeginPollingClusterFormationInfo() {
         /*
          * This test sets up a 4-node cluster (3 master eligible). We call beginPollingClusterFormationInfo() on each node. This is allowed
@@ -747,7 +1060,7 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                     originalDetails.recentMasters(),
                     originalDetails.remoteExceptionMessage(),
                     originalDetails.remoteExceptionStackTrace(),
-                    originalDetails.clusterFormationDescription()
+                    originalDetails.nodeToClusterFormationDescriptionMap()
                 );
             }
             case 2 -> {
@@ -756,7 +1069,7 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                     List.of(node1, node2, node3),
                     originalDetails.remoteExceptionMessage(),
                     originalDetails.remoteExceptionStackTrace(),
-                    originalDetails.clusterFormationDescription()
+                    originalDetails.nodeToClusterFormationDescriptionMap()
                 );
             }
             case 3 -> {
@@ -765,7 +1078,7 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                     originalDetails.recentMasters(),
                     randomAlphaOfLength(30),
                     originalDetails.remoteExceptionStackTrace(),
-                    originalDetails.clusterFormationDescription()
+                    originalDetails.nodeToClusterFormationDescriptionMap()
                 );
             }
             case 4 -> {
@@ -774,7 +1087,7 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                     originalDetails.recentMasters(),
                     originalDetails.remoteExceptionMessage(),
                     randomAlphaOfLength(100),
-                    originalDetails.clusterFormationDescription()
+                    originalDetails.nodeToClusterFormationDescriptionMap()
                 );
             }
             case 5 -> {
@@ -783,7 +1096,7 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
                     originalDetails.recentMasters(),
                     originalDetails.remoteExceptionMessage(),
                     originalDetails.remoteExceptionStackTrace(),
-                    randomAlphaOfLength(100)
+                    randomMap(0, 7, () -> new Tuple<>(randomNodeId(), randomAlphaOfLength(100)))
                 );
             }
             default -> throw new IllegalStateException();
@@ -848,7 +1161,7 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
             List.of(node1, node2),
             randomNullableStringOfLengthBetween(0, 30),
             randomNullableStringOfLengthBetween(0, 30),
-            randomAlphaOfLengthBetween(0, 30)
+            Map.of(randomNodeId(), randomAlphaOfLengthBetween(0, 30))
         );
     }
 

+ 15 - 4
server/src/test/java/org/elasticsearch/cluster/coordination/StableMasterHealthIndicatorServiceTests.java

@@ -96,8 +96,15 @@ public class StableMasterHealthIndicatorServiceTests extends AbstractCoordinator
         MasterHistoryService masterHistoryService = createMasterHistoryService();
         StableMasterHealthIndicatorService service = createStableMasterHealthIndicatorService(nullMasterClusterState, masterHistoryService);
         List<DiscoveryNode> recentMasters = List.of(node2, node1);
+        String node1ClusterFormation = randomAlphaOfLength(100);
+        String node2ClusterFormation = randomAlphaOfLength(100);
         CoordinationDiagnosticsService.CoordinationDiagnosticsDetails coordinationDiagnosticsDetails =
-            new CoordinationDiagnosticsService.CoordinationDiagnosticsDetails(node1, recentMasters);
+            new CoordinationDiagnosticsService.CoordinationDiagnosticsDetails(
+                node1,
+                recentMasters,
+                null,
+                Map.of(node1.getId(), node1ClusterFormation, node2.getId(), node2ClusterFormation)
+            );
         CoordinationDiagnosticsService.CoordinationDiagnosticsStatus inputStatus = randomFrom(
             CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED,
             CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.YELLOW
@@ -115,7 +122,7 @@ public class StableMasterHealthIndicatorServiceTests extends AbstractCoordinator
         assertThat(result.name(), equalTo(StableMasterHealthIndicatorService.NAME));
         HealthIndicatorDetails details = result.details();
         Map<String, Object> detailsMap = xContentToMap(details);
-        assertThat(detailsMap.size(), equalTo(2));
+        assertThat(detailsMap.size(), equalTo(3));
         Map<String, String> currentMasterInResult = (Map<String, String>) detailsMap.get("current_master");
         assertThat(currentMasterInResult.get("name"), equalTo(node1.getName()));
         assertThat(currentMasterInResult.get("node_id"), equalTo(node1.getId()));
@@ -127,6 +134,10 @@ public class StableMasterHealthIndicatorServiceTests extends AbstractCoordinator
             assertThat(recentMasterMap.get("name"), not(emptyOrNullString()));
             assertThat(recentMasterMap.get("node_id"), not(emptyOrNullString()));
         }
+        Map<String, String> clusterFormationMap = (Map<String, String>) detailsMap.get("cluster_formation");
+        assertThat(clusterFormationMap.size(), equalTo(2));
+        assertThat(clusterFormationMap.get(node1.getId()), equalTo(node1ClusterFormation));
+        assertThat(clusterFormationMap.get(node2.getId()), equalTo(node2ClusterFormation));
         List<Diagnosis> diagnosis = result.diagnosisList();
         assertThat(diagnosis.size(), equalTo(1));
         assertThat(diagnosis.get(0), is(StableMasterHealthIndicatorService.CONTACT_SUPPORT_USER_ACTION));
@@ -138,7 +149,7 @@ public class StableMasterHealthIndicatorServiceTests extends AbstractCoordinator
         StableMasterHealthIndicatorService service = createStableMasterHealthIndicatorService(nullMasterClusterState, masterHistoryService);
         List<DiscoveryNode> recentMasters = List.of(node2, node1);
         CoordinationDiagnosticsService.CoordinationDiagnosticsDetails coordinationDiagnosticsDetails =
-            new CoordinationDiagnosticsService.CoordinationDiagnosticsDetails(node1, recentMasters);
+            new CoordinationDiagnosticsService.CoordinationDiagnosticsDetails(node1, recentMasters, null, null);
         CoordinationDiagnosticsService.CoordinationDiagnosticsStatus inputStatus = randomFrom(
             CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED,
             CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.YELLOW
@@ -165,7 +176,7 @@ public class StableMasterHealthIndicatorServiceTests extends AbstractCoordinator
         StableMasterHealthIndicatorService service = createStableMasterHealthIndicatorService(nullMasterClusterState, masterHistoryService);
         List<DiscoveryNode> recentMasters = List.of(node2, node1);
         CoordinationDiagnosticsService.CoordinationDiagnosticsDetails coordinationDiagnosticsDetails =
-            new CoordinationDiagnosticsService.CoordinationDiagnosticsDetails(node1, recentMasters);
+            new CoordinationDiagnosticsService.CoordinationDiagnosticsDetails(node1, recentMasters, null, null);
         CoordinationDiagnosticsService.CoordinationDiagnosticsStatus inputStatus = randomFrom(
             CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.GREEN,
             CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.UNKNOWN