Преглед изворни кода

Adding additional capability to the master_is_stable health indicator service (#87482)

This builds on #86524 by supporting two additional conditions, both of which happen when there has been
no elected master for more than 30 seconds (from the queried node's point of view), and both of which return
a RED status:
(1) There are no master-eligible nodes found in the cluster
(2) The node being queried sees a master-eligible node that has been elected the master, but cannot join it
Keith Massey пре 3 година
родитељ
комит
b34e1bf01f

+ 5 - 0
docs/changelog/87482.yaml

@@ -0,0 +1,5 @@
+pr: 87482
+summary: Adding additional capability to the `master_is_stable` health indicator service
+area: Health
+type: enhancement
+issues: []

+ 109 - 12
server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java

@@ -8,15 +8,21 @@
 
 package org.elasticsearch.discovery;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
 import org.elasticsearch.client.internal.Client;
+import org.elasticsearch.cluster.ClusterChangedEvent;
+import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ClusterStateUpdateTask;
 import org.elasticsearch.cluster.coordination.CoordinationDiagnosticsService;
 import org.elasticsearch.cluster.coordination.Coordinator;
 import org.elasticsearch.cluster.coordination.FollowersChecker;
 import org.elasticsearch.cluster.coordination.LeaderChecker;
+import org.elasticsearch.cluster.coordination.MasterHistoryService;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodeRole;
+import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.Strings;
@@ -35,6 +41,7 @@ import org.elasticsearch.test.disruption.NetworkDisruption.NetworkLinkDisruption
 import org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions;
 import org.elasticsearch.test.disruption.SingleNodeDisruption;
 import org.elasticsearch.test.transport.MockTransportService;
+import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.ToXContentObject;
 import org.elasticsearch.xcontent.XContentBuilder;
@@ -52,7 +59,6 @@ import java.util.Map;
 import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.ExecutionException;
 import java.util.concurrent.TimeUnit;
 
 import static java.util.Collections.singleton;
@@ -116,20 +122,21 @@ public class StableMasterDisruptionIT extends ESIntegTestCase {
         assertGreenMasterStability(internalCluster().client());
     }
 
-    private void assertGreenMasterStability(Client client) throws ExecutionException, InterruptedException, IOException {
+    private void assertGreenMasterStability(Client client) throws Exception {
         assertMasterStability(client, HealthStatus.GREEN, "The cluster has a stable master node");
     }
 
-    private void assertMasterStability(Client client, HealthStatus expectedStatus, String expectedSummarySubstring)
-        throws ExecutionException, InterruptedException, IOException {
-        GetHealthAction.Response healthResponse = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request(true)).get();
-        String debugInformation = xContentToString(healthResponse);
-        assertThat(debugInformation, healthResponse.getStatus(), equalTo(expectedStatus));
-        assertThat(
-            debugInformation,
-            healthResponse.findComponent("cluster_coordination").findIndicator("master_is_stable").summary(),
-            containsString(expectedSummarySubstring)
-        );
+    private void assertMasterStability(Client client, HealthStatus expectedStatus, String expectedSummarySubstring) throws Exception {
+        assertBusy(() -> {
+            GetHealthAction.Response healthResponse = client.execute(GetHealthAction.INSTANCE, new GetHealthAction.Request(true)).get();
+            String debugInformation = xContentToString(healthResponse);
+            assertThat(debugInformation, healthResponse.getStatus(), equalTo(expectedStatus));
+            assertThat(
+                debugInformation,
+                healthResponse.findComponent("cluster_coordination").findIndicator("master_is_stable").summary(),
+                containsString(expectedSummarySubstring)
+            );
+        });
     }
 
     private String xContentToString(ToXContentObject xContent) throws IOException {
@@ -463,4 +470,94 @@ public class StableMasterDisruptionIT extends ESIntegTestCase {
         }
         assertGreenMasterStability(internalCluster().client(randomFrom(dataNodes)));
     }
+
+    public void testNoMasterEligibleNodes() throws Exception {
+        /*
+         * In this test we have a single master-eligible node. We then stop the master. We set the master lookup threshold very low on the
+         * data nodes, so when we run the master stability check on one of the data nodes, it will see that there has been no master
+         * recently and there are no master eligible nodes, so it returns a RED status.
+         */
+        internalCluster().startMasterOnlyNodes(
+            1,
+            Settings.builder()
+                .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
+                .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
+                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
+                .build()
+        );
+        final List<String> dataNodes = internalCluster().startDataOnlyNodes(
+            2,
+            Settings.builder()
+                .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
+                .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
+                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
+                .put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO)
+                .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS))
+                .build()
+        );
+        ensureStableCluster(3);
+        internalCluster().stopCurrentMasterNode();
+        assertMasterStability(
+            internalCluster().client(randomFrom(dataNodes)),
+            HealthStatus.RED,
+            "No master eligible nodes found in the cluster"
+        );
+        for (String dataNode : dataNodes) {
+            internalCluster().stopNode(dataNode);
+        }
+    }
+
+    public void testCannotJoinLeader() throws Exception {
+        /*
+         * In this test we have a single master-eligible node. We create a cluster change event saying that the master went to null and
+         * send it only to the master history on each data node. As a result, the PeerFinder still thinks it is the master. Since the
+         * PeerFinder thinks there is a master but we have record of it being null in the history, the data node thinks that it has
+         * problems joining the elected master and returns a RED status.
+         */
+        internalCluster().startMasterOnlyNodes(
+            1,
+            Settings.builder()
+                .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
+                .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
+                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
+                .build()
+        );
+        final List<String> dataNodes = internalCluster().startDataOnlyNodes(
+            2,
+            Settings.builder()
+                .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
+                .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
+                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
+                .put(ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.getKey(), TimeValue.ZERO)
+                .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(1, TimeUnit.SECONDS))
+                .build()
+        );
+        ensureStableCluster(3);
+        Iterable<MasterHistoryService> masterHistoryServices = internalCluster().getDataNodeInstances(MasterHistoryService.class);
+        for (MasterHistoryService masterHistoryService : masterHistoryServices) {
+            ClusterState state = new ClusterState.Builder(new ClusterName(internalCluster().getClusterName())).nodes(
+                new DiscoveryNodes.Builder().masterNodeId(null)
+            ).build();
+            ClusterState previousState = new ClusterState.Builder(new ClusterName(internalCluster().getClusterName())).nodes(
+                new DiscoveryNodes.Builder().masterNodeId("test")
+                    .add(
+                        new DiscoveryNode(
+                            "test",
+                            "test",
+                            buildNewFakeTransportAddress(),
+                            Collections.emptyMap(),
+                            DiscoveryNodeRole.roles(),
+                            Version.CURRENT
+                        )
+                    )
+            ).build();
+            ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent("test", state, previousState);
+            masterHistoryService.getLocalMasterHistory().clusterChanged(clusterChangedEvent);
+        }
+        assertMasterStability(
+            internalCluster().client(randomFrom(dataNodes)),
+            HealthStatus.RED,
+            "has been elected master, but the node being queried"
+        );
+    }
 }

+ 105 - 17
server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java

@@ -20,9 +20,12 @@ import org.elasticsearch.core.TimeValue;
 
 import java.io.PrintWriter;
 import java.io.StringWriter;
+import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Objects;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 
@@ -40,6 +43,7 @@ import java.util.stream.Collectors;
  */
 public class CoordinationDiagnosticsService implements ClusterStateListener {
     private final ClusterService clusterService;
+    private final Coordinator coordinator;
     private final MasterHistoryService masterHistoryService;
     /**
      * This is the amount of time we use to make the initial decision -- have we seen a master node in the very recent past?
@@ -88,8 +92,13 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
         Setting.Property.NodeScope
     );
 
-    public CoordinationDiagnosticsService(ClusterService clusterService, MasterHistoryService masterHistoryService) {
+    public CoordinationDiagnosticsService(
+        ClusterService clusterService,
+        Coordinator coordinator,
+        MasterHistoryService masterHistoryService
+    ) {
         this.clusterService = clusterService;
+        this.coordinator = coordinator;
         this.masterHistoryService = masterHistoryService;
         this.nodeHasMasterLookupTimeframe = NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.get(clusterService.getSettings());
         this.unacceptableNullTransitions = NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.get(clusterService.getSettings());
@@ -156,7 +165,7 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
             masterChanges,
             localMasterHistory.getMaxHistoryAge()
         );
-        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory);
+        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null);
         return new CoordinationDiagnosticsResult(coordinationDiagnosticsStatus, summary, details);
     }
 
@@ -172,13 +181,17 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
      * @return An empty CoordinationDiagnosticsDetails if explain is false, otherwise a CoordinationDiagnosticsDetails containing only
      * "current_master" and "recent_masters"
      */
-    private CoordinationDiagnosticsDetails getDetails(boolean explain, MasterHistory localMasterHistory) {
+    private CoordinationDiagnosticsDetails getDetails(
+        boolean explain,
+        MasterHistory localMasterHistory,
+        @Nullable String clusterFormationMessage
+    ) {
         if (explain == false) {
             return CoordinationDiagnosticsDetails.EMPTY;
         }
         DiscoveryNode masterNode = localMasterHistory.getMostRecentMaster();
         List<DiscoveryNode> recentNonNullMasters = localMasterHistory.getNodes().stream().filter(Objects::nonNull).toList();
-        return new CoordinationDiagnosticsDetails(masterNode, recentNonNullMasters);
+        return new CoordinationDiagnosticsDetails(masterNode, recentNonNullMasters, null, null, clusterFormationMessage);
     }
 
     /**
@@ -267,7 +280,7 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
     private CoordinationDiagnosticsResult getMasterIsStableResult(boolean explain, MasterHistory localMasterHistory) {
         String summary = "The cluster has a stable master node";
         logger.trace("The cluster has a stable master node");
-        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory);
+        CoordinationDiagnosticsDetails details = getDetails(explain, localMasterHistory, null);
         return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.GREEN, summary, details);
     }
 
@@ -278,21 +291,95 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
      * @return The CoordinationDiagnosticsResult for the given localMasterHistory
      */
     private CoordinationDiagnosticsResult diagnoseOnHaveNotSeenMasterRecently(MasterHistory localMasterHistory, boolean explain) {
-        // NOTE: The logic in this method will be implemented in a future PR
-        String summary = "No master has been observed recently";
-        CoordinationDiagnosticsDetails details = CoordinationDiagnosticsDetails.EMPTY;
+        Collection<DiscoveryNode> masterEligibleNodes = getMasterEligibleNodes();
+        final CoordinationDiagnosticsResult result;
+        boolean leaderHasBeenElected = coordinator.getPeerFinder().getLeader().isPresent();
+        if (masterEligibleNodes.isEmpty() && leaderHasBeenElected == false) {
+            result = getResultOnNoMasterEligibleNodes(localMasterHistory, explain);
+        } else if (leaderHasBeenElected) {
+            DiscoveryNode currentMaster = coordinator.getPeerFinder().getLeader().get();
+            result = getResultOnCannotJoinLeader(localMasterHistory, currentMaster, explain);
+        } else {
+            // NOTE: The logic in this block will be implemented in a future PR
+            result = new CoordinationDiagnosticsResult(
+                CoordinationDiagnosticsStatus.RED,
+                "No master has been observed recently",
+                CoordinationDiagnosticsDetails.EMPTY
+            );
+        }
+        return result;
+    }
+
+    /**
+     * Creates a CoordinationDiagnosticsResult in the case that there has been no master in the last few seconds, there is no elected
+     * master known, and there are no master eligible nodes. The status will be RED, and the details (if explain is true) will contain
+     * the list of any masters seen previously and a description of known problems from this node's Coordinator.
+     * @param localMasterHistory Used to pull recent master nodes for the details if explain is true
+     * @param explain If true, details are returned
+     * @return A CoordinationDiagnosticsResult with a RED status
+     */
+    private CoordinationDiagnosticsResult getResultOnNoMasterEligibleNodes(MasterHistory localMasterHistory, boolean explain) {
+        String summary = "No master eligible nodes found in the cluster";
+        CoordinationDiagnosticsDetails details = getDetails(
+            explain,
+            localMasterHistory,
+            coordinator.getClusterFormationState().getDescription()
+        );
+        return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.RED, summary, details);
+    }
+
+    /**
+     * Creates a CoordinationDiagnosticsResult in the case that there has been no master in the last few seconds in this node's cluster
+     * state, but PeerFinder reports that there is an elected master. The assumption is that this node is having a problem joining the
+     * elected master. The status will be RED, and the details (if explain is true) will contain the list of any masters seen previously
+     * and a description of known problems from this node's Coordinator.
+     * @param localMasterHistory Used to pull recent master nodes for the details if explain is true
+     * @param currentMaster The node that PeerFinder reports as the elected master
+     * @param explain If true, details are returned
+     * @return A CoordinationDiagnosticsResult with a RED status
+     */
+    private CoordinationDiagnosticsResult getResultOnCannotJoinLeader(
+        MasterHistory localMasterHistory,
+        DiscoveryNode currentMaster,
+        boolean explain
+    ) {
+        String summary = String.format(
+            Locale.ROOT,
+            "%s has been elected master, but the node being queried, %s, is unable to join it",
+            currentMaster,
+            clusterService.localNode()
+        );
+        CoordinationDiagnosticsDetails details = getDetails(
+            explain,
+            localMasterHistory,
+            coordinator.getClusterFormationState().getDescription()
+        );
         return new CoordinationDiagnosticsResult(CoordinationDiagnosticsStatus.RED, summary, details);
     }
 
+    /**
+     * Returns the master eligible nodes as found in this node's Coordinator, plus the local node if it is master eligible.
+     * @return All known master eligible nodes in this cluster
+     */
+    private Collection<DiscoveryNode> getMasterEligibleNodes() {
+        Set<DiscoveryNode> masterEligibleNodes = new HashSet<>();
+        coordinator.getFoundPeers().forEach(node -> {
+            if (node.isMasterNode()) {
+                masterEligibleNodes.add(node);
+            }
+        });
+        // Coordinator does not report the local node, so add it:
+        if (clusterService.localNode().isMasterNode()) {
+            masterEligibleNodes.add(clusterService.localNode());
+        }
+        return masterEligibleNodes;
+    }
+
     /**
      * This returns true if this node has seen a master node within the last few seconds
      * @return true if this node has seen a master node within the last few seconds, false otherwise
      */
     private boolean hasSeenMasterInHasMasterLookupTimeframe() {
-        // If there is currently a master, there's no point in looking at the history:
-        if (clusterService.state().nodes().getMasterNode() != null) {
-            return true;
-        }
         return masterHistoryService.getLocalMasterHistory().hasSeenMasterInLastNSeconds((int) nodeHasMasterLookupTimeframe.seconds());
     }
 
@@ -337,16 +424,17 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
     public record CoordinationDiagnosticsDetails(
         DiscoveryNode currentMaster,
         List<DiscoveryNode> recentMasters,
-        String remoteExceptionMessage,
-        String remoteExceptionStackTrace
+        @Nullable String remoteExceptionMessage,
+        @Nullable String remoteExceptionStackTrace,
+        @Nullable String clusterFormationDescription
     ) {
 
         public CoordinationDiagnosticsDetails(DiscoveryNode currentMaster, List<DiscoveryNode> recentMasters) {
-            this(currentMaster, recentMasters, null, null);
+            this(currentMaster, recentMasters, null, null, null);
         }
 
         public CoordinationDiagnosticsDetails(DiscoveryNode currentMaster, Exception remoteException) {
-            this(currentMaster, null, remoteException == null ? null : remoteException.getMessage(), getStackTrace(remoteException));
+            this(currentMaster, null, remoteException == null ? null : remoteException.getMessage(), getStackTrace(remoteException), null);
         }
 
         private static String getStackTrace(Exception e) {
@@ -358,6 +446,6 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
             return stringWriter.toString();
         }
 
-        public static final CoordinationDiagnosticsDetails EMPTY = new CoordinationDiagnosticsDetails(null, null, null, null);
+        public static final CoordinationDiagnosticsDetails EMPTY = new CoordinationDiagnosticsDetails(null, null, null, null, null);
     }
 }

+ 8 - 0
server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java

@@ -284,6 +284,10 @@ public class Coordinator extends AbstractLifecycleComponent implements ClusterSt
         this.nodeHealthService = nodeHealthService;
     }
 
+    /**
+     * This method returns an object containing information about why cluster formation failed, which can be useful in troubleshooting.
+     * @return Information about why cluster formation failed
+     */
     public ClusterFormationState getClusterFormationState() {
         return new ClusterFormationState(
             settings,
@@ -1558,6 +1562,10 @@ public class Coordinator extends AbstractLifecycleComponent implements ClusterSt
         return peerFinder.getFoundPeers();
     }
 
+    public PeerFinder getPeerFinder() {
+        return this.peerFinder;
+    }
+
     /**
      * If there is any current committed publication, this method cancels it.
      * This method is used exclusively by tests.

+ 7 - 0
server/src/main/java/org/elasticsearch/cluster/coordination/StableMasterHealthIndicatorService.java

@@ -45,6 +45,7 @@ public class StableMasterHealthIndicatorService implements HealthIndicatorServic
     private static final String DETAILS_CURRENT_MASTER = "current_master";
     private static final String DETAILS_RECENT_MASTERS = "recent_masters";
     private static final String DETAILS_EXCEPTION_FETCHING_HISTORY = "exception_fetching_history";
+    private static final String CLUSTER_FORMATION = "cluster_formation";
 
     // Impacts of having an unstable master:
     private static final String UNSTABLE_MASTER_INGEST_IMPACT = "The cluster cannot create, delete, or rebalance indices, and cannot "
@@ -156,6 +157,12 @@ public class StableMasterHealthIndicatorService implements HealthIndicatorServic
                     builder.field("stack_trace", coordinationDiagnosticsDetails.remoteExceptionStackTrace());
                 });
             }
+            if (coordinationDiagnosticsDetails.clusterFormationDescription() != null) {
+                builder.object(
+                    CLUSTER_FORMATION,
+                    xContentBuilder -> { builder.field("description", coordinationDiagnosticsDetails.clusterFormationDescription()); }
+                );
+            }
             return builder.endObject();
         };
     }

+ 1 - 0
server/src/main/java/org/elasticsearch/node/Node.java

@@ -903,6 +903,7 @@ public class Node implements Closeable {
             MasterHistoryService masterHistoryService = new MasterHistoryService(transportService, threadPool, clusterService);
             CoordinationDiagnosticsService coordinationDiagnosticsService = new CoordinationDiagnosticsService(
                 clusterService,
+                discoveryModule.getCoordinator(),
                 masterHistoryService
             );
             HealthService healthService = createHealthService(clusterService, clusterModule, coordinationDiagnosticsService);

+ 141 - 3
server/src/test/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsServiceTests.java

@@ -19,19 +19,24 @@ import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.RoutingTable;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.monitor.StatusInfo;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.junit.Before;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.UUID;
 
+import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.emptyOrNullString;
 import static org.hamcrest.Matchers.endsWith;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.not;
+import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.startsWith;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
@@ -141,7 +146,6 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
             assertThat(recentMaster.getName(), not(emptyOrNullString()));
             assertThat(recentMaster.getId(), not(emptyOrNullString()));
         }
-
     }
 
     public void testMasterGoesNull() throws Exception {
@@ -358,7 +362,9 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
     }
 
     public void testRedForNoMaster() {
-        try (Cluster cluster = new Cluster(5, false, Settings.EMPTY)) {
+        try (Cluster cluster = new Cluster(4, false, Settings.EMPTY)) {
+            // The allNodesMasterEligible=false passed to the Cluster constructor does not guarantee a non-master node in the cluster:
+            createAndAddNonMasterNode(cluster);
             cluster.runRandomly();
             cluster.stabilise();
             for (Cluster.ClusterNode node : cluster.clusterNodes) {
@@ -453,6 +459,131 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
         }
     }
 
+    @SuppressWarnings("unchecked")
+    public void testRedForNoMasterAndNoMasterEligibleNodes() throws IOException {
+        try (Cluster cluster = new Cluster(4, false, Settings.EMPTY)) {
+            // The allNodesMasterEligible=false passed to the Cluster constructor does not guarantee a non-master node in the cluster:
+            createAndAddNonMasterNode(cluster);
+            cluster.runRandomly();
+            cluster.stabilise();
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                if (node.getLocalNode().isMasterNode()) {
+                    node.disconnect();
+                }
+            }
+            List<Cluster.ClusterNode> removedClusterNodes = new ArrayList<>();
+            for (Cluster.ClusterNode clusterNode : cluster.clusterNodes) {
+                if (clusterNode.getLocalNode().isMasterNode()) {
+                    removedClusterNodes.add(clusterNode);
+                }
+            }
+            cluster.clusterNodes.removeAll(removedClusterNodes);
+            cluster.clusterNodes.removeIf(node -> node.getLocalNode().isMasterNode());
+            cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = node.coordinationDiagnosticsService
+                    .diagnoseMasterStability(true);
+                assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+                assertThat(result.summary(), equalTo("No master eligible nodes found in the cluster"));
+                List<DiscoveryNode> recentMasters = result.details().recentMasters();
+                // We don't show nulls in the recent_masters list:
+                assertThat(recentMasters.size(), greaterThanOrEqualTo(1));
+                for (DiscoveryNode recentMaster : recentMasters) {
+                    assertThat(recentMaster.getName(), notNullValue());
+                    assertThat(recentMaster.getId(), not(emptyOrNullString()));
+                }
+                assertThat(result.details().clusterFormationDescription(), startsWith("master not discovered yet"));
+            }
+            cluster.clusterNodes.addAll(removedClusterNodes);
+            while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) {
+                logger.debug("--> stabilising again after delivering blackholed requests");
+                cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            }
+        }
+    }
+
+    @SuppressWarnings("unchecked")
+    public void testRedForNoMasterAndWithMasterEligibleNodesAndLeader() throws IOException {
+        try (Cluster cluster = new Cluster(4, false, Settings.EMPTY)) {
+            // The allNodesMasterEligible=false passed to the Cluster constructor does not guarantee a non-master node in the cluster:
+            createAndAddNonMasterNode(cluster);
+            cluster.runRandomly();
+            cluster.stabilise();
+            Cluster.ClusterNode currentLeader = cluster.getAnyLeader();
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                if (node.getLocalNode().isMasterNode()) {
+                    node.disconnect();
+                }
+            }
+            cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                if (currentLeader.equals(node) == false) { // The current leader still thinks it is leader
+                    DiscoveryNodes lastAcceptedNodes = node.coordinator.getLastAcceptedState().nodes();
+                    /*
+                     * The following has the effect of making the PeerFinder say that there is a leader, even though there is not. It is
+                     * effectively saying that there is some leader (this node) which this node has not been able to join. This is just the
+                     * easiest way to set up the condition for the test.
+                     */
+                    node.coordinator.getPeerFinder().deactivate(node.getLocalNode());
+                    CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = node.coordinationDiagnosticsService
+                        .diagnoseMasterStability(true);
+                    assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+                    assertThat(result.summary(), containsString("has been elected master, but the node being queried"));
+                    List<DiscoveryNode> recentMasters = result.details().recentMasters();
+                    // We don't show nulls in the recent_masters list:
+                    assertThat(recentMasters.size(), greaterThanOrEqualTo(1));
+                    for (DiscoveryNode recentMaster : recentMasters) {
+                        assertThat(recentMaster.getName(), notNullValue());
+                        assertThat(recentMaster.getId(), not(emptyOrNullString()));
+                    }
+                    assertThat(result.details().clusterFormationDescription(), startsWith("master not discovered"));
+                    // This restores the PeerFinder so that the test cleanup doesn't fail:
+                    node.coordinator.getPeerFinder().activate(lastAcceptedNodes);
+                }
+            }
+
+            while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) {
+                logger.debug("--> stabilising again after delivering blackholed requests");
+                cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            }
+        }
+    }
+
+    @SuppressWarnings("unchecked")
+    public void testRedForNoMasterAndWithMasterEligibleNodesAndNoLeader() throws IOException {
+        try (Cluster cluster = new Cluster(4, false, Settings.EMPTY)) {
+            // The allNodesMasterEligible=false passed to the Cluster constructor does not guarantee a non-master node in the cluster:
+            createAndAddNonMasterNode(cluster);
+            cluster.runRandomly();
+            cluster.stabilise();
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                if (node.getLocalNode().isMasterNode()) {
+                    node.disconnect();
+                }
+            }
+            cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            for (Cluster.ClusterNode node : cluster.clusterNodes) {
+                CoordinationDiagnosticsService.CoordinationDiagnosticsResult result = node.coordinationDiagnosticsService
+                    .diagnoseMasterStability(true);
+                if (node.getLocalNode().isMasterNode() == false) {
+                    assertThat(result.status(), equalTo(CoordinationDiagnosticsService.CoordinationDiagnosticsStatus.RED));
+                    List<DiscoveryNode> recentMasters = result.details().recentMasters();
+                    // We don't show nulls in the recent_masters list:
+                    assertThat(recentMasters.size(), greaterThanOrEqualTo(1));
+                    for (DiscoveryNode recentMaster : recentMasters) {
+                        assertThat(recentMaster.getName(), notNullValue());
+                        assertThat(recentMaster.getId(), not(emptyOrNullString()));
+                    }
+                    assertThat(result.details().clusterFormationDescription(), startsWith("master not discovered"));
+                }
+            }
+            while (cluster.clusterNodes.stream().anyMatch(Cluster.ClusterNode::deliverBlackholedRequests)) {
+                logger.debug("--> stabilising again after delivering blackholed requests");
+                cluster.runFor(DEFAULT_STABILISATION_TIME, "Cannot call stabilise() because there is no master");
+            }
+        }
+    }
+
     private static ClusterState createClusterState(DiscoveryNode masterNode) {
         var routingTableBuilder = RoutingTable.builder();
         Metadata.Builder metadataBuilder = Metadata.builder();
@@ -501,6 +632,13 @@ public class CoordinationDiagnosticsServiceTests extends AbstractCoordinatorTest
         when(localNode.isMasterNode()).thenReturn(false);
         Coordinator coordinator = mock(Coordinator.class);
         when(coordinator.getFoundPeers()).thenReturn(Collections.emptyList());
-        return new CoordinationDiagnosticsService(clusterService, masterHistoryService);
+        return new CoordinationDiagnosticsService(clusterService, coordinator, masterHistoryService);
+    }
+
+    private void createAndAddNonMasterNode(Cluster cluster) {
+        Cluster.ClusterNode nonMasterNode = cluster.new ClusterNode(
+            nextNodeIndex.getAndIncrement(), false, Settings.EMPTY, () -> new StatusInfo(HEALTHY, "healthy-info")
+        );
+        cluster.clusterNodes.add(nonMasterNode);
     }
 }

+ 3 - 1
server/src/test/java/org/elasticsearch/cluster/coordination/StableMasterHealthIndicatorServiceTests.java

@@ -284,7 +284,9 @@ public class StableMasterHealthIndicatorServiceTests extends AbstractCoordinator
         when(localNode.isMasterNode()).thenReturn(false);
         Coordinator coordinator = mock(Coordinator.class);
         when(coordinator.getFoundPeers()).thenReturn(Collections.emptyList());
-        return new StableMasterHealthIndicatorService(new CoordinationDiagnosticsService(clusterService, masterHistoryService));
+        return new StableMasterHealthIndicatorService(
+            new CoordinationDiagnosticsService(clusterService, coordinator, masterHistoryService)
+        );
     }
 
     private Map<String, Object> xContentToMap(ToXContent xcontent) throws IOException {

+ 2 - 2
test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java

@@ -1223,8 +1223,6 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
                 );
                 clusterService = new ClusterService(settings, clusterSettings, masterService, clusterApplierService);
                 masterHistoryService = new MasterHistoryService(transportService, threadPool, clusterService);
-                coordinationDiagnosticsService = new CoordinationDiagnosticsService(clusterService, masterHistoryService);
-                stableMasterHealthIndicatorService = new StableMasterHealthIndicatorService(coordinationDiagnosticsService);
                 clusterService.setNodeConnectionsService(
                     new NodeConnectionsService(clusterService.getSettings(), threadPool, transportService)
                 );
@@ -1264,6 +1262,8 @@ public class AbstractCoordinatorTestCase extends ESTestCase {
                     getElectionStrategy(),
                     nodeHealthService
                 );
+                coordinationDiagnosticsService = new CoordinationDiagnosticsService(clusterService, coordinator, masterHistoryService);
+                stableMasterHealthIndicatorService = new StableMasterHealthIndicatorService(coordinationDiagnosticsService);
                 masterService.setClusterStatePublisher(coordinator);
                 final GatewayService gatewayService = new GatewayService(
                     settings,