浏览代码

Adding a view of master history (#85941)

This commit adds the notion of a MasterHistory: a record, maintained in memory on each node, of which nodes have been master over the last 30 minutes. It is exposed via the MasterHistoryService. This commit also adds a transport action so that you can fetch the master history from any node in the cluster, represented as a List of DiscoveryNodes. The list is an ordered list of the nodes that have been seen as master over the last 30 minutes, with the oldest first. This action is used by the MasterHistoryService, which exposes the fetched remote master history alongside the local one. The local and remote master history representations will be used to determine if the master has been stable as part of the health API.
Keith Massey 3 年之前
父节点
当前提交
4533c454bb

+ 5 - 0
docs/changelog/85941.yaml

@@ -0,0 +1,5 @@
+pr: 85941
+summary: New service to keep track of the master history as seen from each node
+area: Health
+type: feature
+issues: []

+ 2 - 0
server/src/main/java/org/elasticsearch/action/ActionModule.java

@@ -16,6 +16,7 @@ import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclu
 import org.elasticsearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsAction;
 import org.elasticsearch.action.admin.cluster.configuration.TransportAddVotingConfigExclusionsAction;
 import org.elasticsearch.action.admin.cluster.configuration.TransportClearVotingConfigExclusionsAction;
+import org.elasticsearch.action.admin.cluster.coordination.MasterHistoryAction;
 import org.elasticsearch.action.admin.cluster.desirednodes.DeleteDesiredNodesAction;
 import org.elasticsearch.action.admin.cluster.desirednodes.GetDesiredNodesAction;
 import org.elasticsearch.action.admin.cluster.desirednodes.TransportDeleteDesiredNodesAction;
@@ -628,6 +629,7 @@ public class ActionModule extends AbstractModule {
         actions.register(ResolveIndexAction.INSTANCE, ResolveIndexAction.TransportAction.class);
         actions.register(AnalyzeIndexDiskUsageAction.INSTANCE, TransportAnalyzeIndexDiskUsageAction.class);
         actions.register(FieldUsageStatsAction.INSTANCE, TransportFieldUsageAction.class);
+        actions.register(MasterHistoryAction.INSTANCE, MasterHistoryAction.TransportAction.class);
 
         // Indexed scripts
         actions.register(PutStoredScriptAction.INSTANCE, TransportPutStoredScriptAction.class);

+ 142 - 0
server/src/main/java/org/elasticsearch/action/admin/cluster/coordination/MasterHistoryAction.java

@@ -0,0 +1,142 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.cluster.coordination;
+
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.ActionRequest;
+import org.elasticsearch.action.ActionRequestValidationException;
+import org.elasticsearch.action.ActionResponse;
+import org.elasticsearch.action.ActionType;
+import org.elasticsearch.action.support.ActionFilters;
+import org.elasticsearch.action.support.HandledTransportAction;
+import org.elasticsearch.cluster.coordination.MasterHistoryService;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.tasks.Task;
+import org.elasticsearch.transport.TransportService;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Action type used to ask a node for the master history that it has recorded. The transport handler answers with the history held by
+ * the node that receives the request.
+ */
+public class MasterHistoryAction extends ActionType<MasterHistoryAction.Response> {
+
+    public static final MasterHistoryAction INSTANCE = new MasterHistoryAction();
+    public static final String NAME = "cluster:internal/master_history/get";
+
+    private MasterHistoryAction() {
+        super(NAME, MasterHistoryAction.Response::new);
+    }
+
+    /**
+     * A request without any parameters. Every instance is interchangeable with every other instance.
+     */
+    public static class Request extends ActionRequest {
+
+        public Request() {}
+
+        public Request(StreamInput in) throws IOException {
+            super(in);
+        }
+
+        @Override
+        public ActionRequestValidationException validate() {
+            // The request has no fields, so there is nothing that could be invalid
+            return null;
+        }
+
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            super.writeTo(out);
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            // No state to compare: two objects are equal as soon as they are of exactly this class
+            return this == o || (o != null && getClass() == o.getClass());
+        }
+
+        @Override
+        public int hashCode() {
+            // A constant, in line with equals() treating all instances as equal
+            return 1;
+        }
+
+    }
+
+    /**
+     * The response carrying the master history. It is written as a count followed by that many optional DiscoveryNodes.
+     */
+    public static class Response extends ActionResponse {
+
+        private final List<DiscoveryNode> masterHistory;
+
+        public Response(List<DiscoveryNode> masterHistory) {
+            this.masterHistory = masterHistory;
+        }
+
+        public Response(StreamInput in) throws IOException {
+            final int expectedEntries = in.readVInt();
+            final List<DiscoveryNode> nodesRead = new ArrayList<>(expectedEntries);
+            for (int remaining = expectedEntries; remaining > 0; remaining--) {
+                // Entries are optional on the wire because a history entry can be null
+                nodesRead.add(in.readOptionalWriteable(DiscoveryNode::new));
+            }
+            this.masterHistory = nodesRead;
+        }
+
+        /**
+         * Returns the DiscoveryNodes that the responding node has seen as master over the last 30 minutes, ordered oldest first.
+         * Individual entries of the list can be null.
+         * @return the list of DiscoveryNodes that the responding node has seen as master, oldest first
+         */
+        public List<DiscoveryNode> getMasterHistory() {
+            return masterHistory;
+        }
+
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            final List<DiscoveryNode> nodesToWrite = masterHistory;
+            out.writeVInt(nodesToWrite.size());
+            for (DiscoveryNode node : nodesToWrite) {
+                out.writeOptionalWriteable(node);
+            }
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (o == this) {
+                return true;
+            }
+            if (o == null || o.getClass() != getClass()) {
+                return false;
+            }
+            final MasterHistoryAction.Response other = (MasterHistoryAction.Response) o;
+            return masterHistory.equals(other.masterHistory);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(masterHistory);
+        }
+    }
+
+    /**
+     * The handler for {@link MasterHistoryAction}. It replies with the nodes of the local master history of the node it runs on.
+     */
+    public static class TransportAction extends HandledTransportAction<Request, Response> {
+        private final MasterHistoryService masterHistoryService;
+
+        @Inject
+        public TransportAction(TransportService transportService, ActionFilters actionFilters, MasterHistoryService masterHistoryService) {
+            super(MasterHistoryAction.NAME, transportService, actionFilters, MasterHistoryAction.Request::new);
+            this.masterHistoryService = masterHistoryService;
+        }
+
+        @Override
+        protected void doExecute(Task task, MasterHistoryAction.Request request, ActionListener<Response> listener) {
+            final List<DiscoveryNode> localHistory = masterHistoryService.getLocalMasterHistory().getNodes();
+            listener.onResponse(new MasterHistoryAction.Response(localHistory));
+        }
+    }
+
+}

+ 226 - 0
server/src/main/java/org/elasticsearch/cluster/coordination/MasterHistory.java

@@ -0,0 +1,226 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.coordination;
+
+import org.elasticsearch.cluster.ClusterChangedEvent;
+import org.elasticsearch.cluster.ClusterStateListener;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.settings.Setting;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.threadpool.ThreadPool;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
+import java.util.function.LongSupplier;
+import java.util.stream.Collectors;
+
+/**
+ * This class represents a node's view of the history of which nodes have been elected master over the last 30 minutes (the default
+ * value of {@link #MAX_HISTORY_AGE_SETTING}). It is kept in memory, so when a node comes up it does not have any knowledge of previous
+ * master history before that point. This object is updated if and when the cluster state changes with a new master node.
+ */
+public class MasterHistory implements ClusterStateListener {
+    /**
+     * The maximum amount of time that the master history covers.
+     */
+    private final TimeValue maxHistoryAge;
+    // Note: While the master can be null, the TimeAndMaster object in this list is never null
+    private volatile List<TimeAndMaster> masterHistory;
+    private final LongSupplier currentTimeMillisSupplier;
+    /**
+     * This is the maximum number of master nodes kept in history so that the list doesn't grow extremely large and impact performance if
+     * things become really unstable. We don't get any additional value in keeping more than this.
+     */
+    public static final int MAX_HISTORY_SIZE = 50;
+
+    private static final TimeValue DEFAULT_MAX_HISTORY_AGE = new TimeValue(30, TimeUnit.MINUTES);
+    private static final TimeValue SMALLEST_ALLOWED_MAX_HISTORY_AGE = new TimeValue(1, TimeUnit.MINUTES);
+
+    public static final Setting<TimeValue> MAX_HISTORY_AGE_SETTING = Setting.timeSetting(
+        "master_history.max_age",
+        DEFAULT_MAX_HISTORY_AGE,
+        SMALLEST_ALLOWED_MAX_HISTORY_AGE,
+        Setting.Property.NodeScope
+    );
+
+    /**
+     * Creates an empty history and registers it as a listener on the given ClusterService so that it is notified of master changes.
+     * @param threadPool Used as the source of the (relative) current time
+     * @param clusterService Used to read the max history age setting and to listen for cluster state changes
+     */
+    public MasterHistory(ThreadPool threadPool, ClusterService clusterService) {
+        this.masterHistory = new ArrayList<>();
+        this.currentTimeMillisSupplier = threadPool::relativeTimeInMillis;
+        this.maxHistoryAge = MAX_HISTORY_AGE_SETTING.get(clusterService.getSettings());
+        clusterService.addListener(this);
+    }
+
+    public TimeValue getMaxHistoryAge() {
+        return this.maxHistoryAge;
+    }
+
+    /**
+     * Records a new history entry if the master is now null, if it differs from the master in the previous cluster state, or if no
+     * entry has been recorded yet. Entries older than maxHistoryAge are dropped, and at most MAX_HISTORY_SIZE entries are kept.
+     * @param event The cluster state change to inspect
+     */
+    @Override
+    public void clusterChanged(ClusterChangedEvent event) {
+        DiscoveryNode currentMaster = event.state().nodes().getMasterNode();
+        DiscoveryNode previousMaster = event.previousState().nodes().getMasterNode();
+        // Read the volatile field once so that the whole update is computed from one snapshot of the history
+        List<TimeAndMaster> oldMasterHistory = masterHistory;
+        if (currentMaster == null || currentMaster.equals(previousMaster) == false || oldMasterHistory.isEmpty()) {
+            long now = currentTimeMillisSupplier.getAsLong();
+            long oldestRelevantHistoryTime = now - maxHistoryAge.getMillis();
+            List<TimeAndMaster> newMasterHistory = new ArrayList<>();
+            // Leave room for the entry added below so that the result never exceeds MAX_HISTORY_SIZE
+            int sizeAfterAddingNewMaster = oldMasterHistory.size() + 1;
+            int startIndex = Math.max(0, sizeAfterAddingNewMaster - MAX_HISTORY_SIZE);
+            for (int i = startIndex; i < oldMasterHistory.size(); i++) {
+                TimeAndMaster timeAndMaster = oldMasterHistory.get(i);
+                if (timeAndMaster.startTimeMillis >= oldestRelevantHistoryTime) {
+                    newMasterHistory.add(timeAndMaster);
+                }
+            }
+            // Use the same timestamp that the age cut-off above was computed from
+            newMasterHistory.add(new TimeAndMaster(now, currentMaster));
+            masterHistory = Collections.unmodifiableList(newMasterHistory);
+        }
+    }
+
+    /**
+     * Returns the node that has been most recently seen as the master
+     * @return The node that has been most recently seen as the master, which could be null if no master exists
+     */
+    public @Nullable DiscoveryNode getMostRecentMaster() {
+        List<TimeAndMaster> masterHistoryCopy = getRecentMasterHistory(masterHistory);
+        return masterHistoryCopy.isEmpty() ? null : masterHistoryCopy.get(masterHistoryCopy.size() - 1).master;
+    }
+
+    /**
+     * Returns the most recent non-null master seen, or null if there has been no master seen. Only maxHistoryAge (30 minutes by
+     * default) of history is kept. If the most recent master change is older than that and that change was to set the master to null,
+     * then null will be returned.
+     * @return The most recent non-null master seen, or null if there has been no master seen.
+     */
+    public @Nullable DiscoveryNode getMostRecentNonNullMaster() {
+        List<TimeAndMaster> masterHistoryCopy = getRecentMasterHistory(masterHistory);
+        // Walk from the newest entry to the oldest one without reordering (and so without mutating) the list
+        for (int i = masterHistoryCopy.size() - 1; i >= 0; i--) {
+            DiscoveryNode master = masterHistoryCopy.get(i).master;
+            if (master != null) {
+                return master;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns true if for the life of this MasterHistory (30 minutes by default) non-null masters have transitioned to null n times.
+     * @param n The number of times a non-null master must have switched to null
+     * @return True if non-null masters have transitioned to null n or more times.
+     */
+    public boolean hasMasterGoneNullAtLeastNTimes(int n) {
+        return hasMasterGoneNullAtLeastNTimes(getNodes(), n);
+    }
+
+    /**
+     * Returns true if for the List of master nodes passed in, non-null masters have transitioned to null n times.
+     * So for example:
+     * node1 -> null is 1 transition to null
+     * node1 -> null -> null is 1 transition to null
+     * null -> node1 -> null is 1 transition to null
+     * node1 -> null -> node1 is 1 transition to null
+     * node1 -> null -> node1 -> null is 2 transitions to null
+     * node1 -> null -> node2 -> null is 2 transitions to null
+     * @param masters The List of masters to use
+     * @param n The number of times a non-null master must have switched to null
+     * @return True if non-null masters have transitioned to null n or more times in the given list of masters.
+     */
+    public static boolean hasMasterGoneNullAtLeastNTimes(List<DiscoveryNode> masters, int n) {
+        int timesMasterHasGoneNull = 0;
+        // Starts out true so that a leading null is not counted as a transition to null
+        boolean previousNull = true;
+        for (DiscoveryNode master : masters) {
+            if (master == null) {
+                if (previousNull == false) {
+                    timesMasterHasGoneNull++;
+                }
+                previousNull = true;
+            } else {
+                previousNull = false;
+            }
+        }
+        return timesMasterHasGoneNull >= n;
+    }
+
+    /**
+     * An identity change is when we get notified of a change to a non-null master that is different from the previous non-null master.
+     * Note that a master changes to null on (virtually) every identity change.
+     * So for example:
+     * node1 -> node2 is 1 identity change
+     * node1 -> node2 -> node1 is 2 identity changes
+     * node1 -> node2 -> node2 is 1 identity change (transitions from a node to itself do not count)
+     * node1 -> null -> node1 is 0 identity changes (transitions from a node to itself, even with null in the middle, do not count)
+     * node1 -> null -> node2 is 1 identity change
+     * @param masterHistory The list of nodes that have been master
+     * @return The number of master identity changes as defined above
+     */
+    public static int getNumberOfMasterIdentityChanges(List<DiscoveryNode> masterHistory) {
+        int identityChanges = 0;
+        // Nulls are removed first so that "node1 -> null -> node1" compares node1 with node1
+        List<DiscoveryNode> nonNullHistory = masterHistory.stream().filter(Objects::nonNull).toList();
+        DiscoveryNode previousNode = null;
+        for (DiscoveryNode node : nonNullHistory) {
+            if (previousNode != null && previousNode.equals(node) == false) {
+                identityChanges++;
+            }
+            previousNode = node;
+        }
+        return identityChanges;
+    }
+
+    /**
+     * Returns true if the most recent master is non-null, or if a non-null master was still the master at some point during the last
+     * nSeconds seconds (even if it first became master before that).
+     * @param nSeconds The number of seconds to look back
+     * @return true if the current master is non-null or if a non-null master was seen in the last nSeconds seconds
+     */
+    public boolean hasSeenMasterInLastNSeconds(int nSeconds) {
+        List<TimeAndMaster> masterHistoryCopy = getRecentMasterHistory(masterHistory);
+        long now = currentTimeMillisSupplier.getAsLong();
+        TimeValue nSecondsTimeValue = new TimeValue(nSeconds, TimeUnit.SECONDS);
+        long nSecondsAgo = now - nSecondsTimeValue.getMillis();
+        /*
+         * The list is ordered by start time ascending, and each entry stays in effect until the next entry starts. Walking backwards,
+         * every entry reached here was therefore still in effect inside the window (the newest entry is in effect right now), so a
+         * non-null master means a master was seen. Once an entry started before the window, all older entries ended before the window.
+         */
+        for (int i = masterHistoryCopy.size() - 1; i >= 0; i--) {
+            TimeAndMaster timeAndMaster = masterHistoryCopy.get(i);
+            if (timeAndMaster.master != null) {
+                return true;
+            }
+            if (timeAndMaster.startTimeMillis < nSecondsAgo) {
+                break;
+            }
+        }
+        return false;
+    }
+
+    /*
+     * This method creates a copy of masterHistory that only has entries from no more than maxHistoryAge before now (but leaves the
+     * newest entry in even if it is older than maxHistoryAge). The returned list is always a new list, never the list passed in.
+     */
+    private List<TimeAndMaster> getRecentMasterHistory(List<TimeAndMaster> history) {
+        if (history.size() < 2) {
+            // Nothing to filter: a lone entry is the newest entry and is kept whatever its age
+            return new ArrayList<>(history);
+        }
+        long now = currentTimeMillisSupplier.getAsLong();
+        long oldestRelevantHistoryTime = now - maxHistoryAge.getMillis();
+        List<TimeAndMaster> filteredHistory = history.stream()
+            .filter(timeAndMaster -> timeAndMaster.startTimeMillis > oldestRelevantHistoryTime)
+            .collect(Collectors.toCollection(ArrayList::new));
+        if (filteredHistory.isEmpty()) { // Even the most recent entry is older than maxHistoryAge
+            filteredHistory.add(history.get(history.size() - 1));
+        }
+        return filteredHistory;
+    }
+
+    /**
+     * This method returns an immutable view of this master history, typically for sending over the wire to another node. The returned List
+     * is ordered by when the master was seen, with the earliest-seen masters being first. The List can contain null values. Times are
+     * intentionally not included because they cannot be compared across machines.
+     * @return An immutable view of this master history
+     */
+    public List<DiscoveryNode> getNodes() {
+        List<TimeAndMaster> masterHistoryCopy = getRecentMasterHistory(masterHistory);
+        return masterHistoryCopy.stream().map(TimeAndMaster::master).toList();
+    }
+
+    // Pairs a master (which may be null) with the time at which this node first saw it as the master
+    private record TimeAndMaster(long startTimeMillis, DiscoveryNode master) {}
+}

+ 157 - 0
server/src/main/java/org/elasticsearch/cluster/coordination/MasterHistoryService.java

@@ -0,0 +1,157 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.coordination;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.Version;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.ActionListenerResponseHandler;
+import org.elasticsearch.action.admin.cluster.coordination.MasterHistoryAction;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.transport.ConnectionProfile;
+import org.elasticsearch.transport.Transport;
+import org.elasticsearch.transport.TransportRequestOptions;
+import org.elasticsearch.transport.TransportService;
+
+import java.util.List;
+
+/**
+ * This service provides access to this node's view of the master history, as well as access to other nodes' view of master stability.
+ * A remote node's view is only fetched when {@link #refreshRemoteMasterHistory(DiscoveryNode)} is called, and only the result of the
+ * latest completed fetch is retained.
+ */
+public class MasterHistoryService {
+    private final TransportService transportService;
+    private final MasterHistory localMasterHistory;
+    private final ClusterService clusterService;
+    /*
+     * This is a view of the master history on a remote node, or the exception that fetching it resulted in. This is populated
+     * asynchronously. Until a fetch has completed it holds neither a history nor an exception.
+     */
+    volatile RemoteHistoryOrException remoteHistoryOrException = new RemoteHistoryOrException(null, null); // non-private for testing
+    private static final Logger logger = LogManager.getLogger(MasterHistoryService.class);
+
+    public MasterHistoryService(TransportService transportService, ThreadPool threadPool, ClusterService clusterService) {
+        this.transportService = transportService;
+        this.localMasterHistory = new MasterHistory(threadPool, clusterService);
+        this.clusterService = clusterService;
+    }
+
+    /**
+     * This returns the MasterHistory as seen from this node. The returned MasterHistory will be automatically updated whenever the
+     * ClusterState on this node is updated with new information about the master.
+     * @return The MasterHistory from this node's point of view. This MasterHistory object will be updated whenever the ClusterState changes
+     */
+    public MasterHistory getLocalMasterHistory() {
+        return localMasterHistory;
+    }
+
+    /**
+     * This method returns a static view of the MasterHistory on a remote node. This MasterHistory is static in that it will not be
+     * updated even if the ClusterState is updated on this node or the remote node. The history is retrieved asynchronously, and only if
+     * refreshRemoteMasterHistory has been called for this node. If anything has gone wrong fetching it, the exception that the fetch
+     * resulted in (whether from opening the connection or from the request itself) will be thrown here. If the remote history has not
+     * been fetched, the returned value will be null.
+     * @return The master history from a remote node's point of view, oldest first, or null if none has been fetched. This list will
+     * not be updated with future changes
+     * @throws Exception the exception (if any) that fetching the history resulted in
+     */
+    @Nullable
+    public List<DiscoveryNode> getRemoteMasterHistory() throws Exception {
+        // Grabbing a reference to the object in case it is replaced in another thread during this method:
+        RemoteHistoryOrException remoteHistoryOrExceptionCopy = remoteHistoryOrException;
+        if (remoteHistoryOrExceptionCopy.exception != null) {
+            throw remoteHistoryOrExceptionCopy.exception;
+        }
+        return remoteHistoryOrExceptionCopy.remoteHistory;
+    }
+
+    /**
+     * This method attempts to fetch the master history from the requested node. If we are able to successfully fetch it, it will be
+     * available in a later call to getRemoteMasterHistory. The client is not notified if or when the remote history is successfully
+     * retrieved. This method only fetches the remote master history once, and it is never updated unless this method is called again. If
+     * two calls are made to this method, the response of one will overwrite the response of the other (with no guarantee of the ordering
+     * of responses).
+     * This is a remote call, so clients should avoid calling it any more often than necessary.
+     * If opening the connection or the request fails (the request is given a 10 second timeout), the exception is stored and will be
+     * thrown by getRemoteMasterHistory. If the remote node is on a version before 8.3.0 no request is made, the connection is closed,
+     * and whatever result was stored previously is left untouched.
+     * NOTE(review): the request is addressed to the node rather than sent over the connection that is opened here; the opened
+     * connection is only used for the version check and is then closed. Confirm that this is intended.
+     * @param node The node whose view of the master history we want to fetch
+     */
+    public void refreshRemoteMasterHistory(DiscoveryNode node) {
+        long startTime = System.nanoTime();
+        transportService.openConnection(
+            // Note: This connection must be explicitly closed below
+            node,
+            ConnectionProfile.buildDefaultConnectionProfile(clusterService.getSettings()),
+            new ActionListener<>() {
+                @Override
+                public void onResponse(Transport.Connection connection) {
+                    Version minSupportedVersion = Version.V_8_3_0;
+                    if (connection.getVersion().onOrAfter(minSupportedVersion)) { // This was introduced in 8.3.0
+                        logger.trace("Opened connection to {}, making master history request", node);
+                        // If we don't get a response in 10 seconds that is a failure worth capturing on its own:
+                        final TimeValue remoteMasterHistoryTimeout = TimeValue.timeValueSeconds(10);
+                        transportService.sendRequest(
+                            node,
+                            MasterHistoryAction.NAME,
+                            new MasterHistoryAction.Request(),
+                            TransportRequestOptions.timeout(remoteMasterHistoryTimeout),
+                            new ActionListenerResponseHandler<>(ActionListener.runBefore(new ActionListener<>() {
+
+                                @Override
+                                public void onResponse(MasterHistoryAction.Response response) {
+                                    long endTime = System.nanoTime();
+                                    logger.trace("Received history from {} in {}", node, TimeValue.timeValueNanos(endTime - startTime));
+                                    remoteHistoryOrException = new RemoteHistoryOrException(response.getMasterHistory());
+                                }
+
+                                @Override
+                                public void onFailure(Exception e) {
+                                    logger.warn("Exception in master history request to master node", e);
+                                    remoteHistoryOrException = new RemoteHistoryOrException(e);
+                                }
+                            }, connection::close), MasterHistoryAction.Response::new)
+                        );
+                    } else {
+                        connection.close();
+                        logger.trace(
+                            "Cannot get master history for {} because it is at version {} and {} is required",
+                            node,
+                            connection.getVersion(),
+                            minSupportedVersion
+                        );
+                    }
+                }
+
+                @Override
+                public void onFailure(Exception e) {
+                    logger.warn("Exception connecting to master node", e);
+                    remoteHistoryOrException = new RemoteHistoryOrException(e);
+                }
+            }
+        );
+    }
+
+    record RemoteHistoryOrException(List<DiscoveryNode> remoteHistory, Exception exception) { // non-private for testing
+
+        public RemoteHistoryOrException {
+            if (remoteHistory != null && exception != null) {
+                throw new IllegalArgumentException("Remote history and exception cannot both be non-null");
+            }
+        }
+
+        RemoteHistoryOrException(List<DiscoveryNode> remoteHistory) {
+            this(remoteHistory, null);
+        }
+
+        RemoteHistoryOrException(Exception exception) {
+            this(null, exception);
+        }
+    }
+}

+ 9 - 2
server/src/main/java/org/elasticsearch/node/Node.java

@@ -42,6 +42,7 @@ import org.elasticsearch.cluster.NodeConnectionsService;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
 import org.elasticsearch.cluster.coordination.Coordinator;
 import org.elasticsearch.cluster.coordination.InstanceHasMasterHealthIndicatorService;
+import org.elasticsearch.cluster.coordination.MasterHistoryService;
 import org.elasticsearch.cluster.desirednodes.DesiredNodesSettingsValidator;
 import org.elasticsearch.cluster.metadata.IndexMetadataVerifier;
 import org.elasticsearch.cluster.metadata.IndexTemplateMetadata;
@@ -900,7 +901,8 @@ public class Node implements Closeable {
                 clusterService.getClusterSettings()
             );
 
-            HealthService healthService = createHealthService(clusterService, clusterModule);
+            MasterHistoryService masterHistoryService = new MasterHistoryService(transportService, threadPool, clusterService);
+            HealthService healthService = createHealthService(clusterService, clusterModule, masterHistoryService);
 
             modules.add(b -> {
                 b.bind(Node.class).toInstance(this);
@@ -983,6 +985,7 @@ public class Node implements Closeable {
                 b.bind(IndexSettingProviders.class).toInstance(indexSettingProviders);
                 b.bind(DesiredNodesSettingsValidator.class).toInstance(desiredNodesSettingsValidator);
                 b.bind(HealthService.class).toInstance(healthService);
+                b.bind(MasterHistoryService.class).toInstance(masterHistoryService);
                 b.bind(StatsRequestLimiter.class).toInstance(statsRequestLimiter);
             });
 
@@ -1039,7 +1042,11 @@ public class Node implements Closeable {
         }
     }
 
-    private HealthService createHealthService(ClusterService clusterService, ClusterModule clusterModule) {
+    private HealthService createHealthService(
+        ClusterService clusterService,
+        ClusterModule clusterModule,
+        MasterHistoryService masterHistoryService
+    ) {
         List<HealthIndicatorService> preflightHealthIndicatorServices = Collections.singletonList(
             new InstanceHasMasterHealthIndicatorService(clusterService)
         );

+ 109 - 0
server/src/test/java/org/elasticsearch/action/admin/cluster/coordination/MasterHistoryActionTests.java

@@ -0,0 +1,109 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.action.admin.cluster.coordination;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.support.ActionFilters;
+import org.elasticsearch.cluster.coordination.MasterHistory;
+import org.elasticsearch.cluster.coordination.MasterHistoryService;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.EqualsHashCodeTestUtils;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.transport.TransportService;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class MasterHistoryActionTests extends ESTestCase {
+    public void testSerialization() {
+        List<DiscoveryNode> masterHistory = List.of(
+            new DiscoveryNode("_id1", buildNewFakeTransportAddress(), Version.CURRENT),
+            new DiscoveryNode("_id2", buildNewFakeTransportAddress(), Version.CURRENT)
+        );
+        MasterHistoryAction.Response response = new MasterHistoryAction.Response(masterHistory);
+        EqualsHashCodeTestUtils.checkEqualsAndHashCode(
+            response,
+            history -> copyWriteable(history, writableRegistry(), MasterHistoryAction.Response::new),
+            this::mutateMasterHistoryResponse
+        );
+
+        MasterHistoryAction.Request request = new MasterHistoryAction.Request();
+        EqualsHashCodeTestUtils.checkEqualsAndHashCode(
+            request,
+            history -> copyWriteable(history, writableRegistry(), MasterHistoryAction.Request::new)
+        );
+    }
+
+    private MasterHistoryAction.Response mutateMasterHistoryResponse(MasterHistoryAction.Response originalResponse) {
+        List<DiscoveryNode> nodes = originalResponse.getMasterHistory();
+        switch (randomIntBetween(1, 4)) {
+            case 1 -> {
+                List<DiscoveryNode> newNodes = new ArrayList<>(nodes);
+                newNodes.add(new DiscoveryNode("_id3", buildNewFakeTransportAddress(), Version.CURRENT));
+                return new MasterHistoryAction.Response(newNodes);
+            }
+            case 2 -> {
+                List<DiscoveryNode> newNodes = new ArrayList<>(nodes);
+                newNodes.remove(0);
+                return new MasterHistoryAction.Response(newNodes);
+            }
+            case 3 -> {
+                List<DiscoveryNode> newNodes = new ArrayList<>(nodes);
+                newNodes.remove(0);
+                newNodes.add(0, new DiscoveryNode("_id1", buildNewFakeTransportAddress(), Version.CURRENT));
+                return new MasterHistoryAction.Response(newNodes);
+            }
+            case 4 -> {
+                List<DiscoveryNode> newNodes = new ArrayList<>(nodes);
+                newNodes.remove(0);
+                newNodes.add(0, null);
+                return new MasterHistoryAction.Response(newNodes);
+            }
+            default -> throw new IllegalStateException();
+        }
+    }
+
+    public void testTransportDoExecute() {
+        TransportService transportService = mock(TransportService.class);
+        ActionFilters actionFilters = mock(ActionFilters.class);
+        MasterHistoryService masterHistoryService = mock(MasterHistoryService.class);
+        ClusterService clusterService = mock(ClusterService.class);
+        when(clusterService.getSettings()).thenReturn(Settings.EMPTY);
+        ThreadPool threadPool = mock(ThreadPool.class);
+        when(threadPool.relativeTimeInMillis()).thenReturn(System.currentTimeMillis());
+        MasterHistory masterHistory = new MasterHistory(threadPool, clusterService);
+        when(masterHistoryService.getLocalMasterHistory()).thenReturn(masterHistory);
+        MasterHistoryAction.TransportAction action = new MasterHistoryAction.TransportAction(
+            transportService,
+            actionFilters,
+            masterHistoryService
+        );
+        final List<List<DiscoveryNode>> result = new ArrayList<>();
+        ActionListener<MasterHistoryAction.Response> listener = new ActionListener<>() {
+            @Override
+            public void onResponse(MasterHistoryAction.Response response) {
+                result.add(response.getMasterHistory());
+            }
+
+            @Override
+            public void onFailure(Exception e) {
+                fail("Not expecting failure");
+            }
+        };
+        action.doExecute(null, new MasterHistoryAction.Request(), listener);
+        assertEquals(masterHistory.getNodes(), result.get(0));
+    }
+}

+ 62 - 0
server/src/test/java/org/elasticsearch/cluster/coordination/MasterHistoryServiceTests.java

@@ -0,0 +1,62 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.coordination;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.transport.TransportService;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.equalTo;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class MasterHistoryServiceTests extends ESTestCase {
+
+    public void testGetRemoteHistory() throws Exception {
+        MasterHistoryService masterHistoryService = createMasterHistoryService();
+        List<DiscoveryNode> remoteHistory = masterHistoryService.getRemoteMasterHistory();
+        assertNull(remoteHistory);
+        DiscoveryNode masterNode = new DiscoveryNode(UUID.randomUUID().toString(), buildNewFakeTransportAddress(), Version.CURRENT);
+        List<DiscoveryNode> masterHistory = new ArrayList<>();
+        masterHistory.add(masterNode);
+        masterHistory.add(null);
+        masterHistory.add(masterNode);
+        masterHistory.add(null);
+        masterHistory.add(masterNode);
+        masterHistory.add(null);
+        masterHistory.add(masterNode);
+        masterHistoryService.remoteHistoryOrException = new MasterHistoryService.RemoteHistoryOrException(masterHistory);
+        remoteHistory = masterHistoryService.getRemoteMasterHistory();
+        assertThat(remoteHistory, equalTo(masterHistory));
+        Exception exception = new Exception("Something happened");
+        masterHistoryService.remoteHistoryOrException = new MasterHistoryService.RemoteHistoryOrException(exception);
+        assertThat(
+            expectThrows(Exception.class, masterHistoryService::getRemoteMasterHistory).getMessage(),
+            containsString("Something happened")
+        );
+    }
+
+    private static MasterHistoryService createMasterHistoryService() throws Exception {
+        var clusterService = mock(ClusterService.class);
+        when(clusterService.getSettings()).thenReturn(Settings.EMPTY);
+        ThreadPool threadPool = mock(ThreadPool.class);
+        when(threadPool.relativeTimeInMillis()).thenReturn(System.currentTimeMillis());
+        TransportService transportService = mock(TransportService.class);
+        return new MasterHistoryService(transportService, threadPool, clusterService);
+    }
+}

+ 178 - 0
server/src/test/java/org/elasticsearch/cluster/coordination/MasterHistoryTests.java

@@ -0,0 +1,178 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.coordination;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.ClusterChangedEvent;
+import org.elasticsearch.cluster.ClusterName;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.metadata.Metadata;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.cluster.routing.RoutingTable;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.junit.Before;
+
+import java.net.UnknownHostException;
+import java.util.UUID;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class MasterHistoryTests extends ESTestCase {
+
+    private static final String TEST_SOURCE = "test";
+
+    private ClusterState nullMasterClusterState;
+    private ClusterState node1MasterClusterState;
+    private ClusterState node2MasterClusterState;
+    private ClusterState node3MasterClusterState;
+
+    /**
+     * Builds one cluster state without a master and three cluster states that each have a different, randomly named master.
+     */
+    @Before
+    public void setup() throws Exception {
+        String firstId = randomNodeId();
+        String secondId = randomNodeId();
+        String thirdId = randomNodeId();
+        nullMasterClusterState = createClusterState(null);
+        node1MasterClusterState = createClusterState(firstId);
+        node2MasterClusterState = createClusterState(secondId);
+        node3MasterClusterState = createClusterState(thirdId);
+    }
+
+    /**
+     * Feeds a sequence of master changes into the history and checks the most recent master after each one.
+     */
+    public void testGetBasicUse() {
+        ThreadPool threadPool = mock(ThreadPool.class);
+        when(threadPool.relativeTimeInMillis()).thenReturn(System.currentTimeMillis());
+        MasterHistory masterHistory = createMasterHistory(threadPool);
+        assertNull(masterHistory.getMostRecentMaster());
+
+        masterHistory.clusterChanged(event(nullMasterClusterState, nullMasterClusterState));
+        assertNull(masterHistory.getMostRecentMaster());
+
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        assertThat(masterHistory.getMostRecentMaster(), equalTo(masterOf(node1MasterClusterState)));
+
+        masterHistory.clusterChanged(event(node2MasterClusterState, node1MasterClusterState));
+        assertThat(masterHistory.getMostRecentMaster(), equalTo(masterOf(node2MasterClusterState)));
+
+        masterHistory.clusterChanged(event(node3MasterClusterState, node2MasterClusterState));
+        assertThat(masterHistory.getMostRecentMaster(), equalTo(masterOf(node3MasterClusterState)));
+
+        masterHistory.clusterChanged(event(node1MasterClusterState, node3MasterClusterState));
+        assertThat(masterHistory.getMostRecentMaster(), equalTo(masterOf(node1MasterClusterState)));
+
+        masterHistory.clusterChanged(event(nullMasterClusterState, node1MasterClusterState));
+        assertNull(masterHistory.getMostRecentMaster());
+        assertThat(masterHistory.getMostRecentNonNullMaster(), equalTo(masterOf(node1MasterClusterState)));
+    }
+
+    /**
+     * Records master changes while the mocked clock reports a time one hour in the past, checks when
+     * {@code hasMasterGoneNullAtLeastNTimes(3)} flips to true, and then checks that it is false again once the mocked
+     * clock reports the current time.
+     */
+    public void testHasMasterGoneNull() {
+        ThreadPool threadPool = mock(ThreadPool.class);
+        MasterHistory masterHistory = createMasterHistory(threadPool);
+        long oneHourAgo = System.currentTimeMillis() - (60 * 60 * 1000);
+        when(threadPool.relativeTimeInMillis()).thenReturn(oneHourAgo);
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(nullMasterClusterState, nullMasterClusterState));
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(nullMasterClusterState, node1MasterClusterState));
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(node2MasterClusterState, nullMasterClusterState));
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(nullMasterClusterState, node2MasterClusterState));
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(nullMasterClusterState, node1MasterClusterState));
+        assertTrue(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        assertTrue(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+
+        // Move the mocked clock forward by an hour
+        when(threadPool.relativeTimeInMillis()).thenReturn(System.currentTimeMillis());
+        assertFalse(masterHistory.hasMasterGoneNullAtLeastNTimes(3));
+    }
+
+    /**
+     * Records master changes while the mocked clock reports a time one hour in the past and checks that the most recent
+     * master is still reported after the mocked clock is moved to the current time.
+     */
+    public void testTime() {
+        ThreadPool threadPool = mock(ThreadPool.class);
+        MasterHistory masterHistory = createMasterHistory(threadPool);
+        long oneHourAgo = System.currentTimeMillis() - (60 * 60 * 1000);
+        when(threadPool.relativeTimeInMillis()).thenReturn(oneHourAgo);
+
+        masterHistory.clusterChanged(event(nullMasterClusterState, nullMasterClusterState));
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        masterHistory.clusterChanged(event(node2MasterClusterState, node1MasterClusterState));
+        masterHistory.clusterChanged(event(node3MasterClusterState, node2MasterClusterState));
+        assertThat(masterHistory.getMostRecentMaster(), equalTo(masterOf(node3MasterClusterState)));
+
+        // Move the mocked clock forward by an hour
+        when(threadPool.relativeTimeInMillis()).thenReturn(System.currentTimeMillis());
+        assertThat(masterHistory.getMostRecentMaster(), equalTo(masterOf(node3MasterClusterState)));
+        assertTrue(masterHistory.hasSeenMasterInLastNSeconds(5));
+    }
+
+    /**
+     * Checks the value of {@code MasterHistory.getNumberOfMasterIdentityChanges} after each of a sequence of master changes.
+     */
+    public void testGetNumberOfMasterChanges() {
+        MasterHistory masterHistory = createMasterHistory(mock(ThreadPool.class));
+        assertThat(identityChanges(masterHistory), equalTo(0));
+
+        // The first master doesn't count as a change
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        assertThat(identityChanges(masterHistory), equalTo(0));
+
+        // Nulls don't count
+        masterHistory.clusterChanged(event(nullMasterClusterState, node1MasterClusterState));
+        assertThat(identityChanges(masterHistory), equalTo(0));
+
+        // Still no change in the last non-null master
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        assertThat(identityChanges(masterHistory), equalTo(0));
+
+        // Nulls don't count
+        masterHistory.clusterChanged(event(nullMasterClusterState, node1MasterClusterState));
+        assertThat(identityChanges(masterHistory), equalTo(0));
+
+        // Finally a new non-null master
+        masterHistory.clusterChanged(event(node2MasterClusterState, node1MasterClusterState));
+        assertThat(identityChanges(masterHistory), equalTo(1));
+
+        // Nulls don't count
+        masterHistory.clusterChanged(event(nullMasterClusterState, node2MasterClusterState));
+        assertThat(identityChanges(masterHistory), equalTo(1));
+
+        // Back to node1, but it's a change from node2
+        masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+        assertThat(identityChanges(masterHistory), equalTo(2));
+    }
+
+    /**
+     * Records twice {@code MasterHistory.MAX_HISTORY_SIZE} master changes and checks that the number of nodes returned by
+     * the history does not exceed {@code MasterHistory.MAX_HISTORY_SIZE}.
+     */
+    public void testMaxSize() {
+        MasterHistory masterHistory = createMasterHistory(mock(ThreadPool.class));
+        for (int round = 0; round < MasterHistory.MAX_HISTORY_SIZE; round++) {
+            masterHistory.clusterChanged(event(node1MasterClusterState, nullMasterClusterState));
+            masterHistory.clusterChanged(event(nullMasterClusterState, node1MasterClusterState));
+        }
+        assertThat(masterHistory.getNodes().size(), lessThanOrEqualTo(MasterHistory.MAX_HISTORY_SIZE));
+    }
+
+    /**
+     * Creates a {@link MasterHistory} on top of the given thread pool and a mocked cluster service that reports empty settings.
+     */
+    private static MasterHistory createMasterHistory(ThreadPool threadPool) {
+        ClusterService clusterService = mock(ClusterService.class);
+        when(clusterService.getSettings()).thenReturn(Settings.EMPTY);
+        return new MasterHistory(threadPool, clusterService);
+    }
+
+    /**
+     * Creates a {@link ClusterChangedEvent} with the test source; the arguments are passed to the constructor in the order given.
+     */
+    private static ClusterChangedEvent event(ClusterState state, ClusterState previousState) {
+        return new ClusterChangedEvent(TEST_SOURCE, state, previousState);
+    }
+
+    /**
+     * Returns the master node of the given cluster state.
+     */
+    private static DiscoveryNode masterOf(ClusterState state) {
+        return state.nodes().getMasterNode();
+    }
+
+    /**
+     * Returns the number of master identity changes computed from the nodes currently returned by the given history.
+     */
+    private static int identityChanges(MasterHistory masterHistory) {
+        return MasterHistory.getNumberOfMasterIdentityChanges(masterHistory.getNodes());
+    }
+
+    private static String randomNodeId() {
+        return UUID.randomUUID().toString();
+    }
+
+    /**
+     * Builds a cluster state named "test-cluster" with an empty routing table and empty metadata.
+     *
+     * @param masterNodeId id of the single node to add and mark as master, or {@code null} to build a state with no nodes
+     */
+    private static ClusterState createClusterState(String masterNodeId) throws UnknownHostException {
+        DiscoveryNodes.Builder nodes = DiscoveryNodes.builder();
+        if (masterNodeId != null) {
+            DiscoveryNode master = new DiscoveryNode(masterNodeId, buildNewFakeTransportAddress(), Version.CURRENT);
+            nodes.masterNodeId(masterNodeId);
+            nodes.add(master);
+        }
+        ClusterState.Builder state = ClusterState.builder(new ClusterName("test-cluster"));
+        state.routingTable(RoutingTable.builder().build());
+        state.metadata(Metadata.builder().build());
+        state.nodes(nodes);
+        return state.build();
+    }
+}

+ 1 - 0
x-pack/plugin/security/qa/operator-privileges-tests/src/javaRestTest/java/org/elasticsearch/xpack/security/operator/Constants.java

@@ -248,6 +248,7 @@ public class Constants {
         "cluster:internal/xpack/ml/trained_models/cache/info",
         "cluster:internal/xpack/ml/trained_models/deployments/stats/get",
         "cluster:internal/xpack/transform/reset_mode",
+        "cluster:internal/master_history/get",
         "cluster:monitor/allocation/explain",
         "cluster:monitor/async_search/status",
         "cluster:monitor/ccr/follow_info",