Browse Source

Fixing the conditions for fetching remote master history (#89472)

Fixing the conditions that the health API uses to determine when to query a master node for its view
of master history when the elected master appears to have repeatedly transitioned to null.
Keith Massey, 3 years ago
parent
commit
5b3d51d30c

+ 6 - 0
docs/changelog/89472.yaml

@@ -0,0 +1,6 @@
+pr: 89472
+summary: Fixing the conditions for fetching remote master history
+area: Health
+type: bug
+issues:
+ - 89431

+ 4 - 8
server/src/internalClusterTest/java/org/elasticsearch/discovery/StableMasterDisruptionIT.java

@@ -430,28 +430,24 @@ public class StableMasterDisruptionIT extends ESIntegTestCase {
                 .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
                 .build()
         );
+        int nullTransitionsThreshold = 1;
         final List<String> dataNodes = internalCluster().startDataOnlyNodes(
             2,
             Settings.builder()
                 .put(LeaderChecker.LEADER_CHECK_TIMEOUT_SETTING.getKey(), "1s")
                 .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
-                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), 1)
+                .put(CoordinationDiagnosticsService.NO_MASTER_TRANSITIONS_THRESHOLD_SETTING.getKey(), nullTransitionsThreshold)
                 .put(CoordinationDiagnosticsService.NODE_HAS_MASTER_LOOKUP_TIMEFRAME_SETTING.getKey(), new TimeValue(60, TimeUnit.SECONDS))
                 .build()
         );
         ensureStableCluster(3);
-        for (int i = 0; i < 2; i++) {
+        for (int i = 0; i < nullTransitionsThreshold + 1; i++) {
             final String masterNode = masterNodes.get(0);
 
             // Simulating a painful gc by suspending all threads for a long time on the current elected master node.
             SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(random(), masterNode);
 
             final CountDownLatch dataNodeMasterSteppedDown = new CountDownLatch(2);
-            internalCluster().getInstance(ClusterService.class, masterNode).addListener(event -> {
-                if (event.state().nodes().getMasterNodeId() == null) {
-                    dataNodeMasterSteppedDown.countDown();
-                }
-            });
             internalCluster().getInstance(ClusterService.class, dataNodes.get(0)).addListener(event -> {
                 if (event.state().nodes().getMasterNodeId() == null) {
                     dataNodeMasterSteppedDown.countDown();
@@ -470,7 +466,7 @@ public class StableMasterDisruptionIT extends ESIntegTestCase {
             // Stop disruption
             logger.info("--> unfreezing node [{}]", masterNode);
             masterNodeDisruption.stopDisrupting();
-            ensureStableCluster(3);
+            ensureStableCluster(3, TimeValue.timeValueSeconds(30), false, randomFrom(dataNodes));
         }
         assertGreenMasterStability(internalCluster().client(randomFrom(dataNodes)));
     }

+ 9 - 1
server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationDiagnosticsService.java

@@ -741,8 +741,16 @@ public class CoordinationDiagnosticsService implements ClusterStateListener {
     public void clusterChanged(ClusterChangedEvent event) {
         DiscoveryNode currentMaster = event.state().nodes().getMasterNode();
         DiscoveryNode previousMaster = event.previousState().nodes().getMasterNode();
-        if (currentMaster == null && previousMaster != null) {
+        if ((currentMaster == null && previousMaster != null) || (currentMaster != null && previousMaster == null)) {
             if (masterHistoryService.getLocalMasterHistory().hasMasterGoneNullAtLeastNTimes(unacceptableNullTransitions)) {
+                /*
+                 * If the master node has been going to null repeatedly, we want to make a remote request to it to see what it thinks of
+                 * master stability. We want to query the most recent master whether the current master has just transitioned to null or
+                 * just transitioned from null to not null. The reason that we make the latter request is that sometimes when the elected
+                 * master goes to null the most recent master is not responsive for the duration of the request timeout (for example if
+                 * that node is in the middle of a long GC pause which would be both the reason for it not being master and the reason it
+                 * does not respond quickly to transport requests).
+                 */
                 DiscoveryNode master = masterHistoryService.getLocalMasterHistory().getMostRecentNonNullMaster();
                 /*
                  * If the most recent master was this box, there is no point in making a transport request -- we already know what this