
Fix testCannotJoinIfMasterLostDataFolder

Relates to #41047
Yannick Welsch, 6 years ago
parent commit d292d95eaa

server/src/test/java/org/elasticsearch/discovery/ClusterDisruptionIT.java  (+18 -5)

@@ -31,6 +31,7 @@ import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.action.shard.ShardStateAction;
 import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
+import org.elasticsearch.cluster.coordination.LagDetector;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.routing.Murmur3HashFunction;
 import org.elasticsearch.cluster.routing.ShardRouting;
@@ -389,7 +390,6 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
         }
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/41047")
     public void testCannotJoinIfMasterLostDataFolder() throws Exception {
         String masterNode = internalCluster().startMasterOnlyNode();
         String dataNode = internalCluster().startDataOnlyNode();
@@ -402,7 +402,18 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
 
             @Override
             public Settings onNodeStopped(String nodeName) {
-                return Settings.builder().put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName).build();
+                return Settings.builder()
+                    .put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName)
+                    /*
+                     * The data node might join while the master is not yet fully established, bypassing the join
+                     * validation that normally runs before a node is added to the cluster. Only the join validation
+                     * performed when handling the publish request takes place, but at that point the cluster state
+                     * has already been committed and will subsequently be exposed to the applier. The health check
+                     * below therefore sees a cluster state containing both nodes and assumes all is well, even though
+                     * the data node never accepted that state. Worse, it takes 90 seconds for the data node to be
+                     * kicked out of the cluster (lag detection). We speed this up here.
+                     */
+                    .put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "10s")
+                    .build();
             }
 
             @Override
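
For context elided from this hunk: the settings returned above come from the onNodeStopped hook of an InternalTestCluster.RestartCallback passed to internalCluster().restartNode(...). A minimal sketch of that pattern, assuming a clearData override to simulate the lost data folder (illustrative, not the verbatim test source):

    internalCluster().restartNode(masterNode, new InternalTestCluster.RestartCallback() {
        @Override
        public boolean clearData(String nodeName) {
            return true; // wipe the node's data folder across the restart
        }

        @Override
        public Settings onNodeStopped(String nodeName) {
            // settings returned here are applied when the node starts back up
            return Settings.builder()
                .put(ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING.getKey(), nodeName)
                .put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "10s")
                .build();
        }
    });
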
@@ -411,9 +422,11 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
             }
         });
 
-        assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
-        assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
-            .isTimedOut());
+        assertBusy(() -> {
+            assertFalse(internalCluster().client(masterNode).admin().cluster().prepareHealth().get().isTimedOut());
+            assertTrue(internalCluster().client(masterNode).admin().cluster().prepareHealth().setWaitForNodes("2").setTimeout("2s").get()
+                .isTimedOut());
+        }, 30, TimeUnit.SECONDS);
         internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNode)); // otherwise we will fail during clean-up
     }
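
The assertBusy wrapper retries both health checks for up to 30 seconds, since the lagging data node is only removed once lag detection fires after the (now shortened) 10-second timeout. A simplified sketch of the retry contract that ESTestCase.assertBusy provides (the real implementation grows its sleep interval between attempts; this version is illustrative only):

    // simplified model of ESTestCase.assertBusy(runnable, maxWait, unit);
    // assumes java.util.concurrent.TimeUnit and org.elasticsearch.common.CheckedRunnable
    static void assertBusy(CheckedRunnable<Exception> check, long maxWait, TimeUnit unit) throws Exception {
        long deadline = System.nanoTime() + unit.toNanos(maxWait);
        AssertionError last;
        do {
            try {
                check.run();
                return; // assertions passed, stop retrying
            } catch (AssertionError e) {
                last = e; // keep the most recent failure
            }
            Thread.sleep(100); // real implementation backs off incrementally
        } while (System.nanoTime() < deadline);
        throw last; // timed out: surface the last assertion failure
    }
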