Browse Source

Reduce follower cluster state lag timeout for disruption tests (#108691)

It's possible for a node-left task to get interrupted prior to removing
the node from the master's list of faultyNodes. Nodes on the faultyNodes
list do not receive cluster state updates, and are eventually removed.

Subsequently, when the node attempts to rejoin, after test network
disruptions have ceased, the node-join request can succeed, but the
node will never receive the cluster state update, consider the node-join
a failure, and will resend node-join requests until the LagDetector
removes the node from the faultyNodes list.
https://github.com/elastic/elasticsearch/issues/108690 will address the
node-join issue.

Closes #91447
Dianna Hohensee 1 year ago
parent
commit
4700027565

+ 5 - 5
server/src/internalClusterTest/java/org/elasticsearch/discovery/ClusterDisruptionIT.java

@@ -40,7 +40,7 @@ import org.elasticsearch.test.disruption.NetworkDisruption;
 import org.elasticsearch.test.disruption.NetworkDisruption.Bridge;
 import org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions;
 import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
-import org.elasticsearch.test.junit.annotations.TestIssueLogging;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.transport.MockTransportService;
 import org.elasticsearch.xcontent.XContentType;
 
@@ -94,17 +94,17 @@ public class ClusterDisruptionIT extends AbstractDisruptionTestCase {
     }
 
     /**
-     * Test that we do not loose document whose indexing request was successful, under a randomly selected disruption scheme
+     * Test that we do not lose documents, indexed via requests that return success, under randomly selected disruption schemes.
      * We also collect & report the type of indexing failures that occur.
      * <p>
-     * This test is a superset of tests run in the Jepsen test suite, with the exception of versioned updates
+     * This test is a superset of tests run in the Jepsen test suite, with the exception of versioned updates.
      */
-    @TestIssueLogging(
+    @TestLogging(
         value = "_root:DEBUG,org.elasticsearch.action.bulk:TRACE,org.elasticsearch.action.get:TRACE,"
             + "org.elasticsearch.discovery:TRACE,org.elasticsearch.action.support.replication:TRACE,"
             + "org.elasticsearch.cluster.service:TRACE,org.elasticsearch.indices.recovery:TRACE,"
             + "org.elasticsearch.indices.cluster:TRACE,org.elasticsearch.index.shard:TRACE",
-        issueUrl = "https://github.com/elastic/elasticsearch/issues/41068"
+        reason = "Past failures have required a lot of additional logging to debug"
     )
     public void testAckedIndexing() throws Exception {
 

+ 2 - 0
server/src/test/java/org/elasticsearch/discovery/AbstractDisruptionTestCase.java

@@ -13,6 +13,7 @@ import org.elasticsearch.cluster.block.ClusterBlock;
 import org.elasticsearch.cluster.block.ClusterBlockLevel;
 import org.elasticsearch.cluster.coordination.Coordinator;
 import org.elasticsearch.cluster.coordination.FollowersChecker;
+import org.elasticsearch.cluster.coordination.LagDetector;
 import org.elasticsearch.cluster.coordination.LeaderChecker;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.common.settings.Settings;
@@ -116,6 +117,7 @@ public abstract class AbstractDisruptionTestCase extends ESIntegTestCase {
         .put(FollowersChecker.FOLLOWER_CHECK_TIMEOUT_SETTING.getKey(), "5s") // for hitting simulated network failures quickly
         .put(FollowersChecker.FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey(), 1) // for hitting simulated network failures quickly
         .put(Coordinator.PUBLISH_TIMEOUT_SETTING.getKey(), "5s") // <-- for hitting simulated network failures quickly
+        .put(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.getKey(), "5s") // remove lagging nodes quickly so they can rejoin
         .put(TransportSettings.CONNECT_TIMEOUT.getKey(), "10s") // Network delay disruption waits for the min between this
         // value and the time of disruption and does not recover immediately
         // when disruption is stop. We should make sure we recover faster