
Distinguish timeouts/failures in follower checker (#75519)

Today, when a node is removed from the cluster because of repeated
follower check failures, it is not clear whether those failures are
due to active rejections or timeouts. It is important for further
investigation to distinguish these two cases. This commit adds
separate tracking for the number of timeouts and the number of active
failures, and includes both counts in the removal reason.
David Turner, 4 years ago
commit b6efb38aae
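
In outline, the change keeps two per-follower counters, resets both on a
successful check, and folds their sum into the retry-count comparison and the
removal reason. The standalone sketch below (hypothetical class name
FollowerCheckCountersSketch, not part of this commit) illustrates that counting
pattern in isolation; in the real FollowersChecker the timeout case is detected
via ReceiveTimeoutTransportException, as shown in the diff that follows.

    // Illustrative sketch only: mirrors the counting and reason-formatting
    // pattern added by this commit, outside the real FollowersChecker.
    public class FollowerCheckCountersSketch {
        private int failureCountSinceLastSuccess;
        private int timeoutCountSinceLastSuccess;

        // A successful check resets both counters.
        void onSuccess() {
            failureCountSinceLastSuccess = 0;
            timeoutCountSinceLastSuccess = 0;
        }

        // Record one failed check, distinguishing timeouts from active rejections.
        void onCheckException(boolean wasTimeout) {
            if (wasTimeout) {
                timeoutCountSinceLastSuccess++;
            } else {
                failureCountSinceLastSuccess++;
            }
        }

        // The follower is removed once the combined count reaches the retry limit.
        boolean retryCountExceeded(int followerCheckRetryCount) {
            return failureCountSinceLastSuccess + timeoutCountSinceLastSuccess >= followerCheckRetryCount;
        }

        // Removal reason reporting both counts, in the format used by the commit.
        String removalReason() {
            return "followers check retry count exceeded [timeouts=" + timeoutCountSinceLastSuccess
                + ", failures=" + failureCountSinceLastSuccess + "]";
        }

        public static void main(String[] args) {
            FollowerCheckCountersSketch counters = new FollowerCheckCountersSketch();
            counters.onCheckException(true);   // timeout
            counters.onCheckException(false);  // active rejection
            counters.onCheckException(true);   // timeout
            System.out.println(counters.retryCountExceeded(3)); // true
            System.out.println(counters.removalReason());
            // -> followers check retry count exceeded [timeouts=2, failures=1]
        }
    }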

+ 12 - 3
server/src/main/java/org/elasticsearch/cluster/coordination/FollowersChecker.java

@@ -24,6 +24,7 @@ import org.elasticsearch.monitor.NodeHealthService;
 import org.elasticsearch.monitor.StatusInfo;
 import org.elasticsearch.threadpool.ThreadPool.Names;
 import org.elasticsearch.transport.ConnectTransportException;
+import org.elasticsearch.transport.ReceiveTimeoutTransportException;
 import org.elasticsearch.transport.Transport;
 import org.elasticsearch.transport.TransportChannel;
 import org.elasticsearch.transport.TransportConnectionListener;
@@ -261,6 +262,7 @@ public class FollowersChecker {
     private class FollowerChecker {
         private final DiscoveryNode discoveryNode;
         private int failureCountSinceLastSuccess;
+        private int timeoutCountSinceLastSuccess;
 
         FollowerChecker(DiscoveryNode discoveryNode) {
             this.discoveryNode = discoveryNode;
@@ -296,6 +298,7 @@ public class FollowersChecker {
                         }
 
                         failureCountSinceLastSuccess = 0;
+                        timeoutCountSinceLastSuccess = 0;
                         logger.trace("{} check successful", FollowerChecker.this);
                         scheduleNextWakeUp();
                     }
@@ -307,7 +310,11 @@ public class FollowersChecker {
                             return;
                         }
 
-                        failureCountSinceLastSuccess++;
+                        if (exp instanceof ReceiveTimeoutTransportException) {
+                            timeoutCountSinceLastSuccess++;
+                        } else {
+                            failureCountSinceLastSuccess++;
+                        }
 
                         final String reason;
                         if (exp instanceof ConnectTransportException
@@ -317,9 +324,10 @@ public class FollowersChecker {
                         } else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
                             logger.debug(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
                             reason = "health check failed";
-                        } else if (failureCountSinceLastSuccess >= followerCheckRetryCount) {
+                        } else if (failureCountSinceLastSuccess + timeoutCountSinceLastSuccess >= followerCheckRetryCount) {
                             logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
-                            reason = "followers check retry count exceeded";
+                            reason = "followers check retry count exceeded [timeouts=" + timeoutCountSinceLastSuccess +
+                                ", failures=" + failureCountSinceLastSuccess + "]";
                         } else {
                             logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
                             scheduleNextWakeUp();
@@ -373,6 +381,7 @@ public class FollowersChecker {
             return "FollowerChecker{" +
                 "discoveryNode=" + discoveryNode +
                 ", failureCountSinceLastSuccess=" + failureCountSinceLastSuccess +
+                ", timeoutCountSinceLastSuccess=" + timeoutCountSinceLastSuccess +
                 ", [" + FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey() + "]=" + followerCheckRetryCount +
                 '}';
         }

+ 78 - 6
server/src/test/java/org/elasticsearch/cluster/coordination/FollowersCheckerTests.java

@@ -63,6 +63,7 @@ import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.nullValue;
 import static org.hamcrest.core.IsInstanceOf.instanceOf;
@@ -160,7 +161,7 @@ public class FollowersCheckerTests extends ESTestCase {
     public void testFailsNodeThatDoesNotRespond() {
         final Settings settings = randomSettings();
         testBehaviourOfFailingNode(settings, () -> null,
-            "followers check retry count exceeded",
+            "followers check retry count exceeded [timeouts=" + FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) + ", failures=0]",
             (FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
                 + FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) * FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings).millis(),
             () -> new StatusInfo(HEALTHY, "healthy-info"));
@@ -171,18 +172,61 @@ public class FollowersCheckerTests extends ESTestCase {
         testBehaviourOfFailingNode(settings, () -> {
                 throw new ElasticsearchException("simulated exception");
             },
-            "followers check retry count exceeded",
+            "followers check retry count exceeded [timeouts=0, failures=" + FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) + "]",
             (FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis(),
             () -> new StatusInfo(HEALTHY, "healthy-info"));
     }
 
+    public void testFailsNodeThatRejectsCheckAndDoesNotRespond() {
+        final Settings settings = randomSettings();
+        final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
+        final int timeoutCount = between(0, retryCount);
+        final int failureCount = retryCount - timeoutCount;
+
+        testBehaviourOfFailingNode(
+            settings,
+            new Supplier<Empty>() {
+
+                private int timeoutsRemaining;
+                private int failuresRemaining;
+
+                @Override
+                public Empty get() {
+                    if (timeoutsRemaining == 0 && failuresRemaining == 0) {
+                        // node was added, reset counters
+                        timeoutsRemaining = timeoutCount;
+                        failuresRemaining = failureCount;
+                    }
+                    if (timeoutsRemaining == 0) {
+                        assertThat(failuresRemaining--, greaterThan(0));
+                        throw new ElasticsearchException("simulated exception");
+                    } else if (failuresRemaining == 0) {
+                        assertThat(timeoutsRemaining--, greaterThan(0));
+                        return null;
+                    } else if (randomBoolean()) {
+                        assertThat(failuresRemaining--, greaterThan(0));
+                        throw new ElasticsearchException("simulated exception");
+                    } else {
+                        assertThat(timeoutsRemaining--, greaterThan(0));
+                        return null;
+                    }
+                }
+            },
+            "followers check retry count exceeded [timeouts=" + timeoutCount + ", failures=" + failureCount + "]",
+            (retryCount - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis()
+                + timeoutCount * FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings).millis(),
+            () -> new StatusInfo(HEALTHY, "healthy-info"));
+    }
+
     public void testFailureCounterResetsOnSuccess() {
         final Settings settings = randomSettings();
         final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
         final int maxRecoveries = randomIntBetween(3, 10);
 
         // passes just enough checks to keep it alive, up to maxRecoveries, and then fails completely
-        testBehaviourOfFailingNode(settings, new Supplier<Empty>() {
+        testBehaviourOfFailingNode(
+            settings,
+            new Supplier<Empty>() {
                 private int checkIndex;
                 private int recoveries;
 
@@ -196,9 +240,37 @@ public class FollowersCheckerTests extends ESTestCase {
                     throw new ElasticsearchException("simulated exception");
                 }
             },
-            "followers check retry count exceeded",
-            (FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) * (maxRecoveries + 1) - 1)
-                * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis(), () -> new StatusInfo(HEALTHY, "healthy-info"));
+            "followers check retry count exceeded [timeouts=0, failures=" + retryCount + "]",
+            (retryCount * (maxRecoveries + 1) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis(),
+            () -> new StatusInfo(HEALTHY, "healthy-info"));
+    }
+
+    public void testTimeoutCounterResetsOnSuccess() {
+        final Settings settings = randomSettings();
+        final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
+        final int maxRecoveries = randomIntBetween(3, 10);
+
+        // passes just enough checks to keep it alive, up to maxRecoveries, and then fails completely
+        testBehaviourOfFailingNode(
+            settings,
+            new Supplier<Empty>() {
+                private int checkIndex;
+                private int recoveries;
+
+                @Override
+                public Empty get() {
+                    checkIndex++;
+                    if (checkIndex % retryCount == 0 && recoveries < maxRecoveries) {
+                        recoveries++;
+                        return Empty.INSTANCE;
+                    }
+                    return null;
+                }
+            },
+            "followers check retry count exceeded [timeouts=" + retryCount + ", failures=0]",
+            (retryCount * (maxRecoveries + 1) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis() +
+                (retryCount * (maxRecoveries + 1) - maxRecoveries) * FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings).millis(),
+            () -> new StatusInfo(HEALTHY, "healthy-info"));
     }
 
     public void testFailsNodeThatIsDisconnected() {