Răsfoiți Sursa

Remove `PeerFinder` request timeout (#134365)

There's really no need to time out so enthusiastically here, we can wait
for as long as it takes to receive a list of other discovery targets
from the remote peer. This commit removes the timeout, deferring failure
detection down to the TCP level (e.g. keepalives) as managed by the OS.

Relates #132713, #123568
David Turner 1 lună în urmă
părinte
comite
2faa714006

+ 13 - 0
docs/changelog/134365.yaml

@@ -0,0 +1,13 @@
+pr: 134365
+summary: Remove `PeerFinder` request timeout
+area: Cluster Coordination
+type: deprecation
+issues: []
+deprecation:
+  title: Remove `PeerFinder` request timeout
+  area: Cluster and node setting
+  details: >-
+    There is no need to time out requests sent by the `PeerFinder` during discovery and cluster formation, and this
+    timeout may sometimes cause spurious failures. With this change the `PeerFinder` requests will wait indefinitely for
+    responses. The `discovery.request_peers_timeout` setting no longer has any effect.
+  impact: Discontinue use of the `discovery.request_peers_timeout` setting.

+ 1 - 1
docs/reference/elasticsearch/configuration-reference/discovery-cluster-formation-settings.md

@@ -50,7 +50,7 @@ If you adjust these settings then your cluster may not form correctly or may bec
 :   ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long to wait when attempting to identify the remote node via a handshake. Defaults to `30s`.
 
 `discovery.request_peers_timeout`
-:   ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long a node will wait after asking its peers again before considering the request to have failed. Defaults to `3s`.
+:   ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) In 9.1.x and earlier versions, sets how long a node will wait after asking its peers again before considering the request to have failed. Has no effect from version 9.2.0 onwards. Defaults to `3s`.
 
 `discovery.find_peers_warning_timeout`
 :   ([Static](docs-content://deploy-manage/stack-settings.md#static-cluster-setting)) Sets how long a node will attempt to discover its peers before it starts to log verbose messages describing why the connection attempts are failing. Defaults to `3m`.

+ 3 - 4
server/src/main/java/org/elasticsearch/discovery/PeerFinder.java

@@ -66,7 +66,8 @@ public abstract class PeerFinder {
         "discovery.request_peers_timeout",
         TimeValue.timeValueMillis(3000),
         TimeValue.timeValueMillis(1),
-        Setting.Property.NodeScope
+        Setting.Property.NodeScope,
+        Setting.Property.Deprecated
     );
 
     // We do not log connection failures immediately: some failures are expected, especially if the hosts list isn't perfectly up-to-date
@@ -81,7 +82,6 @@ public abstract class PeerFinder {
     );
 
     private final TimeValue findPeersInterval;
-    private final TimeValue requestPeersTimeout;
     private final TimeValue verbosityIncreaseTimeout;
 
     private final Object mutex = new Object();
@@ -106,7 +106,6 @@ public abstract class PeerFinder {
         ConfiguredHostsResolver configuredHostsResolver
     ) {
         findPeersInterval = DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(settings);
-        requestPeersTimeout = DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(settings);
         verbosityIncreaseTimeout = VERBOSITY_INCREASE_TIMEOUT_SETTING.get(settings);
         this.transportService = transportService;
         this.clusterCoordinationExecutor = transportService.getThreadPool().executor(Names.CLUSTER_COORDINATION);
@@ -571,7 +570,7 @@ public abstract class PeerFinder {
                 discoveryNode,
                 REQUEST_PEERS_ACTION_NAME,
                 new PeersRequest(getLocalNode(), knownNodes),
-                TransportRequestOptions.timeout(requestPeersTimeout),
+                TransportRequestOptions.EMPTY,
                 peersResponseHandler
             );
         }

+ 0 - 46
server/src/test/java/org/elasticsearch/discovery/PeerFinderTests.java

@@ -750,52 +750,6 @@ public class PeerFinderTests extends ESTestCase {
         assertFoundPeers(otherNode);
     }
 
-    public void testTimesOutAndRetriesConnectionsToBlackholedNodes() {
-        final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");
-        final DiscoveryNode nodeToFind = newDiscoveryNode("node-to-find");
-
-        providedAddresses.add(otherNode.getAddress());
-        transportAddressConnector.addReachableNode(otherNode);
-        transportAddressConnector.addReachableNode(nodeToFind);
-
-        peerFinder.activate(lastAcceptedNodes);
-
-        while (true) {
-            deterministicTaskQueue.advanceTime();
-            runAllRunnableTasks(); // MockTransportAddressConnector verifies no multiple connection attempts
-            if (capturingTransport.getCapturedRequestsAndClear().length > 0) {
-                break;
-            }
-        }
-
-        final long timeoutAtMillis = deterministicTaskQueue.getCurrentTimeMillis() + PeerFinder.DISCOVERY_REQUEST_PEERS_TIMEOUT_SETTING.get(
-            Settings.EMPTY
-        ).millis();
-        while (deterministicTaskQueue.getCurrentTimeMillis() < timeoutAtMillis) {
-            assertFoundPeers(otherNode);
-            deterministicTaskQueue.advanceTime();
-            runAllRunnableTasks();
-        }
-
-        // need to wait for the connection to timeout, then for another wakeup, before discovering the peer
-        final long expectedTime = timeoutAtMillis + PeerFinder.DISCOVERY_FIND_PEERS_INTERVAL_SETTING.get(Settings.EMPTY).millis();
-
-        while (deterministicTaskQueue.getCurrentTimeMillis() < expectedTime) {
-            deterministicTaskQueue.advanceTime();
-            runAllRunnableTasks();
-        }
-
-        respondToRequests(node -> {
-            assertThat(node, is(otherNode));
-            return new PeersResponse(Optional.empty(), singletonList(nodeToFind), randomNonNegativeLong());
-        });
-
-        deterministicTaskQueue.advanceTime();
-        runAllRunnableTasks();
-
-        assertFoundPeers(nodeToFind, otherNode);
-    }
-
     @TestLogging(reason = "testing logging at WARN level", value = "org.elasticsearch.discovery:WARN")
     public void testLogsWarningsIfActiveForLongEnough() throws IllegalAccessException {
         final DiscoveryNode otherNode = newDiscoveryNode("node-from-hosts-list");