Browse Source

Have a dedicated join timeout that is higher than ping.timeout for node join
Using ping.timeout, which defaults to 3s, to use as a timeout value on the join request a node makes to the master once its discovered can be too small, specifically when there is a large cluster state involved (and by definition, all the buffers and such on the nio layer will be "cold"). Introduce a dedicated join.timeout setting, that by default is 10x the ping.timeout (so 30s by default).
closes #6342

Shay Banon 11 years ago
parent
commit
9c98bb3554

+ 3 - 1
docs/reference/modules/discovery/zen.asciidoc

@@ -72,7 +72,9 @@ As part of the initial ping process a master of the cluster is either
 elected or joined to. This is done automatically. The
 `discovery.zen.ping_timeout` (which defaults to `3s`) allows to
 configure the election to handle cases of slow or congested networks
-(higher values assure less chance of failure). 
+(higher values assure less chance of failure). Once a node joins, it
+ will send a join request to the master (`discovery.zen.join_timeout`)
+ with a timeout defaulting at 10 times the ping timeout.
 
 Nodes can be excluded from becoming a master by setting `node.master` to
 `false`. Note, once a node is a client node (`node.client` set to

+ 5 - 3
src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java

@@ -93,6 +93,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
 
 
     private final TimeValue pingTimeout;
+    private final TimeValue joinTimeout;
 
     // a flag that should be used only for testing
     private final boolean sendLeaveRequest;
@@ -134,12 +135,13 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
 
         // also support direct discovery.zen settings, for cases when it gets extended
         this.pingTimeout = settings.getAsTime("discovery.zen.ping.timeout", settings.getAsTime("discovery.zen.ping_timeout", componentSettings.getAsTime("ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
+        this.joinTimeout = settings.getAsTime("discovery.zen.join_timeout", TimeValue.timeValueMillis(pingTimeout.millis() * 10));
         this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);
 
         this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
         this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false);
 
-        logger.debug("using ping.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
+        logger.debug("using ping.timeout [{}], join.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, joinTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes);
 
         this.electMaster = new ElectMasterService(settings);
         nodeSettingsService.addListener(new ApplySettings());
@@ -343,7 +345,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
                 }
                 // send join request
                 try {
-                    membership.sendJoinRequestBlocking(masterNode, localNode, pingTimeout);
+                    membership.sendJoinRequestBlocking(masterNode, localNode, joinTimeout);
                 } catch (Exception e) {
                     if (e instanceof ElasticsearchException) {
                         logger.info("failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage());
@@ -713,7 +715,7 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen
 
             // validate the join request, will throw a failure if it fails, which will get back to the
             // node calling the join request
-            membership.sendValidateJoinRequestBlocking(node, state, pingTimeout);
+            membership.sendValidateJoinRequestBlocking(node, state, joinTimeout);
 
             clusterService.submitStateUpdateTask("zen-disco-receive(join from node[" + node + "])", Priority.IMMEDIATE, new ClusterStateUpdateTask() {
                 @Override