Browse Source

Mute connection fail log during test cluster stop (#94523)

Currently a node connection failure is logged at the warn level every
six failures. While stopping a test cluster this often fills the logs
with many noisy logs. This is particularly true for large internal test
clusters. This commit temporarily raises the NodeConnectionsService log
to error during the cluster stop.
Tim Brooks 2 years ago
parent
commit
7166af0b72

+ 3 - 1
server/src/main/java/org/elasticsearch/cluster/NodeConnectionsService.java

@@ -307,7 +307,9 @@ public class NodeConnectionsService extends AbstractLifecycleComponent {
                 @Override
                 public void onFailure(Exception e) {
                     final int currentFailureCount = consecutiveFailureCount.incrementAndGet();
-                    // only warn every 6th failure
+                    // Only warn every 6th failure. We work around this log while stopping integ test clusters in InternalTestCluster#close
+                    // by temporarily raising the log level to ERROR. If the nature of this log changes in the future, that workaround might
+                    // need to be adjusted.
                     final Level level = currentFailureCount % 6 == 1 ? Level.WARN : Level.DEBUG;
                     logger.log(level, () -> format("failed to connect to %s (tried [%s] times)", discoveryNode, currentFailureCount), e);
                     setConnectionRef(null);

+ 9 - 0
test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java

@@ -13,6 +13,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 import com.carrotsearch.randomizedtesting.generators.RandomStrings;
 
+import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.store.AlreadyClosedException;
@@ -28,6 +29,7 @@ import org.elasticsearch.action.support.replication.TransportReplicationAction;
 import org.elasticsearch.client.internal.Client;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.NodeConnectionsService;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
 import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
 import org.elasticsearch.cluster.coordination.NoMasterBlockService;
@@ -49,6 +51,7 @@ import org.elasticsearch.common.breaker.CircuitBreaker;
 import org.elasticsearch.common.component.LifecycleListener;
 import org.elasticsearch.common.io.FileSystemUtils;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
+import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.settings.MockSecureSettings;
 import org.elasticsearch.common.settings.SecureSettings;
 import org.elasticsearch.common.settings.Settings;
@@ -898,10 +901,16 @@ public final class InternalTestCluster extends TestCluster {
                 activeDisruptionScheme.testClusterClosed();
                 activeDisruptionScheme = null;
             }
+            // There is a WARN level log in NodeConnectionsService which logs when a node connection attempt fails. As we stop nodes one by
+            // one, this logs a bunch of noise. Temporarily raise the log level to ERROR while stopping cluster.
+            Logger nodeConnectionLogger = LogManager.getLogger(NodeConnectionsService.class);
+            Level initialLogLevel = nodeConnectionLogger.getLevel();
+            Loggers.setLevel(nodeConnectionLogger, Level.ERROR);
             try {
                 IOUtils.close(nodes.values());
             } finally {
                 nodes = Collections.emptyNavigableMap();
+                Loggers.setLevel(nodeConnectionLogger, initialLogLevel);
                 executor.shutdownNow();
             }
         }