فهرست منبع

Fix Stuck IO Thread Logging Time Precision (#42882)

* The precision of the timestamps we get from the cached time thread is only 200ms by default resulting in a number of needless ~200ms slow network thread execution logs
  * Fixed by making the warn threshold a function of the precision of the cached time thread found in the settings
Armin Braun 6 سال پیش
والد
کامیت
0fcadb94d4

+ 6 - 7
test/framework/src/main/java/org/elasticsearch/transport/nio/MockNioTransport.java

@@ -84,7 +84,7 @@ public class MockNioTransport extends TcpTransport {
                             PageCacheRecycler pageCacheRecycler, NamedWriteableRegistry namedWriteableRegistry,
                             CircuitBreakerService circuitBreakerService) {
         super(settings, version, threadPool, pageCacheRecycler, circuitBreakerService, namedWriteableRegistry, networkService);
-        this.transportThreadWatchdog = new TransportThreadWatchdog(threadPool);
+        this.transportThreadWatchdog = new TransportThreadWatchdog(threadPool, settings);
     }
 
     @Override
@@ -322,21 +322,20 @@ public class MockNioTransport extends TcpTransport {
     }
 
     static final class TransportThreadWatchdog {
-
-        private static final long WARN_THRESHOLD = TimeUnit.MILLISECONDS.toNanos(150);
-
         // Only check every 2s to not flood the logs on a blocked thread.
         // We mostly care about long blocks and not random slowness anyway and in tests would randomly catch slow operations that block for
         // less than 2s eventually.
         private static final TimeValue CHECK_INTERVAL = TimeValue.timeValueSeconds(2);
 
+        private final long warnThreshold;
         private final ThreadPool threadPool;
         private final ConcurrentHashMap<Thread, Long> registry = new ConcurrentHashMap<>();
 
         private volatile boolean stopped;
 
-        TransportThreadWatchdog(ThreadPool threadPool) {
+        TransportThreadWatchdog(ThreadPool threadPool, Settings settings) {
             this.threadPool = threadPool;
+            warnThreshold = ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.get(settings).nanos() + TimeValue.timeValueMillis(100L).nanos();
             threadPool.schedule(this::logLongRunningExecutions, CHECK_INTERVAL, ThreadPool.Names.GENERIC);
         }
 
@@ -353,7 +352,7 @@ public class MockNioTransport extends TcpTransport {
 
         private void maybeLogElapsedTime(long startTime) {
             long elapsedTime = threadPool.relativeTimeInNanos() - startTime;
-            if (elapsedTime > WARN_THRESHOLD) {
+            if (elapsedTime > warnThreshold) {
                 logger.warn(
                     new ParameterizedMessage("Slow execution on network thread [{} milliseconds]",
                         TimeUnit.NANOSECONDS.toMillis(elapsedTime)),
@@ -364,7 +363,7 @@ public class MockNioTransport extends TcpTransport {
         private void logLongRunningExecutions() {
             for (Map.Entry<Thread, Long> entry : registry.entrySet()) {
                 final long elapsedTimeInNanos = threadPool.relativeTimeInNanos() - entry.getValue();
-                if (elapsedTimeInNanos > WARN_THRESHOLD) {
+                if (elapsedTimeInNanos > warnThreshold) {
                     final Thread thread = entry.getKey();
                     logger.warn("Potentially blocked execution on network thread [{}] [{} milliseconds]: \n{}", thread.getName(),
                         TimeUnit.NANOSECONDS.toMillis(elapsedTimeInNanos),

+ 3 - 2
test/framework/src/test/java/org/elasticsearch/transport/nio/TestEventHandlerTests.java

@@ -23,6 +23,7 @@ import org.apache.logging.log4j.Level;
 import org.apache.logging.log4j.LogManager;
 import org.elasticsearch.common.CheckedRunnable;
 import org.elasticsearch.common.logging.Loggers;
+import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.nio.ServerChannelContext;
 import org.elasticsearch.nio.SocketChannelContext;
 import org.elasticsearch.test.ESTestCase;
@@ -57,7 +58,7 @@ public class TestEventHandlerTests extends ESTestCase {
 
     public void testLogOnElapsedTime() throws Exception {
         long start = System.nanoTime();
-        long end = start + TimeUnit.MILLISECONDS.toNanos(200);
+        long end = start + TimeUnit.MILLISECONDS.toNanos(400);
         AtomicBoolean isStart = new AtomicBoolean(true);
         LongSupplier timeSupplier = () -> {
             if (isStart.compareAndSet(true, false)) {
@@ -70,7 +71,7 @@ public class TestEventHandlerTests extends ESTestCase {
         final ThreadPool threadPool = mock(ThreadPool.class);
         doAnswer(i -> timeSupplier.getAsLong()).when(threadPool).relativeTimeInNanos();
         TestEventHandler eventHandler =
-            new TestEventHandler((e) -> {}, () -> null, new MockNioTransport.TransportThreadWatchdog(threadPool));
+            new TestEventHandler(e -> {}, () -> null, new MockNioTransport.TransportThreadWatchdog(threadPool, Settings.EMPTY));
 
         ServerChannelContext serverChannelContext = mock(ServerChannelContext.class);
         SocketChannelContext socketChannelContext = mock(SocketChannelContext.class);