Browse Source

Slightly better hot threads for transport workers (#96315)

A completely idle `transport_worker` thread is reported as `idle=0.0%`,
which is confusing. Moreover, the docs on the network threading model do
not reflect the changes made in #90482. This commit fixes both of those
things.
David Turner 2 years ago
parent
commit
2a49ad929c

+ 4 - 6
docs/reference/modules/network/threading.asciidoc

@@ -63,7 +63,7 @@ reported like this:
 
 [source,text]
 ----
-   100.0% [cpu=0.0%, other=100.0%] (500ms out of 500ms) cpu usage by thread 'elasticsearch[instance-0000000004][transport_worker][T#1]'
+   0.0% [cpu=0.0%, idle=100.0%] (500ms out of 500ms) cpu usage by thread 'elasticsearch[instance-0000000004][transport_worker][T#1]'
      10/10 snapshots sharing following 9 elements
        java.base@17.0.2/sun.nio.ch.EPoll.wait(Native Method)
        java.base@17.0.2/sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:118)
@@ -77,11 +77,9 @@ reported like this:
 ----
 
 Note that `transport_worker` threads should always be in state `RUNNABLE`, even
-when waiting for input, because they block in the native `EPoll#wait` method.
-This means the hot threads API will report these threads at 100% overall
-utilisation. This is normal, and the breakdown of time into `cpu=` and `other=`
-fractions shows how much time the thread spent running and waiting for input
-respectively.
+when waiting for input, because they block in the native `EPoll#wait` method. The `idle=`
+time reports the proportion of time the thread spent waiting for input, whereas the `cpu=` time
+reports the proportion of time the thread spent processing input it has received.
 
 If a `transport_worker` thread is not frequently idle, it may build up a
 backlog of work. This can cause delays in processing messages on the channels

+ 3 - 1
server/src/main/java/org/elasticsearch/monitor/jvm/HotThreads.java

@@ -313,7 +313,9 @@ public class HotThreads {
                 );
                 case CPU -> {
                     double percentCpu = getTimeSharePercentage(topThread.getCpuTime());
-                    double percentOther = getTimeSharePercentage(topThread.getOtherTime());
+                    double percentOther = Transports.isTransportThread(threadName) && topThread.getCpuTime() == 0L
+                        ? 100.0
+                        : getTimeSharePercentage(topThread.getOtherTime());
                     double percentTotal = (Transports.isTransportThread(threadName)) ? percentCpu : percentOther + percentCpu;
                     String otherLabel = (Transports.isTransportThread(threadName)) ? "idle" : "other";
                     sb.append(

+ 4 - 4
server/src/test/java/org/elasticsearch/monitor/jvm/HotThreadsTests.java

@@ -892,19 +892,19 @@ public class HotThreadsTests extends ESTestCase {
 
         assertThat(
             innerResult,
-            containsString("0.0% [cpu=0.0%, idle=0.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 1'")
+            containsString("0.0% [cpu=0.0%, idle=100.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 1'")
         );
         assertThat(
             innerResult,
-            containsString("0.0% [cpu=0.0%, idle=0.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 2'")
+            containsString("0.0% [cpu=0.0%, idle=100.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 2'")
         );
         assertThat(
             innerResult,
-            containsString("0.0% [cpu=0.0%, idle=0.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 3'")
+            containsString("0.0% [cpu=0.0%, idle=100.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 3'")
         );
         assertThat(
             innerResult,
-            containsString("0.0% [cpu=0.0%, idle=0.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 4'")
+            containsString("0.0% [cpu=0.0%, idle=100.0%] (0s out of 10ms) cpu usage by thread '__mock_network_thread 4'")
         );
 
         // Test with the legacy sort order