Browse Source

Fix APM trace start time (#98113)

In #96205 we started recording the start time of the authn phase of REST
request processing, because this happens too early to even start a span
for the request. However we capture the threadpool's cached time which
can be 200ms slow, yielding spans that appear much longer than they
should. This commit moves to capturing the time using
`System.currentTimeMillis` to avoid this problem.
David Turner 2 years ago
parent
commit
7259b0d9d9

+ 5 - 0
docs/changelog/98113.yaml

@@ -0,0 +1,5 @@
+pr: 98113
+summary: Fix APM trace start time
+area: Infra/Core
+type: bug
+issues: []

+ 1 - 1
server/src/main/java/org/elasticsearch/action/ActionModule.java

@@ -586,7 +586,7 @@ public class ActionModule extends AbstractModule {
     public void copyRequestHeadersToThreadContext(HttpPreRequest request, ThreadContext threadContext) {
         // the request's thread-context must always be populated (by calling this method) before undergoing any request related processing
         // we use this opportunity to first record the request processing start time
-        threadContext.putTransient(Task.TRACE_START_TIME, Instant.ofEpochMilli(threadPool.absoluteTimeInMillis()));
+        threadContext.putTransient(Task.TRACE_START_TIME, Instant.ofEpochMilli(System.currentTimeMillis()));
         for (final RestHeaderDefinition restHeader : headersToCopy) {
             final String name = restHeader.getName();
             final List<String> headerValues = request.getHeaders().get(name);

+ 18 - 7
server/src/test/java/org/elasticsearch/http/AbstractHttpServerTransportTests.java

@@ -55,6 +55,7 @@ import org.junit.Before;
 
 import java.net.InetSocketAddress;
 import java.net.UnknownHostException;
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -69,6 +70,7 @@ import java.util.concurrent.BlockingDeque;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.LinkedBlockingDeque;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
 
 import static java.net.InetAddress.getByName;
 import static java.util.Arrays.asList;
@@ -82,8 +84,10 @@ import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;
 import static org.mockito.Mockito.mock;
@@ -324,6 +328,7 @@ public class AbstractHttpServerTransportTests extends ESTestCase {
      */
     public void testTraceParentAndTraceId() {
         final String traceParentValue = "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01";
+        final AtomicReference<Instant> traceStartTimeRef = new AtomicReference<>();
         final HttpServerTransport.Dispatcher dispatcher = new HttpServerTransport.Dispatcher() {
 
             @Override
@@ -332,7 +337,8 @@ public class AbstractHttpServerTransportTests extends ESTestCase {
                 assertThat(threadContext.getHeader(Task.TRACE_PARENT_HTTP_HEADER), nullValue());
                 assertThat(threadContext.getTransient("parent_" + Task.TRACE_PARENT_HTTP_HEADER), equalTo(traceParentValue));
                 // request trace start time is also set
-                assertThat(threadContext.getTransient(Task.TRACE_START_TIME), notNullValue());
+                assertTrue(traceStartTimeRef.compareAndSet(null, threadContext.getTransient(Task.TRACE_START_TIME)));
+                assertNotNull(traceStartTimeRef.get());
             }
 
             @Override
@@ -371,14 +377,10 @@ public class AbstractHttpServerTransportTests extends ESTestCase {
                 }
 
                 @Override
-                protected void doStart() {
-
-                }
+                protected void doStart() {}
 
                 @Override
-                protected void stopInternal() {
-
-                }
+                protected void stopInternal() {}
 
                 @Override
                 public HttpStats stats() {
@@ -391,11 +393,20 @@ public class AbstractHttpServerTransportTests extends ESTestCase {
                 }
             }
         ) {
+            final var systemTimeBeforeRequest = System.currentTimeMillis();
             transport.dispatchRequest(fakeRequest, channel, null);
+            final var systemTimeAfterRequest = System.currentTimeMillis();
             // headers are "null" here, aka not present, because the thread context changes containing them is to be confined to the request
             assertThat(threadPool.getThreadContext().getHeader(Task.TRACE_ID), nullValue());
             assertThat(threadPool.getThreadContext().getHeader(Task.TRACE_PARENT_HTTP_HEADER), nullValue());
             assertThat(threadPool.getThreadContext().getTransient("parent_" + Task.TRACE_PARENT_HTTP_HEADER), nullValue());
+
+            // system clock is not _technically_ monotonic but in practice it's very unlikely to see a discontinuity here
+            assertThat(
+                traceStartTimeRef.get().toEpochMilli(),
+                allOf(greaterThanOrEqualTo(systemTimeBeforeRequest), lessThanOrEqualTo(systemTimeAfterRequest))
+            );
+
             transport.dispatchRequest(null, null, new Exception());
             // headers are "null" here, aka not present, because the thread context changes containing them is to be confined to the request
             assertThat(threadPool.getThreadContext().getHeader(Task.TRACE_ID), nullValue());