浏览代码

Use milliseconds for time histogram metrics (#111502)

LongHistogram's default base2 exponentional aggregagtion is optimized
for latency range of 1ms to 100s. Hence we should record time metrics in
milliseconds instead of micros.

Relates: ES-9065
Yang Wang 1 年之前
父节点
当前提交
d4f330fd10

+ 37 - 6
modules/repository-s3/src/internalClusterTest/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryMetricsTests.java

@@ -36,8 +36,9 @@ import java.util.List;
 import java.util.Map;
 import java.util.Queue;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.TimeUnit;
 
-import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM;
+import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM;
 import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_HISTOGRAM;
 import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL;
 import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_TOTAL;
@@ -52,6 +53,7 @@ import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED;
 import static org.elasticsearch.rest.RestStatus.TOO_MANY_REQUESTS;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 
 @SuppressForbidden(reason = "this test uses a HttpServer to emulate an S3 endpoint")
 // Need to set up a new cluster for each test because cluster settings use randomized authentication settings
@@ -98,6 +100,35 @@ public class S3BlobStoreRepositoryMetricsTests extends S3BlobStoreRepositoryTest
         return blobStoreRepository.blobStore().blobContainer(BlobPath.EMPTY.add(randomIdentifier()));
     }
 
+    public void testHttpRequestTimeCaptureInMilliseconds() throws IOException {
+        final String repository = createRepository(randomRepositoryName());
+        final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
+        final TestTelemetryPlugin plugin = getPlugin(dataNodeName);
+        final OperationPurpose purpose = randomFrom(OperationPurpose.values());
+        final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository);
+        final String blobName = randomIdentifier();
+
+        long before = System.nanoTime();
+        blobContainer.writeBlob(purpose, blobName, new BytesArray(randomBytes(between(10, 1000))), false);
+        long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - before);
+        assertThat(getLongHistogramValue(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.PUT_OBJECT), lessThanOrEqualTo(elapsed));
+
+        plugin.resetMeter();
+        before = System.nanoTime();
+        blobContainer.readBlob(purpose, blobName).close();
+        elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - before);
+        assertThat(getLongHistogramValue(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.GET_OBJECT), lessThanOrEqualTo(elapsed));
+
+        plugin.resetMeter();
+        before = System.nanoTime();
+        blobContainer.deleteBlobsIgnoringIfNotExists(purpose, Iterators.single(blobName));
+        elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - before);
+        assertThat(
+            getLongHistogramValue(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.DELETE_OBJECTS),
+            lessThanOrEqualTo(elapsed)
+        );
+    }
+
     public void testMetricsWithErrors() throws IOException {
         final String repository = createRepository(randomRepositoryName());
 
@@ -121,7 +152,7 @@ public class S3BlobStoreRepositoryMetricsTests extends S3BlobStoreRepositoryTest
             assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.PUT_OBJECT), equalTo(2L * batch));
             assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.PUT_OBJECT), equalTo(batch));
             assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.PUT_OBJECT), equalTo(2L * batch));
-            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.PUT_OBJECT), equalTo(batch));
+            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.PUT_OBJECT), equalTo(batch));
         }
 
         // Get not found
@@ -141,7 +172,7 @@ public class S3BlobStoreRepositoryMetricsTests extends S3BlobStoreRepositoryTest
             assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.GET_OBJECT), equalTo(batch));
             assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
             assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
-            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
+            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
 
             // Make sure we don't hit the request range not satisfied counters
             assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT), equalTo(0L));
@@ -164,7 +195,7 @@ public class S3BlobStoreRepositoryMetricsTests extends S3BlobStoreRepositoryTest
             assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.LIST_OBJECTS), equalTo(5L * batch));
             assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(batch));
             assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(5L * batch));
-            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(batch));
+            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.LIST_OBJECTS), equalTo(batch));
         }
 
         // Delete to clean up
@@ -176,7 +207,7 @@ public class S3BlobStoreRepositoryMetricsTests extends S3BlobStoreRepositoryTest
         assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.DELETE_OBJECTS), equalTo(0L));
         assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(0L));
         assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(0L));
-        assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(1L));
+        assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(1L));
     }
 
     public void testMetricsForRequestRangeNotSatisfied() {
@@ -208,7 +239,7 @@ public class S3BlobStoreRepositoryMetricsTests extends S3BlobStoreRepositoryTest
             );
             assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.GET_OBJECT), equalTo(2 * batch));
             assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(2 * batch));
-            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
+            assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
         }
     }
 

+ 11 - 10
modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobStore.java

@@ -224,11 +224,11 @@ class S3BlobStore implements BlobStore {
                 return;
             }
 
-            final long totalTimeInMicros = getTotalTimeInMicros(requestTimesIncludingRetries);
-            if (totalTimeInMicros == 0) {
+            final long totalTimeInMillis = getTotalTimeInMillis(requestTimesIncludingRetries);
+            if (totalTimeInMillis == 0) {
                 logger.warn("Expected HttpRequestTime to be tracked for request [{}] but found no count.", request);
             } else {
-                s3RepositoriesMetrics.common().httpRequestTimeInMicroHistogram().record(totalTimeInMicros, attributes);
+                s3RepositoriesMetrics.common().httpRequestTimeInMillisHistogram().record(totalTimeInMillis, attributes);
             }
         }
 
@@ -271,18 +271,19 @@ class S3BlobStore implements BlobStore {
         }
     }
 
-    private static long getTotalTimeInMicros(List<TimingInfo> requestTimesIncludingRetries) {
-        // Here we calculate the timing in Microseconds for the sum of the individual subMeasurements with the goal of deriving the TTFB
-        // (time to first byte). We calculate the time in micros for later use with an APM style counter (exposed as a long), rather than
-        // using the default double exposed by getTimeTakenMillisIfKnown().
-        long totalTimeInMicros = 0;
+    private static long getTotalTimeInMillis(List<TimingInfo> requestTimesIncludingRetries) {
+        // Here we calculate the timing in Milliseconds for the sum of the individual subMeasurements with the goal of deriving the TTFB
+        // (time to first byte). We calculate the time in millis for later use with an APM style counter (exposed as a long), rather than
+        // using the default double exposed by getTimeTakenMillisIfKnown(). We don't need sub-millisecond precision. So no need perform
+        // the data type castings.
+        long totalTimeInMillis = 0;
         for (TimingInfo timingInfo : requestTimesIncludingRetries) {
             var endTimeInNanos = timingInfo.getEndTimeNanoIfKnown();
             if (endTimeInNanos != null) {
-                totalTimeInMicros += TimeUnit.NANOSECONDS.toMicros(endTimeInNanos - timingInfo.getStartTimeNano());
+                totalTimeInMillis += TimeUnit.NANOSECONDS.toMillis(endTimeInNanos - timingInfo.getStartTimeNano());
             }
         }
-        return totalTimeInMicros;
+        return totalTimeInMillis;
     }
 
     @Override

+ 5 - 5
server/src/main/java/org/elasticsearch/repositories/RepositoriesMetrics.java

@@ -22,7 +22,7 @@ public record RepositoriesMetrics(
     LongCounter unsuccessfulOperationCounter,
     LongHistogram exceptionHistogram,
     LongHistogram throttleHistogram,
-    LongHistogram httpRequestTimeInMicroHistogram
+    LongHistogram httpRequestTimeInMillisHistogram
 ) {
 
     public static RepositoriesMetrics NOOP = new RepositoriesMetrics(MeterRegistry.NOOP);
@@ -36,7 +36,7 @@ public record RepositoriesMetrics(
     public static final String METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL = "es.repositories.operations.unsuccessful.total";
     public static final String METRIC_EXCEPTIONS_HISTOGRAM = "es.repositories.exceptions.histogram";
     public static final String METRIC_THROTTLES_HISTOGRAM = "es.repositories.throttles.histogram";
-    public static final String HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM = "es.repositories.requests.http_request_time.histogram";
+    public static final String HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM = "es.repositories.requests.http_request_time.histogram";
 
     public RepositoriesMetrics(MeterRegistry meterRegistry) {
         this(
@@ -54,9 +54,9 @@ public record RepositoriesMetrics(
             meterRegistry.registerLongHistogram(METRIC_EXCEPTIONS_HISTOGRAM, "repository request exception histogram", "unit"),
             meterRegistry.registerLongHistogram(METRIC_THROTTLES_HISTOGRAM, "repository request throttle histogram", "unit"),
             meterRegistry.registerLongHistogram(
-                HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM,
-                "HttpRequestTime in microseconds expressed as as a histogram",
-                "micros"
+                HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM,
+                "HttpRequestTime in milliseconds expressed as as a histogram",
+                "ms"
             )
         );
     }

+ 2 - 2
x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/BlobCacheMetrics.java

@@ -31,8 +31,8 @@ public class BlobCacheMetrics {
             ),
             meterRegistry.registerLongHistogram(
                 "es.blob_cache.cache_miss_load_times.histogram",
-                "The time in microseconds for populating entries in the blob store resulting from a cache miss, expressed as a histogram.",
-                "micros"
+                "The time in milliseconds for populating entries in the blob store resulting from a cache miss, expressed as a histogram.",
+                "ms"
             )
         );
     }

+ 1 - 1
x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/shared/SharedBlobCacheService.java

@@ -1116,7 +1116,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
                     IntConsumer progressUpdater
                 ) throws IOException {
                     writer.fillCacheRange(channel, channelPos, streamFactory, relativePos, length, progressUpdater);
-                    var elapsedTime = TimeUnit.NANOSECONDS.toMicros(relativeTimeInNanosSupplier.getAsLong() - startTime);
+                    var elapsedTime = TimeUnit.NANOSECONDS.toMillis(relativeTimeInNanosSupplier.getAsLong() - startTime);
                     SharedBlobCacheService.this.blobCacheMetrics.getCacheMissLoadTimes().record(elapsedTime);
                     SharedBlobCacheService.this.blobCacheMetrics.getCacheMissCounter().increment();
                 }