Browse Source

Add cache miss and read metrics (#132497)

Fix up the reads metric to also consider tryRead accessing a new region
as a read/hit. Add cache miss metric that only increments once per region
per read. Add a miss ratio as well for easy querying.
Henning Andersen 2 months ago
parent
commit
444c5f2322

+ 5 - 0
docs/changelog/132497.yaml

@@ -0,0 +1,5 @@
+pr: 132497
+summary: Add cache miss and read metrics
+area: Searchable Snapshots
+type: enhancement
+issues: []

+ 47 - 0
x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/BlobCacheMetrics.java

@@ -12,12 +12,15 @@ import org.apache.logging.log4j.Logger;
 import org.elasticsearch.index.store.LuceneFilesExtensions;
 import org.elasticsearch.telemetry.TelemetryProvider;
 import org.elasticsearch.telemetry.metric.DoubleHistogram;
+import org.elasticsearch.telemetry.metric.DoubleWithAttributes;
 import org.elasticsearch.telemetry.metric.LongCounter;
 import org.elasticsearch.telemetry.metric.LongHistogram;
+import org.elasticsearch.telemetry.metric.LongWithAttributes;
 import org.elasticsearch.telemetry.metric.MeterRegistry;
 
 import java.util.Map;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.LongAdder;
 
 public class BlobCacheMetrics {
     private static final Logger logger = LogManager.getLogger(BlobCacheMetrics.class);
@@ -37,6 +40,9 @@ public class BlobCacheMetrics {
     private final LongCounter cachePopulationBytes;
     private final LongCounter cachePopulationTime;
 
+    private final LongAdder missCount = new LongAdder();
+    private final LongAdder readCount = new LongAdder();
+
     public enum CachePopulationReason {
         /**
          * When warming the cache
@@ -94,6 +100,31 @@ public class BlobCacheMetrics {
                 "milliseconds"
             )
         );
+
+        meterRegistry.registerLongGauge(
+            "es.blob_cache.read.total",
+            "The number of cache reads (warming not included)",
+            "count",
+            () -> new LongWithAttributes(readCount.longValue())
+        );
+        // notice that this is different from `miss_that_triggered_read` in that `miss_that_triggered_read` will count once per gap
+        // filled for a single read. Whereas this one only counts whenever a read provoked populating data from the object store, though
+        // once per region for multi-region reads. This allows reasoning about hit ratio too.
+        meterRegistry.registerLongGauge(
+            "es.blob_cache.miss.total",
+            "The number of cache misses (warming not included)",
+            "count",
+            () -> new LongWithAttributes(missCount.longValue())
+        );
+        // adding this helps search for high or low miss ratio. It will be since boot of the node though. More advanced queries can use
+        // deltas of the totals to see miss ratio over time.
+        meterRegistry.registerDoubleGauge(
+            "es.blob_cache.miss.ratio",
+            "The fraction of cache reads that missed data (warming not included)",
+            "fraction",
+            // read misses before reads on purpose
+            () -> new DoubleWithAttributes(Math.min((double) missCount.longValue() / Math.max(readCount.longValue(), 1L), 1.0d))
+        );
     }
 
     BlobCacheMetrics(
@@ -170,6 +201,22 @@ public class BlobCacheMetrics {
         }
     }
 
+    public void recordRead() {
+        readCount.increment();
+    }
+
+    public void recordMiss() {
+        missCount.increment();
+    }
+
+    public long readCount() {
+        return readCount.sum();
+    }
+
+    public long missCount() {
+        return missCount.sum();
+    }
+
     /**
      * Calculate throughput as MiB/second
      *

+ 28 - 8
x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/shared/SharedBlobCacheService.java

@@ -328,7 +328,6 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
     private final LongAdder writeCount = new LongAdder();
     private final LongAdder writeBytes = new LongAdder();
 
-    private final LongAdder readCount = new LongAdder();
     private final LongAdder readBytes = new LongAdder();
 
     private final LongAdder evictCount = new LongAdder();
@@ -741,8 +740,9 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
             evictCount.sum(),
             writeCount.sum(),
             writeBytes.sum(),
-            readCount.sum(),
-            readBytes.sum()
+            blobCacheMetrics.readCount(),
+            readBytes.sum(),
+            blobCacheMetrics.missCount()
         );
     }
 
@@ -1113,7 +1113,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
                                     + '-'
                                     + rangeToRead.start()
                                     + ']';
-                            blobCacheService.readCount.increment();
+                            blobCacheService.blobCacheMetrics.recordRead();
                             l.onResponse(read);
                         })
                     );
@@ -1228,12 +1228,14 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
                 return false;
             }
             var fileRegion = lastAccessedRegion;
+            boolean incrementReads = false;
             if (fileRegion != null && fileRegion.chunk.regionKey.region == startRegion) {
                 // existing item, check if we need to promote item
                 fileRegion.touch();
 
             } else {
                 fileRegion = cache.get(cacheKey, length, startRegion);
+                incrementReads = true;
             }
             final var region = fileRegion.chunk;
             if (region.tracker.checkAvailable(end - getRegionStart(startRegion)) == false) {
@@ -1241,6 +1243,10 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
             }
             boolean res = region.tryRead(buf, offset);
             lastAccessedRegion = res ? fileRegion : null;
+            if (res && incrementReads) {
+                blobCacheMetrics.recordRead();
+                // todo: should we add to readBytes? readBytes.add(end - offset);
+            }
             return res;
         }
 
@@ -1309,7 +1315,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
                 mapSubRangeToRegion(rangeToWrite, region),
                 mapSubRangeToRegion(rangeToRead, region),
                 readerWithOffset(reader, fileRegion, Math.toIntExact(rangeToRead.start() - regionStart)),
-                writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart)),
+                metricRecordingWriter(writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart))),
                 ioExecutor,
                 readFuture
             );
@@ -1341,7 +1347,9 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
                             mapSubRangeToRegion(rangeToWrite, region),
                             subRangeToRead,
                             readerWithOffset(reader, fileRegion, Math.toIntExact(rangeToRead.start() - regionStart)),
-                            writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart)),
+                            metricRecordingWriter(
+                                writerWithOffset(writer, fileRegion, Math.toIntExact(rangeToWrite.start() - regionStart))
+                            ),
                             ioExecutor,
                             listener
                         );
@@ -1416,6 +1424,16 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
             return adjustedWriter;
         }
 
+        private RangeMissingHandler metricRecordingWriter(RangeMissingHandler writer) {
+            return new DelegatingRangeMissingHandler(writer) {
+                @Override
+                public SourceInputStreamFactory sharedInputStreamFactory(List<SparseFileTracker.Gap> gaps) {
+                    blobCacheMetrics.recordMiss();
+                    return super.sharedInputStreamFactory(gaps);
+                }
+            };
+        }
+
         private RangeAvailableHandler readerWithOffset(RangeAvailableHandler reader, CacheFileRegion<KeyType> fileRegion, int readOffset) {
             final RangeAvailableHandler adjustedReader = (channel, channelPos, relativePos, len) -> reader.onRangeAvailable(
                 channel,
@@ -1558,9 +1576,11 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
         long writeCount,
         long writeBytes,
         long readCount,
-        long readBytes
+        long readBytes,
+        // miss-count not exposed in REST API for now
+        long missCount
     ) {
-        public static final Stats EMPTY = new Stats(0, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
+        public static final Stats EMPTY = new Stats(0, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L);
     }
 
     private class LFUCache implements Cache<KeyType, CacheFileRegion<KeyType>> {

+ 35 - 0
x-pack/plugin/blob-cache/src/test/java/org/elasticsearch/blobcache/BlobCacheMetricsTests.java

@@ -17,6 +17,7 @@ import org.junit.Before;
 
 import java.util.Arrays;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.IntStream;
 
 import static org.hamcrest.Matchers.is;
 
@@ -66,6 +67,40 @@ public class BlobCacheMetricsTests extends ESTestCase {
             .get(0);
         assertEquals(totalTimeMeasurement.getLong(), TimeUnit.SECONDS.toMillis(secondsTaken));
         assertExpectedAttributesPresent(totalTimeMeasurement, cachePopulationReason, cachePopulationSource, fileExtension);
+
+        // let us check for 0, avoid div by 0.
+        checkReadsAndMisses(0, 0, 1);
+        int reads = between(1, 100);
+        int misses = between(1, reads);
+        recordMisses(metrics, misses);
+        checkReadsAndMisses(0, misses, misses);
+        IntStream.range(0, reads).forEach(i -> metrics.recordRead());
+        checkReadsAndMisses(reads, misses, reads);
+        recordMisses(metrics, reads);
+        checkReadsAndMisses(reads, misses + reads, misses + reads);
+    }
+
+    private void recordMisses(BlobCacheMetrics metrics, int misses) {
+        IntStream.range(0, misses).forEach(i -> metrics.recordMiss());
+    }
+
+    private void checkReadsAndMisses(int reads, int writes, int readsForRatio) {
+        recordingMeterRegistry.getRecorder().collect();
+
+        Measurement totalReadsMeasurement = recordingMeterRegistry.getRecorder()
+            .getMeasurements(InstrumentType.LONG_GAUGE, "es.blob_cache.read.total")
+            .getLast();
+        assertEquals(reads, totalReadsMeasurement.getLong());
+
+        Measurement totalMissesMeasurement = recordingMeterRegistry.getRecorder()
+            .getMeasurements(InstrumentType.LONG_GAUGE, "es.blob_cache.miss.total")
+            .getLast();
+        assertEquals(writes, totalMissesMeasurement.getLong());
+
+        Measurement missRatio = recordingMeterRegistry.getRecorder()
+            .getMeasurements(InstrumentType.DOUBLE_GAUGE, "es.blob_cache.miss.ratio")
+            .getLast();
+        assertEquals((double) writes / readsForRatio, missRatio.getDouble(), 0.00000001d);
     }
 
     private static void assertExpectedAttributesPresent(