Bladeren bron

Fix blob cache races/assertion errors (#96458)

In racy evict cases, assertions in blob cache did not hold, adapted test
and added fixes.

Relates #96399
Henning Andersen 2 jaren geleden
bovenliggende
commit
c04b32e4ff

+ 5 - 0
docs/changelog/96458.yaml

@@ -0,0 +1,5 @@
+pr: 96458
+summary: Fix blob cache races/assertion errors
+area: Snapshot/Restore
+type: bug
+issues: []

+ 29 - 18
x-pack/plugin/blob-cache/src/main/java/org/elasticsearch/blobcache/shared/SharedBlobCacheService.java

@@ -397,12 +397,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
                     final Integer freeSlot = freeRegions.poll();
                     if (freeSlot != null) {
                         // no need to evict an item, just add
-                        assert regionOwners[freeSlot].compareAndSet(null, entry.chunk);
-                        synchronized (this) {
-                            pushEntryToBack(entry);
-                            // assign sharedBytesPos only when chunk is ready for use. Under lock to avoid concurrent tryEvict.
-                            entry.chunk.sharedBytesPos = freeSlot;
-                        }
+                        assignToSlot(entry, freeSlot);
                     } else {
                         // need to evict something
                         synchronized (this) {
@@ -410,12 +405,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
                         }
                         final Integer freeSlotRetry = freeRegions.poll();
                         if (freeSlotRetry != null) {
-                            assert regionOwners[freeSlotRetry].compareAndSet(null, entry.chunk);
-                            synchronized (this) {
-                                pushEntryToBack(entry);
-                                // assign sharedBytesPos only when chunk is ready for use. Under lock to avoid concurrent tryEvict.
-                                entry.chunk.sharedBytesPos = freeSlotRetry;
-                            }
+                            assignToSlot(entry, freeSlotRetry);
                         } else {
                             boolean removed = keyMapping.remove(regionKey, entry);
                             assert removed;
@@ -431,7 +421,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
 
         // existing item, check if we need to promote item
         synchronized (this) {
-            if (now - entry.lastAccessed >= minTimeDelta && entry.freq + 1 < maxFreq) {
+            if (now - entry.lastAccessed >= minTimeDelta && entry.freq + 1 < maxFreq && entry.chunk.isEvicted() == false) {
                 unlink(entry);
                 entry.freq++;
                 entry.lastAccessed = now;
@@ -442,6 +432,21 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
         return entry.chunk;
     }
 
+    private void assignToSlot(Entry<CacheFileRegion> entry, int freeSlot) {
+        assert regionOwners[freeSlot].compareAndSet(null, entry.chunk);
+        synchronized (this) {
+            if (entry.chunk.isEvicted()) {
+                assert regionOwners[freeSlot].compareAndSet(entry.chunk, null);
+                freeRegions.add(freeSlot);
+                keyMapping.remove(entry.chunk.regionKey, entry);
+                throw new AlreadyClosedException("evicted during free region allocation");
+            }
+            pushEntryToBack(entry);
+            // assign sharedBytesPos only when chunk is ready for use. Under lock to avoid concurrent tryEvict.
+            entry.chunk.sharedBytesPos = freeSlot;
+        }
+    }
+
     private void assertChunkActiveOrEvicted(Entry<CacheFileRegion> entry) {
         if (Assertions.ENABLED) {
             synchronized (this) {
@@ -454,8 +459,11 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
     }
 
     public void onClose(CacheFileRegion chunk) {
-        assert regionOwners[chunk.sharedBytesPos].compareAndSet(chunk, null);
-        freeRegions.add(chunk.sharedBytesPos);
+        // we held the "this" lock when this was evicted, hence if sharedBytesPos is not filled in, chunk will never be registered.
+        if (chunk.sharedBytesPos != -1) {
+            assert regionOwners[chunk.sharedBytesPos].compareAndSet(chunk, null);
+            freeRegions.add(chunk.sharedBytesPos);
+        }
     }
 
     // used by tests
@@ -510,7 +518,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
         for (int i = 0; i < maxFreq; i++) {
             for (Entry<CacheFileRegion> entry = freqs[i]; entry != null; entry = entry.next) {
                 boolean evicted = entry.chunk.tryEvict();
-                if (evicted) {
+                if (evicted && entry.chunk.sharedBytesPos != -1) {
                     unlink(entry);
                     keyMapping.remove(entry.chunk.regionKey, entry);
                     return;
@@ -603,7 +611,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
             synchronized (this) {
                 for (Entry<CacheFileRegion> entry : matchingEntries) {
                     boolean evicted = entry.chunk.forceEvict();
-                    if (evicted) {
+                    if (evicted && entry.chunk.sharedBytesPos != -1) {
                         unlink(entry);
                         keyMapping.remove(entry.chunk.regionKey, entry);
                     }
@@ -693,7 +701,9 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
         private final AtomicBoolean evicted = new AtomicBoolean(false);
 
         // tries to evict this chunk if noone is holding onto its resources anymore
-        public boolean tryEvict() {
+        // visible for tests.
+        boolean tryEvict() {
+            assert Thread.holdsLock(SharedBlobCacheService.this) : "must hold lock when evicting";
             if (refCount() <= 1 && evicted.compareAndSet(false, true)) {
                 logger.trace("evicted {} with channel offset {}", regionKey, physicalStartOffset());
                 evictCount.increment();
@@ -704,6 +714,7 @@ public class SharedBlobCacheService<KeyType> implements Releasable {
         }
 
         public boolean forceEvict() {
+            assert Thread.holdsLock(SharedBlobCacheService.this) : "must hold lock when evicting";
             if (evicted.compareAndSet(false, true)) {
                 logger.trace("force evicted {} with channel offset {}", regionKey, physicalStartOffset());
                 evictCount.increment();

+ 23 - 6
x-pack/plugin/blob-cache/src/test/java/org/elasticsearch/blobcache/shared/SharedBlobCacheServiceTests.java

@@ -73,9 +73,13 @@ public class SharedBlobCacheServiceTests extends ESTestCase {
             assertEquals(size(50), region2.tracker.getLength());
             assertEquals(2, cacheService.freeRegionCount());
 
-            assertTrue(region1.tryEvict());
+            synchronized (cacheService) {
+                assertTrue(region1.tryEvict());
+            }
             assertEquals(3, cacheService.freeRegionCount());
-            assertFalse(region1.tryEvict());
+            synchronized (cacheService) {
+                assertFalse(region1.tryEvict());
+            }
             assertEquals(3, cacheService.freeRegionCount());
             final var bytesReadFuture = new PlainActionFuture<Integer>();
             region0.populateAndRead(
@@ -86,13 +90,19 @@ public class SharedBlobCacheServiceTests extends ESTestCase {
                 taskQueue.getThreadPool().executor(ThreadPool.Names.GENERIC),
                 bytesReadFuture
             );
-            assertFalse(region0.tryEvict());
+            synchronized (cacheService) {
+                assertFalse(region0.tryEvict());
+            }
             assertEquals(3, cacheService.freeRegionCount());
             assertFalse(bytesReadFuture.isDone());
             taskQueue.runAllRunnableTasks();
-            assertTrue(region0.tryEvict());
+            synchronized (cacheService) {
+                assertTrue(region0.tryEvict());
+            }
             assertEquals(4, cacheService.freeRegionCount());
-            assertTrue(region2.tryEvict());
+            synchronized (cacheService) {
+                assertTrue(region2.tryEvict());
+            }
             assertEquals(5, cacheService.freeRegionCount());
             assertTrue(bytesReadFuture.isDone());
             assertEquals(Integer.valueOf(1), bytesReadFuture.actionGet());
@@ -130,7 +140,9 @@ public class SharedBlobCacheServiceTests extends ESTestCase {
             assertFalse(region1.isEvicted());
 
             // explicitly evict region 1
-            assertTrue(region1.tryEvict());
+            synchronized (cacheService) {
+                assertTrue(region1.tryEvict());
+            }
             assertEquals(1, cacheService.freeRegionCount());
         }
     }
@@ -230,6 +242,7 @@ public class SharedBlobCacheServiceTests extends ESTestCase {
                 ByteSizeValue.ofBytes(size(between(1, 20) * 100L)).getStringRep()
             )
             .put(SharedBlobCacheService.SHARED_CACHE_REGION_SIZE_SETTING.getKey(), ByteSizeValue.ofBytes(size(100)).getStringRep())
+            .put(SharedBlobCacheService.SHARED_CACHE_MIN_TIME_DELTA_SETTING.getKey(), randomFrom("0", "1ms", "10s"))
             .put("path.home", createTempDir())
             .build();
         long fileLength = size(500);
@@ -245,6 +258,7 @@ public class SharedBlobCacheServiceTests extends ESTestCase {
                 String[] cacheKeys = IntStream.range(0, iterations).mapToObj(ignore -> randomFrom(files)).toArray(String[]::new);
                 int[] regions = IntStream.range(0, iterations).map(ignore -> between(0, 4)).toArray();
                 int[] yield = IntStream.range(0, iterations).map(ignore -> between(0, 9)).toArray();
+                int[] evict = IntStream.range(0, iterations).map(ignore -> between(0, 99)).toArray();
                 return new Thread(() -> {
                     try {
                         ready.await();
@@ -261,6 +275,9 @@ public class SharedBlobCacheServiceTests extends ESTestCase {
                                     }
                                     cacheFileRegion.decRef();
                                 }
+                                if (evict[i] == 0) {
+                                    cacheService.forceEvict(x -> true);
+                                }
                             } catch (AlreadyClosedException e) {
                                 // ignore
                             }