Переглянути джерело

Report more details of unobtainable ShardLock (#61255)

Today a common reason for a `ShardLockObtainFailedException` is when a
shard is removed from a node and then assigned straight back to it again
before the node has had a chance to shut the previous shard instance
down. For instance, this can happen if a node briefly leaves the cluster
holding a primary with no in-sync replicas.

The message in this case is typically as follows:

    obtaining shard lock timed out after 5000ms, previous lock details: [shard creation] trying to lock for [shard creation]

This is pretty hard to interpret, and doesn't raise the important
question: "why didn't the shard shut down sooner?"

With this change we reword the message a bit, report the age of the
shard lock, and adjust the details to report that the lock is held by a
closing shard:

    obtaining shard lock for [starting shard] timed out after [5000ms], lock already held for [closing shard] with age [12345ms]

Relates #38807
David Turner 5 роки тому
батько
коміт
98213df946

+ 17 - 5
server/src/main/java/org/elasticsearch/env/NodeEnvironment.java

@@ -41,6 +41,7 @@ import org.elasticsearch.common.CheckedRunnable;
 import org.elasticsearch.common.Randomness;
 import org.elasticsearch.common.SuppressForbidden;
 import org.elasticsearch.common.UUIDs;
+import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.io.FileSystemUtils;
 import org.elasticsearch.common.lease.Releasable;
 import org.elasticsearch.common.settings.Setting;
@@ -786,6 +787,11 @@ public final class NodeEnvironment  implements Closeable {
                 shardLock.release();
                 logger.trace("released shard lock for [{}]", shardId);
             }
+
+            @Override
+            public void setDetails(String details) {
+                shardLock.setDetails(details);
+            }
         };
     }
 
@@ -817,13 +823,13 @@ public final class NodeEnvironment  implements Closeable {
          */
         private final Semaphore mutex = new Semaphore(1);
         private int waitCount = 1; // guarded by shardLocks
-        private String lockDetails;
         private final ShardId shardId;
+        private volatile Tuple<Long, String> lockDetails;
 
         InternalShardLock(final ShardId shardId, final String details) {
             this.shardId = shardId;
             mutex.acquireUninterruptibly();
-            lockDetails = details;
+            lockDetails = Tuple.tuple(System.nanoTime(), details);
         }
 
         protected void release() {
@@ -854,17 +860,23 @@ public final class NodeEnvironment  implements Closeable {
         void acquire(long timeoutInMillis, final String details) throws ShardLockObtainFailedException {
             try {
                 if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS)) {
-                    lockDetails = details;
+                    setDetails(details);
                 } else {
+                    final Tuple<Long, String> lockDetails = this.lockDetails; // single volatile read
                     throw new ShardLockObtainFailedException(shardId,
-                        "obtaining shard lock timed out after " + timeoutInMillis + "ms, previous lock details: [" + lockDetails +
-                            "] trying to lock for [" + details + "]");
+                        "obtaining shard lock for [" + details + "] timed out after [" + timeoutInMillis +
+                        "ms], lock already held for [" + lockDetails.v2() + "] with age [" +
+                        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lockDetails.v1()) + "ms]");
                 }
             } catch (InterruptedException e) {
                 Thread.currentThread().interrupt();
                 throw new ShardLockObtainFailedException(shardId, "thread interrupted while trying to obtain shard lock", e);
             }
         }
+
+        public void setDetails(String details) {
+            lockDetails = Tuple.tuple(System.nanoTime(), details);
+        }
     }
 
     public boolean hasNodeFile() {

+ 8 - 1
server/src/main/java/org/elasticsearch/env/ShardLock.java

@@ -54,7 +54,14 @@ public abstract class ShardLock implements Closeable {
         }
     }
 
-    protected  abstract void closeInternal();
+    protected abstract void closeInternal();
+
+    /**
+     * Update the details of the holder of this lock. These details are displayed alongside a {@link ShardLockObtainFailedException}. Must
+     * only be called by the holder of this lock.
+     */
+    public void setDetails(String details) {
+    }
 
     @Override
     public String toString() {

+ 4 - 1
server/src/main/java/org/elasticsearch/index/IndexService.java

@@ -406,7 +406,7 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
         IndexShard indexShard = null;
         ShardLock lock = null;
         try {
-            lock = nodeEnv.shardLock(shardId, "shard creation", TimeUnit.SECONDS.toMillis(5));
+            lock = nodeEnv.shardLock(shardId, "starting shard", TimeUnit.SECONDS.toMillis(5));
             eventListener.beforeIndexShardCreated(shardId, indexSettings);
             ShardPath path;
             try {
@@ -515,6 +515,9 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
     private void closeShard(String reason, ShardId sId, IndexShard indexShard, Store store, IndexEventListener listener) {
         final int shardId = sId.id();
         final Settings indexSettings = this.getIndexSettings().getSettings();
+        if (store != null) {
+            store.beforeClose();
+        }
         try {
             try {
                 listener.beforeIndexShardClosed(sId, indexShard, indexSettings);

+ 4 - 0
server/src/main/java/org/elasticsearch/index/store/Store.java

@@ -677,6 +677,10 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
         return refCounter.refCount();
     }
 
+    public void beforeClose() {
+        shardLock.setDetails("closing shard");
+    }
+
     static final class StoreDirectory extends FilterDirectory {
 
         private final Logger deletesLogger;