1
0
View source

Improve check_on_startup docs and logging (#74233)

Today we don't really describe why using `index.shard.check_on_startup`
is such a bad idea, or what to do instead. This commit expands the docs
to clarify what it does, why it's not really necessary and what to do
instead. It also now logs a warning every time the startup checks run to
encourage users to stop using this setting.
David Turner 4 years ago
parent
commit
31e1b1ae22

+ 39 - 13
docs/reference/index-modules.asciidoc

@@ -69,19 +69,6 @@ document distribution. See the
 breaking change].
 ====
 
-`index.shard.check_on_startup`::
-
-Whether or not shards should be checked for corruption before opening. When
-corruption is detected, it will prevent the shard from being opened.
-Accepts:
-`false`::: (default) Don't check for corruption when opening a shard.
-`checksum`::: Check for physical corruption.
-`true`::: Check for both physical and logical corruption. This is much more
-expensive in terms of CPU and memory usage.
-+
-WARNING: Expert only. Checking shards may take a lot of time on large
-indices.
-
 [[index-codec]] `index.codec`::
 
     The +default+ value compresses stored data with LZ4
@@ -130,6 +117,45 @@ to incomplete history on the leader. Defaults to `12h`.
     per request through the use of the `expand_wildcards` parameter. Possible values are
     `true` and `false` (default).
 
+[[index-shard-check-on-startup]] `index.shard.check_on_startup`::
++
+====
+WARNING: Expert users only. This setting enables some very expensive processing
+at shard startup and is only ever useful while diagnosing a problem in your
+cluster. If you do use it, you should do so only temporarily and remove it
+once it is no longer needed.
+
+{es} automatically performs integrity checks on the contents of shards at
+various points during their lifecycle. For instance, it verifies the checksum
+of every file transferred when recovering a replica or taking a snapshot. It
+also verifies the integrity of many important files when opening a shard, which
+happens when starting up a node and when finishing a shard recovery or
+relocation. You can therefore manually verify the integrity of a whole shard
+while it is running by taking a snapshot of it into a fresh repository or by
+recovering it onto a fresh node.
+
+This setting determines whether {es} performs additional integrity checks while
+opening a shard. If these checks detect corruption then they will prevent the
+shard from being opened. It accepts the following values:
+
+`false`::: Don't perform additional checks for corruption when opening a shard.
+This is the default and recommended behaviour.
+
+`checksum`::: Verify that the checksum of every file in the shard matches its
+contents. This will detect cases where the data read from disk differ from the
+data that {es} originally wrote, for instance due to undetected disk corruption
+or other hardware failures. These checks require reading the entire shard from
+disk which takes substantial time and IO bandwidth and may affect cluster
+performance by evicting important data from your filesystem cache.
+
+`true`::: Perform the same checks as `checksum` and also check for logical
+inconsistencies in the shard, which could for instance be caused by the data
+being corrupted while it was being written due to faulty RAM or other hardware
+failures. These checks require reading the entire shard from disk which takes
+substantial time and IO bandwidth, and then performing various checks on the
+contents of the shard which take substantial time, CPU and memory.
+====
+
 [discrete]
 [[dynamic-index-settings]]
 === Dynamic index settings

+ 24 - 11
server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

@@ -46,27 +46,27 @@ import org.elasticsearch.cluster.routing.RecoverySource;
 import org.elasticsearch.cluster.routing.RecoverySource.SnapshotRecoverySource;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
-import org.elasticsearch.core.Booleans;
-import org.elasticsearch.core.CheckedConsumer;
-import org.elasticsearch.core.CheckedFunction;
-import org.elasticsearch.core.CheckedRunnable;
-import org.elasticsearch.core.Nullable;
-import org.elasticsearch.core.Tuple;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
-import org.elasticsearch.core.Releasable;
-import org.elasticsearch.core.Releasables;
 import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.index.ElasticsearchDirectoryReader;
 import org.elasticsearch.common.metrics.CounterMetric;
 import org.elasticsearch.common.metrics.MeanMetric;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.common.util.BigArrays;
 import org.elasticsearch.common.util.CollectionUtils;
 import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.AsyncIOProcessor;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
 import org.elasticsearch.common.xcontent.XContentHelper;
+import org.elasticsearch.core.Booleans;
+import org.elasticsearch.core.CheckedConsumer;
+import org.elasticsearch.core.CheckedFunction;
+import org.elasticsearch.core.CheckedRunnable;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.Releasable;
+import org.elasticsearch.core.Releasables;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.core.Tuple;
 import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.gateway.WriteStateException;
 import org.elasticsearch.index.Index;
@@ -2544,6 +2544,11 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
     public void maybeCheckIndex() {
         recoveryState.setStage(RecoveryState.Stage.VERIFY_INDEX);
         if (Booleans.isTrue(checkIndexOnStartup) || "checksum".equals(checkIndexOnStartup)) {
+            logger.warn(
+                "performing expensive diagnostic checks during shard startup [{}={}]; " +
+                    "these checks should only be enabled temporarily, you must remove this index setting as soon as possible",
+                IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(),
+                checkIndexOnStartup);
             try {
                 checkIndex();
             } catch (IOException ex) {
@@ -2576,7 +2581,13 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
         if ("checksum".equals(checkIndexOnStartup)) {
             // physical verification only: verify all checksums for the latest commit
             IOException corrupt = null;
-            MetadataSnapshot metadata = snapshotStoreMetadata();
+            final MetadataSnapshot metadata;
+            try {
+                metadata = snapshotStoreMetadata();
+            } catch (IOException e) {
+                logger.warn("check index [failure]", e);
+                throw e;
+            }
             for (Map.Entry<String, StoreFileMetadata> entry : metadata.asMap().entrySet()) {
                 try {
                     Store.checkIntegrity(entry.getValue(), store.directory());
@@ -2601,7 +2612,9 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl
                     // ignore if closed....
                     return;
                 }
-                logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
+                logger.warn("check index [failure]");
+                // report details in a separate message, it might contain control characters which mess up detection of the failure message
+                logger.warn("{}", os.bytes().utf8ToString());
                 throw new IOException("index check failure");
             }
         }

+ 35 - 8
server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java

@@ -7,6 +7,8 @@
  */
 package org.elasticsearch.index.shard;
 
+import org.apache.logging.log4j.Level;
+import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.DirectoryReader;
@@ -44,20 +46,16 @@ import org.elasticsearch.cluster.routing.ShardRoutingHelper;
 import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.routing.TestShardRouting;
 import org.elasticsearch.cluster.routing.UnassignedInfo;
-import org.elasticsearch.core.CheckedFunction;
 import org.elasticsearch.common.Randomness;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.breaker.CircuitBreaker;
 import org.elasticsearch.common.bytes.BytesArray;
-import org.elasticsearch.core.Tuple;
 import org.elasticsearch.common.io.stream.BytesStreamOutput;
 import org.elasticsearch.common.io.stream.StreamInput;
-import org.elasticsearch.core.Releasable;
-import org.elasticsearch.core.Releasables;
+import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.settings.IndexScopedSettings;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.AtomicArray;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
@@ -65,6 +63,11 @@ import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.core.CheckedFunction;
+import org.elasticsearch.core.Releasable;
+import org.elasticsearch.core.Releasables;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.core.Tuple;
 import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.env.NodeEnvironment;
 import org.elasticsearch.index.IndexModule;
@@ -117,6 +120,7 @@ import org.elasticsearch.snapshots.SnapshotId;
 import org.elasticsearch.test.CorruptionUtils;
 import org.elasticsearch.test.DummyShardLock;
 import org.elasticsearch.test.FieldMaskingReader;
+import org.elasticsearch.test.MockLogAppender;
 import org.elasticsearch.test.store.MockFSDirectoryFactory;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.junit.Assert;
@@ -3089,9 +3093,32 @@ public class IndexShardTests extends IndexShardTestCase {
         IndexShard corruptedShard = newShard(shardRouting, shardPath, indexMetadata, null, null, indexShard.engineFactory,
             indexShard.getGlobalCheckpointSyncer(), indexShard.getRetentionLeaseSyncer(), EMPTY_EVENT_LISTENER);
 
-        final IndexShardRecoveryException indexShardRecoveryException =
-            expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true));
-        assertThat(indexShardRecoveryException.getMessage(), equalTo("failed recovery"));
+        final MockLogAppender appender = new MockLogAppender();
+        appender.start();
+        Loggers.addAppender(LogManager.getLogger(IndexShard.class), appender);
+        try {
+            appender.addExpectation(new MockLogAppender.SeenEventExpectation(
+                "expensive checks warning",
+                "org.elasticsearch.index.shard.IndexShard",
+                Level.WARN,
+                "performing expensive diagnostic checks during shard startup [index.shard.check_on_startup=*]; these checks should only " +
+                    "be enabled temporarily, you must remove this index setting as soon as possible"));
+
+            appender.addExpectation(new MockLogAppender.SeenEventExpectation(
+                "failure message",
+                "org.elasticsearch.index.shard.IndexShard",
+                Level.WARN,
+                "check index [failure]*"));
+
+            final IndexShardRecoveryException indexShardRecoveryException =
+                expectThrows(IndexShardRecoveryException.class, () -> newStartedShard(p -> corruptedShard, true));
+            assertThat(indexShardRecoveryException.getMessage(), equalTo("failed recovery"));
+
+            appender.assertAllExpectationsMatched();
+        } finally {
+            Loggers.removeAppender(LogManager.getLogger(IndexShard.class), appender);
+            appender.stop();
+        }
 
         // check that corrupt marker is there
         Files.walkFileTree(indexPath, corruptedVisitor);