Pārlūkot izejas kodu

Distinguish different snapshot failures by log level (#105622)

Today all snapshot failures are reported in the logs at `WARN`,
including a stack trace, but most of them are in fact benign or expected
and do not need any further action. To make it easier to track
actionable problems, this commit downgrades the non-actionable ones to
`INFO` level and suppresses their stack traces.
David Turner 1 gadu atpakaļ
vecāks
revīzija
3cde13cae0

+ 5 - 0
docs/changelog/105622.yaml

@@ -0,0 +1,5 @@
+pr: 105622
+summary: Distinguish different snapshot failures by log level
+area: Snapshot/Restore
+type: enhancement
+issues: []

+ 42 - 4
server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

@@ -76,6 +76,7 @@ import org.elasticsearch.core.Nullable;
 import org.elasticsearch.core.SuppressForbidden;
 import org.elasticsearch.core.Tuple;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNotFoundException;
 import org.elasticsearch.index.IndexVersion;
 import org.elasticsearch.index.IndexVersions;
 import org.elasticsearch.index.shard.ShardId;
@@ -3835,14 +3836,51 @@ public final class SnapshotsService extends AbstractLifecycleComponent implement
 
         @Override
         public void onFailure(Exception e) {
-            logger.warn(
-                () -> format("[%s][%s] failed to create snapshot", snapshot.getRepository(), snapshot.getSnapshotId().getName()),
-                e
-            );
+            final var logLevel = snapshotFailureLogLevel(e);
+            if (logLevel == Level.INFO && logger.isDebugEnabled() == false) {
+                // suppress stack trace at INFO unless extra verbosity is configured
+                logger.info(
+                    format(
+                        "[%s][%s] failed to create snapshot: %s",
+                        snapshot.getRepository(),
+                        snapshot.getSnapshotId().getName(),
+                        e.getMessage()
+                    )
+                );
+            } else {
+                logger.log(
+                    logLevel,
+                    () -> format("[%s][%s] failed to create snapshot", snapshot.getRepository(), snapshot.getSnapshotId().getName()),
+                    e
+                );
+            }
             listener.onFailure(e);
         }
     }
 
+    private static Level snapshotFailureLogLevel(Exception e) {
+        if (MasterService.isPublishFailureException(e)) {
+            // no action needed, the new master will take things from here
+            return Level.INFO;
+        } else if (e instanceof InvalidSnapshotNameException) {
+            // no action needed, typically ILM-related, or a user error
+            return Level.INFO;
+        } else if (e instanceof IndexNotFoundException) {
+            // not worrying, most likely a user error
+            return Level.INFO;
+        } else if (e instanceof SnapshotException) {
+            if (e.getMessage().contains(ReferenceDocs.UNASSIGNED_SHARDS.toString())) {
+                // non-partial snapshot requested but cluster health is not yellow or green; the health is tracked elsewhere so no need to
+                // make more noise here
+                return Level.INFO;
+            }
+        } else if (e instanceof IllegalArgumentException) {
+            // some other user error
+            return Level.INFO;
+        }
+        return Level.WARN;
+    }
+
     private class SnapshotTaskExecutor implements ClusterStateTaskExecutor<SnapshotTask> {
         @Override
         public ClusterState execute(BatchExecutionContext<SnapshotTask> batchExecutionContext) throws Exception {

+ 172 - 6
server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java

@@ -8,6 +8,7 @@
 
 package org.elasticsearch.snapshots;
 
+import org.apache.logging.log4j.Level;
 import org.apache.lucene.util.SetOnce;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
@@ -140,6 +141,7 @@ import org.elasticsearch.features.FeatureService;
 import org.elasticsearch.gateway.MetaStateService;
 import org.elasticsearch.gateway.TransportNodesListGatewayStartedShards;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNotFoundException;
 import org.elasticsearch.index.IndexSettingProviders;
 import org.elasticsearch.index.IndexingPressure;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
@@ -185,6 +187,7 @@ import org.elasticsearch.telemetry.TelemetryProvider;
 import org.elasticsearch.telemetry.tracing.Tracer;
 import org.elasticsearch.test.ClusterServiceUtils;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.MockLogAppender;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.BytesRefRecycler;
 import org.elasticsearch.transport.DisruptableMockTransport;
@@ -232,6 +235,7 @@ import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.either;
 import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.iterableWithSize;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
@@ -1391,7 +1395,6 @@ public class SnapshotResiliencyTests extends ESTestCase {
 
         final var indices = IntStream.range(0, between(1, 4)).mapToObj(i -> "index-" + i).sorted().toList();
         final var repoName = "repo";
-        final var originalSnapshotName = "original-snapshot";
 
         var testListener = SubscribableListener
 
@@ -1423,11 +1426,11 @@ public class SnapshotResiliencyTests extends ESTestCase {
                 }
             })
 
-            // Take a full snapshot for use as the source for future clones
+            // Take the snapshot to check the reaction to having unassigned shards
             .<Void>andThen(
                 (l, ignored) -> client().admin()
                     .cluster()
-                    .prepareCreateSnapshot(repoName, originalSnapshotName)
+                    .prepareCreateSnapshot(repoName, randomIdentifier())
                     .setWaitForCompletion(randomBoolean())
                     .execute(new ActionListener<>() {
                         @Override
@@ -1451,9 +1454,172 @@ public class SnapshotResiliencyTests extends ESTestCase {
                     })
             );
 
-        deterministicTaskQueue.runAllRunnableTasks();
-        assertTrue("executed all runnable tasks but test steps are still incomplete", testListener.isDone());
-        safeAwait(testListener); // shouldn't throw
+        MockLogAppender.assertThatLogger(() -> {
+            deterministicTaskQueue.runAllRunnableTasks();
+            assertTrue("executed all runnable tasks but test steps are still incomplete", testListener.isDone());
+            safeAwait(testListener); // shouldn't throw
+        },
+            SnapshotsService.class,
+            new MockLogAppender.SeenEventExpectation(
+                "INFO log",
+                SnapshotsService.class.getCanonicalName(),
+                Level.INFO,
+                "*failed to create snapshot*the following indices have unassigned primary shards*"
+            )
+        );
+    }
+
+    public void testSnapshotNameAlreadyInUseExceptionLogging() {
+        setupTestCluster(1, 1);
+
+        final var repoName = "repo";
+        final var snapshotName = "test-snapshot";
+
+        final var testListener = createRepoAndIndex(repoName, "index", between(1, 2))
+            // take snapshot once
+            .<CreateSnapshotResponse>andThen(
+                (l, ignored) -> client().admin()
+                    .cluster()
+                    .prepareCreateSnapshot(repoName, snapshotName)
+                    .setWaitForCompletion(true)
+                    .execute(l)
+            )
+            // take snapshot again
+            .<CreateSnapshotResponse>andThen(
+                (l, ignored) -> client().admin()
+                    .cluster()
+                    .prepareCreateSnapshot(repoName, snapshotName)
+                    .setWaitForCompletion(randomBoolean())
+                    .execute(new ActionListener<>() {
+                        @Override
+                        public void onResponse(CreateSnapshotResponse createSnapshotResponse) {
+                            fail("snapshot should not have started");
+                        }
+
+                        @Override
+                        public void onFailure(Exception e) {
+                            assertThat(ExceptionsHelper.unwrapCause(e), instanceOf(SnapshotNameAlreadyInUseException.class));
+                            l.onResponse(null);
+                        }
+                    })
+            );
+
+        MockLogAppender.assertThatLogger(() -> {
+            deterministicTaskQueue.runAllRunnableTasks();
+            assertTrue("executed all runnable tasks but test steps are still incomplete", testListener.isDone());
+            safeAwait(testListener); // shouldn't throw
+        },
+            SnapshotsService.class,
+            new MockLogAppender.SeenEventExpectation(
+                "INFO log",
+                SnapshotsService.class.getCanonicalName(),
+                Level.INFO,
+                Strings.format("*failed to create snapshot*Invalid snapshot name [%s]*", snapshotName)
+            )
+        );
+    }
+
+    public void testIndexNotFoundExceptionLogging() {
+        setupTestCluster(1, 0); // no need for data nodes here
+
+        final var repoName = "repo";
+        final var indexName = "does-not-exist";
+
+        final var testListener = SubscribableListener
+            // create repo
+            .<AcknowledgedResponse>newForked(
+                l -> client().admin()
+                    .cluster()
+                    .preparePutRepository(repoName)
+                    .setType(FsRepository.TYPE)
+                    .setSettings(Settings.builder().put("location", randomAlphaOfLength(10)))
+                    .execute(l)
+            )
+            // take snapshot of index that does not exist
+            .<CreateSnapshotResponse>andThen(
+                (l, ignored) -> client().admin()
+                    .cluster()
+                    .prepareCreateSnapshot(repoName, randomIdentifier())
+                    .setIndices(indexName)
+                    .setWaitForCompletion(randomBoolean())
+                    .execute(new ActionListener<>() {
+                        @Override
+                        public void onResponse(CreateSnapshotResponse createSnapshotResponse) {
+                            fail("snapshot should not have started");
+                        }
+
+                        @Override
+                        public void onFailure(Exception e) {
+                            assertThat(ExceptionsHelper.unwrapCause(e), instanceOf(IndexNotFoundException.class));
+                            l.onResponse(null);
+                        }
+                    })
+            );
+
+        MockLogAppender.assertThatLogger(() -> {
+            deterministicTaskQueue.runAllRunnableTasks();
+            assertTrue("executed all runnable tasks but test steps are still incomplete", testListener.isDone());
+            safeAwait(testListener); // shouldn't throw
+        },
+            SnapshotsService.class,
+            new MockLogAppender.SeenEventExpectation(
+                "INFO log",
+                SnapshotsService.class.getCanonicalName(),
+                Level.INFO,
+                Strings.format("failed to create snapshot: no such index [%s]", indexName)
+            )
+        );
+    }
+
+    public void testIllegalArgumentExceptionLogging() {
+        setupTestCluster(1, 0); // no need for data nodes here
+
+        final var repoName = "repo";
+
+        final var testListener = SubscribableListener
+            // create repo
+            .<AcknowledgedResponse>newForked(
+                l -> client().admin()
+                    .cluster()
+                    .preparePutRepository(repoName)
+                    .setType(FsRepository.TYPE)
+                    .setSettings(Settings.builder().put("location", randomAlphaOfLength(10)))
+                    .execute(l)
+            )
+            // attempt to take snapshot with illegal config ('none' is allowed as a feature state iff it's the only one in the list)
+            .<CreateSnapshotResponse>andThen(
+                (l, ignored) -> client().admin()
+                    .cluster()
+                    .prepareCreateSnapshot(repoName, randomIdentifier())
+                    .setFeatureStates("none", "none")
+                    .setWaitForCompletion(randomBoolean())
+                    .execute(new ActionListener<>() {
+                        @Override
+                        public void onResponse(CreateSnapshotResponse createSnapshotResponse) {
+                            fail("snapshot should not have started");
+                        }
+
+                        @Override
+                        public void onFailure(Exception e) {
+                            assertThat(ExceptionsHelper.unwrapCause(e), instanceOf(IllegalArgumentException.class));
+                            l.onResponse(null);
+                        }
+                    })
+            );
+
+        MockLogAppender.assertThatLogger(() -> {
+            deterministicTaskQueue.runAllRunnableTasks();
+            assertTrue("executed all runnable tasks but test steps are still incomplete", testListener.isDone());
+            safeAwait(testListener); // shouldn't throw
+        },
+            SnapshotsService.class,
+            new MockLogAppender.SeenEventExpectation(
+                "INFO log",
+                SnapshotsService.class.getCanonicalName(),
+                Level.INFO,
+                Strings.format("*failed to create snapshot*other feature states were requested: [none, none]", "")
+            )
+        );
     }
 
     private RepositoryData getRepositoryData(Repository repository) {