Răsfoiți Sursa

Docs links from repo analysis failures (#110681)

We still get too many cases about snapshot repositories which claim to
be S3-compatible but then fail repository analysis because of genuine
incompatibilities or implementation bugs. The info is all there in the
manual but it's not very easy to find. This commit adds more detail to
the response message, including docs links, to help direct users to the
information they need.
David Turner 1 an în urmă
părinte
comite
8d3b0ade5c

+ 16 - 0
modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3Repository.java

@@ -14,6 +14,7 @@ import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.ActionRunnable;
 import org.elasticsearch.cluster.metadata.RepositoryMetadata;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.blobstore.BlobPath;
 import org.elasticsearch.common.blobstore.BlobStore;
@@ -443,4 +444,19 @@ class S3Repository extends MeteredBlobStoreRepository {
         }
         super.doClose();
     }
+
+    @Override
+    public String getAnalysisFailureExtraDetail() {
+        return Strings.format(
+            """
+                Elasticsearch observed the storage system underneath this repository behaved incorrectly which indicates it is not \
+                suitable for use with Elasticsearch snapshots. Typically this happens when using storage other than AWS S3 which \
+                incorrectly claims to be S3-compatible. If so, please report this incompatibility to your storage supplier. Do not report \
+                Elasticsearch issues involving storage systems which claim to be S3-compatible unless you can demonstrate that the same \
+                issue exists when using a genuine AWS S3 repository. See [%s] for further information about repository analysis, and [%s] \
+                for further information about support for S3-compatible repository implementations.""",
+            ReferenceDocs.SNAPSHOT_REPOSITORY_ANALYSIS,
+            ReferenceDocs.S3_COMPATIBLE_REPOSITORIES
+        );
+    }
 }

+ 22 - 0
modules/repository-s3/src/test/java/org/elasticsearch/repositories/s3/S3RepositoryTests.java

@@ -11,6 +11,7 @@ package org.elasticsearch.repositories.s3;
 import com.amazonaws.services.s3.AbstractAmazonS3;
 
 import org.elasticsearch.cluster.metadata.RepositoryMetadata;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
@@ -28,6 +29,7 @@ import org.mockito.Mockito;
 
 import java.util.Map;
 
+import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.not;
@@ -152,4 +154,24 @@ public class S3RepositoryTests extends ESTestCase {
         );
     }
 
+    public void testAnalysisFailureDetail() {
+        try (
+            S3Repository s3repo = createS3Repo(
+                new RepositoryMetadata("dummy-repo", "mock", Settings.builder().put(S3Repository.BUCKET_SETTING.getKey(), "bucket").build())
+            )
+        ) {
+            assertThat(
+                s3repo.getAnalysisFailureExtraDetail(),
+                allOf(
+                    containsString("storage system underneath this repository behaved incorrectly"),
+                    containsString("incorrectly claims to be S3-compatible"),
+                    containsString("report this incompatibility to your storage supplier"),
+                    containsString("unless you can demonstrate that the same issue exists when using a genuine AWS S3 repository"),
+                    containsString(ReferenceDocs.SNAPSHOT_REPOSITORY_ANALYSIS.toString()),
+                    containsString(ReferenceDocs.S3_COMPATIBLE_REPOSITORIES.toString())
+                )
+            );
+        }
+    }
+
 }

+ 2 - 0
server/src/main/java/org/elasticsearch/common/ReferenceDocs.java

@@ -75,6 +75,8 @@ public enum ReferenceDocs {
     NETWORK_THREADING_MODEL,
     ALLOCATION_EXPLAIN_API,
     NETWORK_BINDING_AND_PUBLISHING,
+    SNAPSHOT_REPOSITORY_ANALYSIS,
+    S3_COMPATIBLE_REPOSITORIES,
     // this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
     ;
 

+ 12 - 0
server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

@@ -3946,4 +3946,16 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
     public int getReadBufferSizeInBytes() {
         return bufferSize;
     }
+
+    /**
+     * @return extra information to be included in the exception message emitted on failure of a repository analysis.
+     */
+    public String getAnalysisFailureExtraDetail() {
+        return Strings.format(
+            """
+                Elasticsearch observed the storage system underneath this repository behaved incorrectly which indicates it is not \
+                suitable for use with Elasticsearch snapshots. See [%s] for further information.""",
+            ReferenceDocs.SNAPSHOT_REPOSITORY_ANALYSIS
+        );
+    }
 }

+ 3 - 1
server/src/main/resources/org/elasticsearch/common/reference-docs-links.json

@@ -35,5 +35,7 @@
   "EXECUTABLE_JNA_TMPDIR": "executable-jna-tmpdir.html",
   "NETWORK_THREADING_MODEL": "modules-network.html#modules-network-threading-model",
   "ALLOCATION_EXPLAIN_API": "cluster-allocation-explain.html",
-  "NETWORK_BINDING_AND_PUBLISHING": "modules-network.html#modules-network-binding-publishing"
+  "NETWORK_BINDING_AND_PUBLISHING": "modules-network.html#modules-network-binding-publishing",
+  "SNAPSHOT_REPOSITORY_ANALYSIS": "repo-analysis-api.html",
+  "S3_COMPATIBLE_REPOSITORIES": "repository-s3.html#repository-s3-compatible-services"
 }

+ 1 - 1
x-pack/plugin/snapshot-repo-test-kit/qa/rest/src/yamlRestTest/resources/rest-api-spec/test/10_analyze.yml

@@ -175,6 +175,6 @@ setup:
 
   - match: { status: 500 }
   - match: { error.type: repository_verification_exception }
-  - match: { error.reason: "/.*test_repo_slow..analysis.failed.*/" }
+  - match: { error.reason: "/.*test_repo_slow..Repository.analysis.timed.out.*/" }
   - match: { error.root_cause.0.type: repository_verification_exception }
   - match: { error.root_cause.0.reason: "/.*test_repo_slow..analysis.timed.out.after..1s.*/" }

+ 21 - 3
x-pack/plugin/snapshot-repo-test-kit/src/internalClusterTest/java/org/elasticsearch/repositories/blobstore/testkit/RepositoryAnalysisFailureIT.java

@@ -11,6 +11,7 @@ import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.cluster.metadata.RepositoryMetadata;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.blobstore.BlobContainer;
 import org.elasticsearch.common.blobstore.BlobPath;
 import org.elasticsearch.common.blobstore.BlobStore;
@@ -363,6 +364,17 @@ public class RepositoryAnalysisFailureIT extends AbstractSnapshotIntegTestCase {
         }
     }
 
+    private static void assertAnalysisFailureMessage(String message) {
+        assertThat(
+            message,
+            allOf(
+                containsString("Elasticsearch observed the storage system underneath this repository behaved incorrectly"),
+                containsString("not suitable for use with Elasticsearch snapshots"),
+                containsString(ReferenceDocs.SNAPSHOT_REPOSITORY_ANALYSIS.toString())
+            )
+        );
+    }
+
     public void testTimesOutSpinningRegisterAnalysis() {
         final RepositoryAnalyzeAction.Request request = new RepositoryAnalyzeAction.Request("test-repo");
         request.timeout(TimeValue.timeValueMillis(between(1, 1000)));
@@ -375,7 +387,13 @@ public class RepositoryAnalysisFailureIT extends AbstractSnapshotIntegTestCase {
             }
         });
         final var exception = expectThrows(RepositoryVerificationException.class, () -> analyseRepository(request));
-        assertThat(exception.getMessage(), containsString("analysis failed"));
+        assertThat(
+            exception.getMessage(),
+            allOf(
+                containsString("Repository analysis timed out. Consider specifying a longer timeout"),
+                containsString(ReferenceDocs.SNAPSHOT_REPOSITORY_ANALYSIS.toString())
+            )
+        );
         assertThat(
             asInstanceOf(RepositoryVerificationException.class, exception.getCause()).getMessage(),
             containsString("analysis timed out")
@@ -391,7 +409,7 @@ public class RepositoryAnalysisFailureIT extends AbstractSnapshotIntegTestCase {
             }
         });
         final var exception = expectThrows(RepositoryVerificationException.class, () -> analyseRepository(request));
-        assertThat(exception.getMessage(), containsString("analysis failed"));
+        assertAnalysisFailureMessage(exception.getMessage());
         assertThat(
             asInstanceOf(RepositoryVerificationException.class, ExceptionsHelper.unwrapCause(exception.getCause())).getMessage(),
             allOf(containsString("uncontended register operation failed"), containsString("did not observe any value"))
@@ -407,7 +425,7 @@ public class RepositoryAnalysisFailureIT extends AbstractSnapshotIntegTestCase {
             }
         });
         final var exception = expectThrows(RepositoryVerificationException.class, () -> analyseRepository(request));
-        assertThat(exception.getMessage(), containsString("analysis failed"));
+        assertAnalysisFailureMessage(exception.getMessage());
         final var cause = ExceptionsHelper.unwrapCause(exception.getCause());
         if (cause instanceof IOException ioException) {
             assertThat(ioException.getMessage(), containsString("empty register update rejected"));

+ 26 - 14
x-pack/plugin/snapshot-repo-test-kit/src/main/java/org/elasticsearch/repositories/blobstore/testkit/RepositoryAnalyzeAction.java

@@ -28,6 +28,7 @@ import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.blobstore.BlobContainer;
@@ -387,6 +388,9 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
         private final List<BlobAnalyzeAction.Response> responses;
         private final RepositoryPerformanceSummary.Builder summary = new RepositoryPerformanceSummary.Builder();
 
+        private final RepositoryVerificationException analysisCancelledException;
+        private final RepositoryVerificationException analysisTimedOutException;
+
         public AsyncAction(
             TransportService transportService,
             BlobStoreRepository repository,
@@ -410,6 +414,12 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
             this.listener = ActionListener.runBefore(listener, () -> cancellationListener.onResponse(null));
 
             responses = new ArrayList<>(request.blobCount);
+
+            this.analysisCancelledException = new RepositoryVerificationException(request.repositoryName, "analysis cancelled");
+            this.analysisTimedOutException = new RepositoryVerificationException(
+                request.repositoryName,
+                "analysis timed out after [" + request.getTimeout() + "]"
+            );
         }
 
         private boolean setFirstFailure(Exception e) {
@@ -453,12 +463,7 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
                 assert e instanceof ElasticsearchTimeoutException : e;
                 if (isRunning()) {
                     // if this CAS fails then we're already failing for some other reason, nbd
-                    setFirstFailure(
-                        new RepositoryVerificationException(
-                            request.repositoryName,
-                            "analysis timed out after [" + request.getTimeout() + "]"
-                        )
-                    );
+                    setFirstFailure(analysisTimedOutException);
                 }
             }
         }
@@ -472,7 +477,7 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
             cancellationListener.addTimeout(request.getTimeout(), repository.threadPool(), EsExecutors.DIRECT_EXECUTOR_SERVICE);
             cancellationListener.addListener(new CheckForCancelListener());
 
-            task.addListener(() -> setFirstFailure(new RepositoryVerificationException(request.repositoryName, "analysis cancelled")));
+            task.addListener(() -> setFirstFailure(analysisCancelledException));
 
             final Random random = new Random(request.getSeed());
             final List<DiscoveryNode> nodes = getSnapshotNodes(discoveryNodes);
@@ -873,13 +878,20 @@ public class RepositoryAnalyzeAction extends HandledTransportAction<RepositoryAn
                 );
             } else {
                 logger.debug(() -> "analysis of repository [" + request.repositoryName + "] failed", exception);
-                listener.onFailure(
-                    new RepositoryVerificationException(
-                        request.getRepositoryName(),
-                        "analysis failed, you may need to manually remove [" + blobPath + "]",
-                        exception
-                    )
-                );
+
+                final String failureDetail;
+                if (exception == analysisCancelledException) {
+                    failureDetail = "Repository analysis was cancelled.";
+                } else if (exception == analysisTimedOutException) {
+                    failureDetail = Strings.format("""
+                        Repository analysis timed out. Consider specifying a longer timeout using the [?timeout] request parameter. See \
+                        [%s] for more information.""", ReferenceDocs.SNAPSHOT_REPOSITORY_ANALYSIS);
+                } else {
+                    failureDetail = repository.getAnalysisFailureExtraDetail();
+                }
+                listener.onFailure(new RepositoryVerificationException(request.getRepositoryName(), Strings.format("""
+                    %s Elasticsearch attempted to remove the data it wrote at [%s] but may have left some behind. If so, \
+                    please now remove this data manually.""", failureDetail, blobPath), exception));
             }
         }
     }