Просмотр исходного кода

Add links to troubleshooting docs (#92755)

Users facing problems with cluster coordination seemingly struggle to find the
troubleshooting docs in the manual, without which it's hard to understand or
resolve the problem leading to unnecessary support calls and escalations,
typically at high urgency.

This commit introduces a central list of known (versioned) links to the
relevant parts of the reference manual, and includes those links in the
relevant error messages. There are certainly other places where this
functionality would be valuable, but here we only focus on the troubleshooting
guides for cluster coordination problems.

It does not currently validate that the links work correctly, but that would be
a valuable followup. Experience has shown that it's better to have these links
than not, even given the risk that they might be broken by a future change.

Closes #92741
David Turner 2 лет назад
Родитель
Сommit
629c96ed1a

+ 6 - 0
docs/changelog/92755.yaml

@@ -0,0 +1,6 @@
+pr: 92755
+summary: Add links to troubleshooting docs
+area: Cluster Coordination
+type: enhancement
+issues:
+ - 92741

+ 5 - 2
server/src/main/java/org/elasticsearch/cluster/coordination/ClusterBootstrapService.java

@@ -12,6 +12,7 @@ import org.apache.logging.log4j.Logger;
 import org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfiguration;
 import org.elasticsearch.cluster.metadata.Metadata;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Setting.Property;
 import org.elasticsearch.common.settings.Settings;
@@ -146,10 +147,12 @@ public class ClusterBootstrapService implements Coordinator.PeerFinderListener {
         logger.warn(
             """
                 this node is locked into cluster UUID [{}] but [{}] is set to {}; \
-                remove this setting to avoid possible data loss caused by subsequent cluster bootstrap attempts""",
+                remove this setting to avoid possible data loss caused by subsequent cluster bootstrap attempts; \
+                for further information see {}""",
             clusterUUID,
             INITIAL_MASTER_NODES_SETTING.getKey(),
-            bootstrapRequirements
+            bootstrapRequirements,
+            ReferenceDocs.INITIAL_MASTER_NODES
         );
     }
 

+ 6 - 1
server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java

@@ -14,6 +14,7 @@ import org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfigu
 import org.elasticsearch.cluster.coordination.CoordinationState.VoteCollection;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
@@ -100,7 +101,11 @@ public class ClusterFormationFailureHelper {
                 protected void doRun() {
                     if (isActive()) {
                         logLastFailedJoinAttempt.run();
-                        logger.warn(clusterFormationStateSupplier.get().getDescription());
+                        logger.warn(
+                            "{}; for troubleshooting guidance, see {}",
+                            clusterFormationStateSupplier.get().getDescription(),
+                            ReferenceDocs.DISCOVERY_TROUBLESHOOTING
+                        );
                     }
                 }
 

+ 6 - 2
server/src/main/java/org/elasticsearch/cluster/coordination/JoinReason.java

@@ -8,7 +8,11 @@
 
 package org.elasticsearch.cluster.coordination;
 
+import org.elasticsearch.common.ReferenceDocs;
+import org.elasticsearch.core.Nullable;
+
 /**
- * @param message Message describing the reason for the node joining
+ * @param message      Message describing the reason for the node joining
+ * @param guidanceDocs An optional link to troubleshooting guidance docs
  */
-public record JoinReason(String message) {}
+public record JoinReason(String message, @Nullable ReferenceDocs guidanceDocs) {}

+ 5 - 4
server/src/main/java/org/elasticsearch/cluster/coordination/JoinReasonService.java

@@ -12,6 +12,7 @@ import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.service.ClusterApplierService;
 import org.elasticsearch.cluster.service.MasterService;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.core.Nullable;
@@ -261,7 +262,7 @@ public class JoinReasonService {
                 description.append(", [").append(removalCount).append("] total removals");
             }
 
-            return new JoinReason(description.toString());
+            return new JoinReason(description.toString(), isRestarted ? null : ReferenceDocs.UNSTABLE_CLUSTER_TROUBLESHOOTING);
         }
 
         @Override
@@ -270,7 +271,7 @@ public class JoinReasonService {
         }
     }
 
-    private static final JoinReason COMPLETING_ELECTION = new JoinReason("completing election");
-    private static final JoinReason NEW_NODE_JOINING = new JoinReason("joining");
-    private static final JoinReason KNOWN_NODE_REJOINING = new JoinReason("rejoining");
+    private static final JoinReason COMPLETING_ELECTION = new JoinReason("completing election", null);
+    private static final JoinReason NEW_NODE_JOINING = new JoinReason("joining", null);
+    private static final JoinReason KNOWN_NODE_REJOINING = new JoinReason("rejoining", null);
 }

+ 15 - 5
server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java

@@ -136,11 +136,21 @@ public class NodeJoinExecutor implements ClusterStateTaskExecutor<JoinTask> {
                     }
                 }
                 onTaskSuccess.add(() -> {
-                    logger.info(
-                        "node-join: [{}] with reason [{}]",
-                        nodeJoinTask.node().descriptionWithoutAttributes(),
-                        nodeJoinTask.reason().message()
-                    );
+                    final var reason = nodeJoinTask.reason();
+                    if (reason.guidanceDocs() == null) {
+                        logger.info(
+                            "node-join: [{}] with reason [{}]",
+                            nodeJoinTask.node().descriptionWithoutAttributes(),
+                            reason.message()
+                        );
+                    } else {
+                        logger.warn(
+                            "node-join: [{}] with reason [{}]; for troubleshooting guidance, see {}",
+                            nodeJoinTask.node().descriptionWithoutAttributes(),
+                            reason.message(),
+                            reason.guidanceDocs()
+                        );
+                    }
                     nodeJoinTask.listener().onResponse(null);
                 });
             }

+ 68 - 0
server/src/main/java/org/elasticsearch/common/ReferenceDocs.java

@@ -0,0 +1,68 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.common;
+
+import org.elasticsearch.Build;
+import org.elasticsearch.Version;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Encapsulates links to pages in the reference docs, so that for example we can include URLs in logs and API outputs. Each instance's
+ * {@link #toString()} yields (a string representation of) a URL for the relevant docs.
+ */
+public enum ReferenceDocs {
+    INITIAL_MASTER_NODES("important-settings.html#initial_master_nodes"),
+    DISCOVERY_TROUBLESHOOTING("discovery-troubleshooting.html"),
+    UNSTABLE_CLUSTER_TROUBLESHOOTING("cluster-fault-detection.html#cluster-fault-detection-troubleshooting");
+
+    private final String relativePath;
+
+    ReferenceDocs(String relativePath) {
+        this.relativePath = relativePath;
+    }
+
+    static final String UNRELEASED_VERSION_COMPONENT = "master";
+    static final String VERSION_COMPONENT = getVersionComponent(Version.CURRENT, Build.CURRENT.isSnapshot());
+
+    /**
+     * Compute the version component of the URL path (e.g. {@code 8.5} or {@code master}) for a particular version of Elasticsearch. Exposed
+     * for testing, but all items use {@link #VERSION_COMPONENT} ({@code getVersionComponent(Version.CURRENT, Build.CURRENT.isSnapshot())})
+     * which relates to the current version and build.
+     */
+    static String getVersionComponent(Version version, boolean isSnapshot) {
+        return isSnapshot && version.revision == 0 ? UNRELEASED_VERSION_COMPONENT : version.major + "." + version.minor;
+    }
+
+    @Override
+    public String toString() {
+        return "https://www.elastic.co/guide/en/elasticsearch/reference/" + VERSION_COMPONENT + "/" + relativePath;
+    }
+
+    /**
+     * @return a list of links which are expected to exist for a particular version of Elasticsearch, which is either all links (if the docs
+     * are released) or no links (otherwise). Exposed for testing the behaviour of different versions, but for the current version use
+     * {@link #linksToVerify()}.
+     */
+    static List<ReferenceDocs> linksToVerify(Version version, boolean isSnapshot) {
+        if (version.revision == 0 && isSnapshot == false) {
+            return List.of();
+        }
+        return Arrays.stream(values()).toList();
+    }
+
+    /**
+     * @return a list of links which are expected to exist for the current version of Elasticsearch, which is either all links (if the docs
+     * are released) or no links (otherwise).
+     */
+    public static List<ReferenceDocs> linksToVerify() {
+        return linksToVerify(Version.CURRENT, Build.CURRENT.isSnapshot());
+    }
+}

+ 3 - 1
server/src/test/java/org/elasticsearch/cluster/coordination/ClusterBootstrapServiceTests.java

@@ -707,7 +707,9 @@ public class ClusterBootstrapServiceTests extends ESTestCase {
 
             final var warningMessagePattern = """
                 this node is locked into cluster UUID [test-uuid] but [cluster.initial_master_nodes] is set to [node1, node2]; \
-                remove this setting to avoid possible data loss caused by subsequent cluster bootstrap attempts""";
+                remove this setting to avoid possible data loss caused by subsequent cluster bootstrap attempts; \
+                for further information see \
+                https://www.elastic.co/guide/en/elasticsearch/reference/*/important-settings.html#initial_master_nodes""";
 
             mockAppender.addExpectation(
                 new MockLogAppender.SeenEventExpectation(

+ 9 - 0
server/src/test/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelperTests.java

@@ -112,6 +112,15 @@ public class ClusterFormationFailureHelperTests extends ESTestCase {
         mockLogAppender.addExpectation(
             new MockLogAppender.SeenEventExpectation("master not discovered", LOGGER_NAME, Level.WARN, "master not discovered")
         );
+        mockLogAppender.addExpectation(
+            new MockLogAppender.SeenEventExpectation(
+                "troubleshooting link",
+                LOGGER_NAME,
+                Level.WARN,
+                "* for troubleshooting guidance, see "
+                    + "https://www.elastic.co/guide/en/elasticsearch/reference/*/discovery-troubleshooting.html*"
+            )
+        );
         try (var ignored = mockLogAppender.capturing(ClusterFormationFailureHelper.class)) {
             while (warningCount.get() == 0) {
                 assertTrue(clusterFormationFailureHelper.isRunning());

+ 16 - 9
server/src/test/java/org/elasticsearch/cluster/coordination/JoinReasonServiceTests.java

@@ -12,6 +12,7 @@ import org.elasticsearch.Version;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.transport.TransportAddress;
 import org.elasticsearch.test.ESTestCase;
@@ -59,7 +60,7 @@ public class JoinReasonServiceTests extends ESTestCase {
 
         assertThat(
             joinReasonService.getJoinReason(discoveryNode, LEADER),
-            matches("joining, removed [1.2s/1234ms] ago by [" + master.getName() + "]")
+            matchesNeedingGuidance("joining, removed [1.2s/1234ms] ago by [" + master.getName() + "]")
         );
 
         joinReasonService.onNodeRemoved(discoveryNode, "test removal");
@@ -67,7 +68,7 @@ public class JoinReasonServiceTests extends ESTestCase {
 
         assertThat(
             joinReasonService.getJoinReason(discoveryNode, LEADER),
-            matches("joining, removed [5.5s/5555ms] ago with reason [test removal]")
+            matchesNeedingGuidance("joining, removed [5.5s/5555ms] ago with reason [test removal]")
         );
 
         joinReasonService.onClusterStateApplied(withNode);
@@ -75,14 +76,14 @@ public class JoinReasonServiceTests extends ESTestCase {
 
         assertThat(
             joinReasonService.getJoinReason(discoveryNode, LEADER),
-            matches("joining, removed [0ms] ago by [" + master.getName() + "], [2] total removals")
+            matchesNeedingGuidance("joining, removed [0ms] ago by [" + master.getName() + "], [2] total removals")
         );
 
         joinReasonService.onNodeRemoved(discoveryNode, "second test removal");
 
         assertThat(
             joinReasonService.getJoinReason(discoveryNode, LEADER),
-            matches("joining, removed [0ms] ago with reason [second test removal], [2] total removals")
+            matchesNeedingGuidance("joining, removed [0ms] ago with reason [second test removal], [2] total removals")
         );
 
         final DiscoveryNode rebootedNode = new DiscoveryNode(
@@ -109,7 +110,7 @@ public class JoinReasonServiceTests extends ESTestCase {
 
         assertThat(
             joinReasonService.getJoinReason(rebootedNode, LEADER),
-            matches("joining, removed [0ms] ago with reason [third test removal]")
+            matchesNeedingGuidance("joining, removed [0ms] ago with reason [third test removal]")
         );
 
         joinReasonService.onClusterStateApplied(withRebootedNode);
@@ -118,7 +119,7 @@ public class JoinReasonServiceTests extends ESTestCase {
 
         assertThat(
             joinReasonService.getJoinReason(rebootedNode, LEADER),
-            matches("joining, removed [0ms] ago with reason [fourth test removal], [2] total removals")
+            matchesNeedingGuidance("joining, removed [0ms] ago with reason [fourth test removal], [2] total removals")
         );
 
         joinReasonService.onClusterStateApplied(withRebootedNode);
@@ -127,7 +128,7 @@ public class JoinReasonServiceTests extends ESTestCase {
 
         assertThat(
             joinReasonService.getJoinReason(discoveryNode, LEADER),
-            matches("joining, removed [0ms] ago by [" + master.getName() + "]")
+            matchesNeedingGuidance("joining, removed [0ms] ago by [" + master.getName() + "]")
         );
 
         assertThat(
@@ -179,7 +180,10 @@ public class JoinReasonServiceTests extends ESTestCase {
 
         // remove almost enough other nodes and verify that we're still tracking the target node
         joinReasonService.onClusterStateApplied(almostCleanupNodes);
-        assertThat(joinReasonService.getJoinReason(targetNode, LEADER), matches("joining, removed [1ms] ago with reason [test]"));
+        assertThat(
+            joinReasonService.getJoinReason(targetNode, LEADER),
+            matchesNeedingGuidance("joining, removed [1ms] ago with reason [test]")
+        );
 
         // remove one more node to trigger the cleanup and forget about the target node
         joinReasonService.onClusterStateApplied(cleanupNodes);
@@ -202,7 +206,10 @@ public class JoinReasonServiceTests extends ESTestCase {
     }
 
     private static Matcher<JoinReason> matches(String message) {
-        return equalTo(new JoinReason(message));
+        return equalTo(new JoinReason(message, null));
     }
 
+    private static Matcher<JoinReason> matchesNeedingGuidance(String message) {
+        return equalTo(new JoinReason(message, ReferenceDocs.UNSTABLE_CLUSTER_TROUBLESHOOTING));
+    }
 }

+ 31 - 1
server/src/test/java/org/elasticsearch/cluster/coordination/NodeJoinExecutorTests.java

@@ -28,6 +28,7 @@ import org.elasticsearch.cluster.routing.RerouteService;
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
 import org.elasticsearch.cluster.service.ClusterStateTaskExecutorUtils;
 import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ClusterServiceUtils;
@@ -174,7 +175,7 @@ public class NodeJoinExecutorTests extends ESTestCase {
         return VersionUtils.randomVersionBetween(random(), Version.CURRENT.minimumIndexCompatibilityVersion(), Version.CURRENT);
     }
 
-    private static final JoinReason TEST_REASON = new JoinReason("test");
+    private static final JoinReason TEST_REASON = new JoinReason("test", null);
 
     public void testUpdatesNodeWithNewRoles() throws Exception {
         // Node roles vary by version, and new roles are suppressed for BWC. This means we can receive a join from a node that's already
@@ -624,6 +625,35 @@ public class NodeJoinExecutorTests extends ESTestCase {
                 )
             );
             appender.assertAllExpectationsMatched();
+
+            final var node2 = new DiscoveryNode(UUIDs.base64UUID(), buildNewFakeTransportAddress(), Version.CURRENT);
+            final var testReasonWithLink = new JoinReason("test", ReferenceDocs.UNSTABLE_CLUSTER_TROUBLESHOOTING);
+            appender.addExpectation(
+                new MockLogAppender.SeenEventExpectation(
+                    "warn message with troubleshooting link",
+                    LOGGER_NAME,
+                    Level.WARN,
+                    "node-join: ["
+                        + node2.descriptionWithoutAttributes()
+                        + "] with reason ["
+                        + testReasonWithLink.message()
+                        + "]; for troubleshooting guidance, see https://www.elastic.co/guide/en/elasticsearch/reference/*"
+                )
+            );
+            assertNull(
+                PlainActionFuture.<Void, RuntimeException>get(
+                    future -> clusterService.getMasterService()
+                        .submitStateUpdateTask(
+                            "test",
+                            JoinTask.singleNode(node2, testReasonWithLink, future, 0L),
+                            ClusterStateTaskConfig.build(Priority.NORMAL),
+                            executor
+                        ),
+                    10,
+                    TimeUnit.SECONDS
+                )
+            );
+            appender.assertAllExpectationsMatched();
         } finally {
             TestThreadPool.terminate(threadPool, 10, TimeUnit.SECONDS);
         }

+ 57 - 0
server/src/test/java/org/elasticsearch/common/ReferenceDocsTests.java

@@ -0,0 +1,57 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.common;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.common.io.Streams;
+import org.elasticsearch.core.SuppressForbidden;
+import org.elasticsearch.test.ESTestCase;
+
+import java.io.IOException;
+import java.net.URL;
+
+import static org.elasticsearch.common.ReferenceDocs.getVersionComponent;
+import static org.elasticsearch.common.ReferenceDocs.linksToVerify;
+
+public class ReferenceDocsTests extends ESTestCase {
+
+    public void testVersionComponent() {
+        // Snapshot x.y.0 versions are unreleased so link to master
+        assertEquals("master", getVersionComponent(Version.V_8_7_0, true));
+
+        // Snapshot x.y.z versions with z>0 mean that x.y.0 is released so have a properly versioned docs link
+        assertEquals("8.5", getVersionComponent(Version.V_8_5_1, true));
+
+        // Non-snapshot versions are to be released so have a properly versioned docs link
+        assertEquals("8.7", getVersionComponent(Version.V_8_7_0, false));
+    }
+
+    public void testLinksToVerify() {
+        // Snapshot x.y.0 versions are unreleased so we link to master and these links are expected to exist
+        assertFalse(linksToVerify(Version.V_8_7_0, true).isEmpty());
+
+        // Snapshot x.y.z versions with z>0 mean that x.y.0 is released so links are expected to exist
+        assertFalse(linksToVerify(Version.V_8_5_1, true).isEmpty());
+
+        // Non-snapshot x.y.0 versions may not be released yet, and the docs are published on release, so we cannot verify these links
+        assertTrue(linksToVerify(Version.V_8_7_0, false).isEmpty());
+    }
+
+    @AwaitsFix(bugUrl = "TODO")
+    @SuppressForbidden(reason = "never executed")
+    public void testDocsExist() throws IOException {
+        // cannot run as a unit test due to security manager restrictions - TODO create a separate Gradle task for this
+        for (ReferenceDocs docsLink : linksToVerify()) {
+            try (var stream = new URL(docsLink.toString()).openStream()) {
+                Streams.readFully(stream);
+                // TODO also for URLs that contain a fragment id, verify that the fragment exists
+            }
+        }
+    }
+}

+ 1 - 1
server/src/test/java/org/elasticsearch/indices/cluster/ClusterStateChanges.java

@@ -397,7 +397,7 @@ public class ClusterStateChanges {
         return execute(transportClusterRerouteAction, request, state);
     }
 
-    private static final JoinReason DUMMY_REASON = new JoinReason("dummy reason");
+    private static final JoinReason DUMMY_REASON = new JoinReason("dummy reason", null);
 
     public ClusterState addNode(ClusterState clusterState, DiscoveryNode discoveryNode) {
         return runTasks(