Browse Source

Make allocation explanations more actionable (#83983)

The cluster allocation explain API includes a top-level status
indicating to the user whether the shard can be assigned/rebalanced/etc
or not. Today this status is fairly terse and experience shows that
users sometimes struggle to understand how to interpret it and to decide
on follow-up actions.

This commit makes the top-level explanation more detailed and
actionable. For instance, in cases like `THROTTLED`, where the status
is transient, we instruct the user to wait; if a shard is lost we say to
restore it from a snapshot; if a shard cannot be assigned we say to
choose a specific node where its assignment is expected and to address
the obstacles.

Co-authored-by: James Rodewig <james.rodewig@elastic.co>
David Turner 3 years ago
parent
commit
02f38e3da9

+ 5 - 0
docs/changelog/83983.yaml

@@ -0,0 +1,5 @@
+pr: 83983
+summary: Make allocation explanations more actionable
+area: Allocation
+type: enhancement
+issues: []

+ 7 - 6
docs/reference/cluster/allocation-explain.asciidoc

@@ -107,7 +107,8 @@ GET _cluster/allocation/explain
 ----
 // TEST[continued]
 
-The API response indicates the shard is allocated to a nonexistent node.
+The API response indicates the shard can only be allocated to a nonexistent
+node.
 
 [source,console-result]
 ----
@@ -122,7 +123,7 @@ The API response indicates the shard is allocated to a nonexistent node.
     "last_allocation_status" : "no"
   },
   "can_allocate" : "no",                          <3>
-  "allocate_explanation" : "cannot allocate because allocation is not permitted to any of the nodes",
+  "allocate_explanation" : "Elasticsearch isn't allowed to allocate this shard to any of the nodes in the cluster. Choose a node to which you expect this shard to be allocated, find this node in the node-by-node explanation, and address the reasons which prevent Elasticsearch from allocating this shard there.",
   "node_allocation_decisions" : [
     {
       "node_id" : "8qt2rY-pT6KNZB3-hGfLnw",
@@ -171,7 +172,7 @@ primary shard that was previously allocated.
     "last_allocation_status" : "no_valid_shard_copy"
   },
   "can_allocate" : "no_valid_shard_copy",
-  "allocate_explanation" : "cannot allocate because a previous copy of the primary shard existed but can no longer be found on the nodes in the cluster"
+  "allocate_explanation" : "Elasticsearch can't allocate this shard because there are no copies of its data in the cluster. Elasticsearch will allocate this shard when a node holding a good copy of its data joins the cluster. If no such node is available, restore this index from a recent snapshot."
 }
 ----
 // NOTCONSOLE
@@ -195,7 +196,7 @@ unassigned due to <<delayed-allocation,delayed allocation>>.
     "last_allocation_status" : "no_attempt"
   },
   "can_allocate" : "allocation_delayed",
-  "allocate_explanation" : "cannot allocate because the cluster is still waiting 59.8s for the departed node holding a replica to rejoin, despite being allowed to allocate the shard to at least one other node",
+  "allocate_explanation" : "The node containing this shard copy recently left the cluster. Elasticsearch is waiting for it to return. If the node does not return within [59.8s] then Elasticsearch will allocate this shard to another node. Please wait.",
   "configured_delay" : "1m",                      <1>
   "configured_delay_in_millis" : 60000,
   "remaining_delay" : "59.8s",                    <2>
@@ -260,7 +261,7 @@ and must be reallocated.
     }
   ],
   "can_move_to_other_node" : "no",                <3>
-  "move_explanation" : "cannot move shard to another node, even though it is not allowed to remain on its current node",
+  "move_explanation" : "This shard may not remain on its current node, but Elasticsearch isn't allowed to move it to another node. Choose a node to which you expect this shard to be allocated, find this node in the node-by-node explanation, and address the reasons which prevent Elasticsearch from allocating this shard there.",
   "node_allocation_decisions" : [
     {
       "node_id" : "_P8olZS8Twax9u6ioN-GGA",
@@ -305,7 +306,7 @@ cluster balance.
   "can_remain_on_current_node" : "yes",
   "can_rebalance_cluster" : "yes",                <1>
   "can_rebalance_to_other_node" : "no",           <2>
-  "rebalance_explanation" : "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance",
+  "rebalance_explanation" : "This shard is in a well-balanced location and satisfies all allocation rules so it will remain on this node. Elasticsearch cannot improve the cluster balance by moving it to another node. If you expect this shard to be rebalanced to another node, find the other node in the node-by-node explanation and address the reasons which prevent Elasticsearch from rebalancing this shard there.",
   "node_allocation_decisions" : [
     {
       "node_id" : "oE3EGFc8QN-Tdi5FFEprIA",

+ 32 - 65
server/src/internalClusterTest/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java

@@ -21,6 +21,7 @@ import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus;
 import org.elasticsearch.cluster.routing.UnassignedInfo.Reason;
 import org.elasticsearch.cluster.routing.allocation.AllocateUnassignedDecision;
 import org.elasticsearch.cluster.routing.allocation.AllocationDecision;
+import org.elasticsearch.cluster.routing.allocation.Explanations;
 import org.elasticsearch.cluster.routing.allocation.MoveDecision;
 import org.elasticsearch.cluster.routing.allocation.NodeAllocationResult;
 import org.elasticsearch.cluster.routing.allocation.decider.Decision;
@@ -47,6 +48,7 @@ import java.util.Map;
 import java.util.Set;
 
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
+import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
@@ -114,16 +116,9 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
                 || allocateDecision.getAllocationDecision() == AllocationDecision.AWAITING_INFO
         );
         if (allocateDecision.getAllocationDecision() == AllocationDecision.NO_VALID_SHARD_COPY) {
-            assertEquals(
-                "cannot allocate because a previous copy of the primary shard existed but can no longer be "
-                    + "found on the nodes in the cluster",
-                allocateDecision.getExplanation()
-            );
+            assertEquals(Explanations.Allocation.NO_COPIES, allocateDecision.getExplanation());
         } else {
-            assertEquals(
-                "cannot allocate because information about existing shard data is still being retrieved from some of the nodes",
-                allocateDecision.getExplanation()
-            );
+            assertEquals(Explanations.Allocation.AWAITING_INFO, allocateDecision.getExplanation());
         }
         assertNull(allocateDecision.getAllocationId());
         assertNull(allocateDecision.getTargetNode());
@@ -143,11 +138,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
                 parser.nextToken();
                 assertEquals("allocate_explanation", parser.currentName());
                 parser.nextToken();
-                assertEquals(
-                    "cannot allocate because a previous copy of the primary shard existed but can no longer be found "
-                        + "on the nodes in the cluster",
-                    parser.text()
-                );
+                assertEquals(Explanations.Allocation.NO_COPIES, parser.text());
                 verifyStaleShardCopyNodeDecisions(parser, 1, Collections.emptySet());
             }
         }
@@ -215,10 +206,13 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         assertTrue(allocateDecision.isDecisionTaken());
         assertFalse(moveDecision.isDecisionTaken());
         assertEquals(AllocationDecision.ALLOCATION_DELAYED, allocateDecision.getAllocationDecision());
-        assertThat(allocateDecision.getExplanation(), startsWith("cannot allocate because the cluster is still waiting"));
         assertThat(
             allocateDecision.getExplanation(),
-            containsString("despite being allowed to allocate the shard to at least one other node")
+            allOf(
+                containsString("The node containing this shard copy recently left the cluster. Elasticsearch is waiting for it to return."),
+                containsString("If the node does not return within ["),
+                containsString("] then Elasticsearch will allocate this shard to another node. Please wait.")
+            )
         );
         assertNull(allocateDecision.getAllocationId());
         assertNull(allocateDecision.getTargetNode());
@@ -265,7 +259,10 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             parser.nextToken();
             assertEquals("allocate_explanation", parser.currentName());
             parser.nextToken();
-            assertThat(parser.text(), startsWith("cannot allocate because the cluster is still waiting"));
+            assertThat(
+                parser.text(),
+                startsWith("The node containing this shard copy recently left the cluster. Elasticsearch is waiting for it to return.")
+            );
             parser.nextToken();
             assertEquals("configured_delay_in_millis", parser.currentName());
             parser.nextToken();
@@ -346,12 +343,9 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         AllocationDecision decisionToAllocate = allocateDecision.getAllocationDecision();
         assertTrue(decisionToAllocate == AllocationDecision.AWAITING_INFO || decisionToAllocate == AllocationDecision.NO);
         if (decisionToAllocate == AllocationDecision.AWAITING_INFO) {
-            assertEquals(
-                "cannot allocate because information about existing shard data is still being retrieved from some of the nodes",
-                allocateDecision.getExplanation()
-            );
+            assertEquals(Explanations.Allocation.AWAITING_INFO, allocateDecision.getExplanation());
         } else {
-            assertEquals("cannot allocate because allocation is not permitted to any of the nodes", allocateDecision.getExplanation());
+            assertEquals(Explanations.Allocation.ALL_NODES_FORBIDDEN, allocateDecision.getExplanation());
         }
         assertNull(allocateDecision.getAllocationId());
         assertNull(allocateDecision.getTargetNode());
@@ -399,12 +393,9 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             assertEquals("allocate_explanation", parser.currentName());
             parser.nextToken();
             if (allocationDecision.equals("awaiting_info")) {
-                assertEquals(
-                    "cannot allocate because information about existing shard data is still being retrieved " + "from some of the nodes",
-                    parser.text()
-                );
+                assertEquals(Explanations.Allocation.AWAITING_INFO, parser.text());
             } else {
-                assertEquals("cannot allocate because allocation is not permitted to any of the nodes", parser.text());
+                assertEquals(Explanations.Allocation.ALL_NODES_FORBIDDEN, parser.text());
             }
             Map<String, AllocationDecision> nodeDecisions = new HashMap<>();
             for (String nodeName : internalCluster().getNodeNames()) {
@@ -462,7 +453,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         assertTrue(allocateDecision.isDecisionTaken());
         assertFalse(moveDecision.isDecisionTaken());
         assertEquals(AllocationDecision.NO, allocateDecision.getAllocationDecision());
-        assertEquals("cannot allocate because allocation is not permitted to any of the nodes", allocateDecision.getExplanation());
+        assertEquals(Explanations.Allocation.ALL_NODES_FORBIDDEN, allocateDecision.getExplanation());
         assertNull(allocateDecision.getAllocationId());
         assertNull(allocateDecision.getTargetNode());
         assertEquals(0L, allocateDecision.getConfiguredDelayInMillis());
@@ -502,12 +493,9 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             assertEquals("allocate_explanation", parser.currentName());
             parser.nextToken();
             if (allocationDecision.equals("awaiting_info")) {
-                assertEquals(
-                    "cannot allocate because information about existing shard data is still being retrieved " + "from some of the nodes",
-                    parser.text()
-                );
+                assertEquals(Explanations.Allocation.AWAITING_INFO, parser.text());
             } else {
-                assertEquals("cannot allocate because allocation is not permitted to any of the nodes", parser.text());
+                assertEquals(Explanations.Allocation.ALL_NODES_FORBIDDEN, parser.text());
             }
             Map<String, AllocationDecision> nodeDecisions = new HashMap<>();
             for (String nodeName : internalCluster().getNodeNames()) {
@@ -563,10 +551,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         assertFalse(allocateDecision.isDecisionTaken());
         assertTrue(moveDecision.isDecisionTaken());
         assertEquals(AllocationDecision.NO, moveDecision.getAllocationDecision());
-        assertEquals(
-            "cannot move shard to another node, even though it is not allowed to remain on its current node",
-            moveDecision.getExplanation()
-        );
+        assertEquals(Explanations.Move.NO, moveDecision.getExplanation());
         assertFalse(moveDecision.canRemain());
         assertFalse(moveDecision.forceMove());
         assertFalse(moveDecision.canRebalanceCluster());
@@ -629,7 +614,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             parser.nextToken();
             assertEquals("move_explanation", parser.currentName());
             parser.nextToken();
-            assertEquals("cannot move shard to another node, even though it is not allowed to remain on its current node", parser.text());
+            assertEquals(Explanations.Move.NO, parser.text());
             verifyNodeDecisions(parser, allNodeDecisions(AllocationDecision.NO, true), includeYesDecisions, false);
             assertEquals(Token.END_OBJECT, parser.nextToken());
         }
@@ -685,10 +670,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         assertFalse(allocateDecision.isDecisionTaken());
         assertTrue(moveDecision.isDecisionTaken());
         assertEquals(AllocationDecision.NO, moveDecision.getAllocationDecision());
-        assertEquals(
-            "rebalancing is not allowed, even though there is at least one node on which the shard can be allocated",
-            moveDecision.getExplanation()
-        );
+        assertEquals(Explanations.Rebalance.CANNOT_REBALANCE_CAN_ALLOCATE, moveDecision.getExplanation());
         assertTrue(moveDecision.canRemain());
         assertFalse(moveDecision.forceMove());
         assertFalse(moveDecision.canRebalanceCluster());
@@ -744,10 +726,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             parser.nextToken();
             assertEquals("rebalance_explanation", parser.currentName());
             parser.nextToken();
-            assertEquals(
-                "rebalancing is not allowed, even though there is at least one node on which the shard can be allocated",
-                parser.text()
-            );
+            assertEquals(Explanations.Rebalance.CANNOT_REBALANCE_CAN_ALLOCATE, parser.text());
             verifyNodeDecisions(parser, allNodeDecisions(AllocationDecision.YES, true), includeYesDecisions, false);
             assertEquals(Token.END_OBJECT, parser.nextToken());
         }
@@ -803,10 +782,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         assertFalse(allocateDecision.isDecisionTaken());
         assertTrue(moveDecision.isDecisionTaken());
         assertEquals(AllocationDecision.NO, moveDecision.getAllocationDecision());
-        assertEquals(
-            "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance",
-            moveDecision.getExplanation()
-        );
+        assertEquals(Explanations.Rebalance.ALREADY_BALANCED, moveDecision.getExplanation());
         assertTrue(moveDecision.canRemain());
         assertFalse(moveDecision.forceMove());
         assertTrue(moveDecision.canRebalanceCluster());
@@ -854,10 +830,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             parser.nextToken();
             assertEquals("rebalance_explanation", parser.currentName());
             parser.nextToken();
-            assertEquals(
-                "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance",
-                parser.text()
-            );
+            assertEquals(Explanations.Rebalance.ALREADY_BALANCED, parser.text());
             verifyNodeDecisions(parser, allNodeDecisions(AllocationDecision.WORSE_BALANCE, true), includeYesDecisions, false);
             assertEquals(Token.END_OBJECT, parser.nextToken());
         }
@@ -913,10 +886,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         assertFalse(allocateDecision.isDecisionTaken());
         assertTrue(moveDecision.isDecisionTaken());
         assertEquals(AllocationDecision.NO, moveDecision.getAllocationDecision());
-        assertEquals(
-            "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance",
-            moveDecision.getExplanation()
-        );
+        assertEquals(Explanations.Rebalance.ALREADY_BALANCED, moveDecision.getExplanation());
         assertTrue(moveDecision.canRemain());
         assertFalse(moveDecision.forceMove());
         assertTrue(moveDecision.canRebalanceCluster());
@@ -972,10 +942,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             parser.nextToken();
             assertEquals("rebalance_explanation", parser.currentName());
             parser.nextToken();
-            assertEquals(
-                "cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance",
-                parser.text()
-            );
+            assertEquals(Explanations.Rebalance.ALREADY_BALANCED, parser.text());
             verifyNodeDecisions(parser, allNodeDecisions(AllocationDecision.NO, true), includeYesDecisions, false);
             assertEquals(Token.END_OBJECT, parser.nextToken());
         }
@@ -1027,7 +994,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         assertFalse(allocateDecision.isDecisionTaken());
         assertTrue(moveDecision.isDecisionTaken());
         assertEquals(AllocationDecision.NO, moveDecision.getAllocationDecision());
-        assertEquals("rebalancing is not allowed", moveDecision.getExplanation());
+        assertEquals(Explanations.Rebalance.CANNOT_REBALANCE_CANNOT_ALLOCATE, moveDecision.getExplanation());
         assertTrue(moveDecision.canRemain());
         assertFalse(moveDecision.forceMove());
         assertFalse(moveDecision.canRebalanceCluster());
@@ -1076,7 +1043,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             parser.nextToken();
             assertEquals("rebalance_explanation", parser.currentName());
             parser.nextToken();
-            assertEquals("rebalancing is not allowed", parser.text());
+            assertEquals(Explanations.Rebalance.CANNOT_REBALANCE_CANNOT_ALLOCATE, parser.text());
             verifyNodeDecisions(parser, allNodeDecisions(AllocationDecision.NO, false), includeYesDecisions, false);
             assertEquals(Token.END_OBJECT, parser.nextToken());
         }
@@ -1209,7 +1176,7 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
             parser.nextToken();
             assertEquals("allocate_explanation", parser.currentName());
             parser.nextToken();
-            assertEquals("cannot allocate because all found copies of the shard are either stale or corrupt", parser.text());
+            assertEquals(Explanations.Allocation.ALL_COPIES_INVALID, parser.text());
             verifyStaleShardCopyNodeDecisions(parser, 2, Collections.singleton(restartedNode));
         }
     }

+ 20 - 26
server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocateUnassignedDecision.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.util.Collections;
 import java.util.EnumMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
 
@@ -258,33 +259,26 @@ public class AllocateUnassignedDecision extends AbstractAllocationDecision {
     @Override
     public String getExplanation() {
         checkDecisionState();
-        AllocationDecision allocationDecision = getAllocationDecision();
-        if (allocationDecision == AllocationDecision.YES) {
-            return "can allocate the shard";
-        } else if (allocationDecision == AllocationDecision.THROTTLED) {
-            return "allocation temporarily throttled";
-        } else if (allocationDecision == AllocationDecision.AWAITING_INFO) {
-            return "cannot allocate because information about existing shard data is still being retrieved from some of the nodes";
-        } else if (allocationDecision == AllocationDecision.NO_VALID_SHARD_COPY) {
-            if (hasNodeWithStaleOrCorruptShard()) {
-                return "cannot allocate because all found copies of the shard are either stale or corrupt";
-            } else {
-                return "cannot allocate because a previous copy of the primary shard existed but can no longer be found on "
-                    + "the nodes in the cluster";
-            }
-        } else if (allocationDecision == AllocationDecision.ALLOCATION_DELAYED) {
-            return "cannot allocate because the cluster is still waiting "
-                + TimeValue.timeValueMillis(remainingDelayInMillis)
-                + " for the departed node holding a replica to rejoin"
-                + (atLeastOneNodeWithYesDecision() ? ", despite being allowed to allocate the shard to at least one other node" : "");
-        } else {
-            assert allocationDecision == AllocationDecision.NO;
-            if (reuseStore) {
-                return "cannot allocate because allocation is not permitted to any of the nodes that hold an in-sync shard copy";
-            } else {
-                return "cannot allocate because allocation is not permitted to any of the nodes";
+        return switch (getAllocationDecision()) {
+            case YES -> Explanations.Allocation.YES;
+            case THROTTLED -> Explanations.Allocation.THROTTLED;
+            case AWAITING_INFO -> Explanations.Allocation.AWAITING_INFO;
+            case NO_VALID_SHARD_COPY -> hasNodeWithStaleOrCorruptShard()
+                ? Explanations.Allocation.ALL_COPIES_INVALID
+                : Explanations.Allocation.NO_COPIES;
+            case ALLOCATION_DELAYED -> String.format(
+                Locale.ROOT,
+                atLeastOneNodeWithYesDecision()
+                    ? Explanations.Allocation.DELAYED_WITH_ALTERNATIVE
+                    : Explanations.Allocation.DELAYED_WITHOUT_ALTERNATIVE,
+                TimeValue.timeValueMillis(remainingDelayInMillis)
+            );
+            case NO -> reuseStore ? Explanations.Allocation.EXISTING_STORES_FORBIDDEN : Explanations.Allocation.ALL_NODES_FORBIDDEN;
+            case WORSE_BALANCE, NO_ATTEMPT -> {
+                assert false : getAllocationDecision();
+                yield getAllocationDecision().toString();
             }
-        }
+        };
     }
 
     private boolean hasNodeWithStaleOrCorruptShard() {

+ 109 - 0
server/src/main/java/org/elasticsearch/cluster/routing/allocation/Explanations.java

@@ -0,0 +1,109 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.routing.allocation;
+
+public final class Explanations {
+
+    public static final class Allocation {
+
+        public static final String YES = """
+            Elasticsearch can allocate the shard.""";
+
+        public static final String THROTTLED = """
+            Elasticsearch is currently busy with other activities. It expects to be able to allocate this shard when those activities \
+            finish. Please wait.""";
+
+        public static final String AWAITING_INFO = """
+            Elasticsearch is retrieving information about this shard from one or more nodes. It will make an allocation decision after it \
+            receives this information. Please wait.""";
+
+        public static final String ALL_COPIES_INVALID = """
+            Elasticsearch can't allocate this shard because all the copies of its data in the cluster are stale or corrupt. Elasticsearch \
+            will allocate this shard when a node containing a good copy of its data joins the cluster. If no such node is available, \
+            restore this index from a recent snapshot.""";
+
+        public static final String NO_COPIES = """
+            Elasticsearch can't allocate this shard because there are no copies of its data in the cluster. Elasticsearch will allocate \
+            this shard when a node holding a good copy of its data joins the cluster. If no such node is available, restore this index \
+            from a recent snapshot.""";
+
+        public static final String DELAYED_WITH_ALTERNATIVE = """
+            The node containing this shard copy recently left the cluster. Elasticsearch is waiting for it to return. If the node does not \
+            return within [%s] then Elasticsearch will allocate this shard to another node. Please wait.""";
+
+        public static final String DELAYED_WITHOUT_ALTERNATIVE = """
+            The node holding this shard copy recently left the cluster. Elasticsearch is waiting for it to return. If the node does not \
+            return within [%s] then Elasticsearch will attempt to allocate this shard to another node, but it cannot be allocated to any \
+            other node currently in the cluster. If you expect this shard to be allocated to another node, find this node in the \
+            node-by-node explanation and address the reasons which prevent Elasticsearch from allocating this shard there.""";
+
+        public static final String EXISTING_STORES_FORBIDDEN = """
+            Elasticsearch isn't allowed to allocate this shard to any of the nodes in the cluster that hold an in-sync copy of its data. \
+            Choose a node to which you expect this shard to be allocated, find this node in the node-by-node explanation, and address the \
+            reasons which prevent Elasticsearch from allocating this shard there.""";
+
+        public static final String ALL_NODES_FORBIDDEN = """
+            Elasticsearch isn't allowed to allocate this shard to any of the nodes in the cluster. Choose a node to which you expect this \
+            shard to be allocated, find this node in the node-by-node explanation, and address the reasons which prevent Elasticsearch \
+            from allocating this shard there.""";
+
+    }
+
+    public static final class Rebalance {
+
+        public static final String YES = """
+            Elasticsearch can rebalance this shard to another node.""";
+
+        public static final String ALREADY_BALANCED = """
+            This shard is in a well-balanced location and satisfies all allocation rules so it will remain on this node. Elasticsearch \
+            cannot improve the cluster balance by moving it to another node. If you expect this shard to be rebalanced to another node, \
+            find the other node in the node-by-node explanation and address the reasons which prevent Elasticsearch from rebalancing this \
+            shard there.""";
+
+        public static final String AWAITING_INFO = """
+            Elasticsearch is currently retrieving information about this shard from one or more nodes. It will make a rebalancing decision \
+            after it receives this information. Please wait.""";
+
+        public static final String CLUSTER_THROTTLE = """
+            Elasticsearch is currently busy with other activities. It will rebalance this shard when those activities finish. Please \
+            wait.""";
+
+        public static final String NODE_THROTTLE = """
+            Elasticsearch is attempting to rebalance this shard to another node, but the nodes involved are currently busy with other \
+            activities. The shard will be rebalanced when those activities finish. Please wait.""";
+
+        public static final String CANNOT_REBALANCE_CAN_ALLOCATE = """
+            Elasticsearch is allowed to allocate this shard to another node but it isn't allowed to rebalance the shard there. If you \
+            expect this shard to be rebalanced to another node, find this node in the node-by-node explanation and address the reasons \
+            which prevent Elasticsearch from rebalancing this shard there.""";
+
+        public static final String CANNOT_REBALANCE_CANNOT_ALLOCATE = """
+            Elasticsearch is not allowed to allocate or rebalance this shard to another node. If you expect this shard to be rebalanced to \
+            another node, find this node in the node-by-node explanation and address the reasons which prevent Elasticsearch from \
+            rebalancing this shard there.""";
+
+    }
+
+    public static final class Move {
+
+        public static final String YES = """
+            This shard may not remain on its current node. Elasticsearch will move it to another node.""";
+
+        public static final String THROTTLED = """
+            This shard may not remain on its current node. Elasticsearch is currently busy with other activities and will move this shard \
+            to another node when those activities finish. Please wait.""";
+
+        public static final String NO = """
+            This shard may not remain on its current node, but Elasticsearch isn't allowed to move it to another node. Choose a node to \
+            which you expect this shard to be allocated, find this node in the node-by-node explanation, and address the reasons which \
+            prevent Elasticsearch from allocating this shard there.""";
+
+    }
+
+}

+ 25 - 28
server/src/main/java/org/elasticsearch/cluster/routing/allocation/MoveDecision.java

@@ -245,44 +245,41 @@ public final class MoveDecision extends AbstractAllocationDecision {
     @Override
     public String getExplanation() {
         checkDecisionState();
-        String explanation;
         if (clusterRebalanceDecision != null) {
             // it was a decision to rebalance the shard, because the shard was allowed to remain on its current node
             if (allocationDecision == AllocationDecision.AWAITING_INFO) {
-                explanation = "cannot rebalance as information about existing copies of this shard in the cluster is still being gathered";
-            } else if (clusterRebalanceDecision.type() == Type.NO) {
-                explanation = "rebalancing is not allowed"
-                    + (atLeastOneNodeWithYesDecision()
-                        ? ", even though there " + "is at least one node on which the shard can be allocated"
-                        : "");
-            } else if (clusterRebalanceDecision.type() == Type.THROTTLE) {
-                explanation = "rebalancing is throttled";
-            } else {
-                assert clusterRebalanceDecision.type() == Type.YES;
-                if (getTargetNode() != null) {
-                    if (allocationDecision == AllocationDecision.THROTTLED) {
-                        explanation = "shard rebalancing throttled";
+                return Explanations.Rebalance.AWAITING_INFO;
+            }
+            return switch (clusterRebalanceDecision.type()) {
+                case NO -> atLeastOneNodeWithYesDecision()
+                    ? Explanations.Rebalance.CANNOT_REBALANCE_CAN_ALLOCATE
+                    : Explanations.Rebalance.CANNOT_REBALANCE_CANNOT_ALLOCATE;
+                case THROTTLE -> Explanations.Rebalance.CLUSTER_THROTTLE;
+                case YES -> {
+                    if (getTargetNode() != null) {
+                        if (allocationDecision == AllocationDecision.THROTTLED) {
+                            yield Explanations.Rebalance.NODE_THROTTLE;
+                        } else {
+                            yield Explanations.Rebalance.YES;
+                        }
                     } else {
-                        explanation = "can rebalance shard";
+                        yield Explanations.Rebalance.ALREADY_BALANCED;
                     }
-                } else {
-                    explanation = "cannot rebalance as no target node exists that can both allocate this shard "
-                        + "and improve the cluster balance";
                 }
-            }
+            };
         } else {
             // it was a decision to force move the shard
             assert canRemain() == false;
-            if (allocationDecision == AllocationDecision.YES) {
-                explanation = "shard cannot remain on this node and is force-moved to another node";
-            } else if (allocationDecision == AllocationDecision.THROTTLED) {
-                explanation = "shard cannot remain on this node but is throttled on moving to another node";
-            } else {
-                assert allocationDecision == AllocationDecision.NO;
-                explanation = "cannot move shard to another node, even though it is not allowed to remain on its current node";
-            }
+            return switch (allocationDecision) {
+                case YES -> Explanations.Move.YES;
+                case THROTTLED -> Explanations.Move.THROTTLED;
+                case NO -> Explanations.Move.NO;
+                case WORSE_BALANCE, AWAITING_INFO, ALLOCATION_DELAYED, NO_VALID_SHARD_COPY, NO_ATTEMPT -> {
+                    assert false : allocationDecision;
+                    yield allocationDecision.toString();
+                }
+            };
         }
-        return explanation;
     }
 
     @Override

+ 32 - 22
server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplanationTests.java

@@ -16,6 +16,7 @@ import org.elasticsearch.cluster.routing.TestShardRouting;
 import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.cluster.routing.allocation.AllocateUnassignedDecision;
 import org.elasticsearch.cluster.routing.allocation.AllocationDecision;
+import org.elasticsearch.cluster.routing.allocation.Explanations;
 import org.elasticsearch.cluster.routing.allocation.MoveDecision;
 import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision;
 import org.elasticsearch.cluster.routing.allocation.decider.Decision;
@@ -98,9 +99,8 @@ public final class ClusterAllocationExplanationTests extends ESTestCase {
               "can_remain_on_current_node": "yes",
               "can_rebalance_cluster": "yes",
               "can_rebalance_to_other_node": "no",
-              "rebalance_explanation": "cannot rebalance as no target node exists that can both allocate this shard \
-            and improve the cluster balance"
-            }""".formatted(cae.getCurrentNode().getAddress())), Strings.toString(builder));
+              "rebalance_explanation": "%s"
+            }""".formatted(cae.getCurrentNode().getAddress(), Explanations.Rebalance.ALREADY_BALANCED)), Strings.toString(builder));
     }
 
     public void testRandomShardExplanationToXContent() throws Exception {
@@ -108,25 +108,35 @@ public final class ClusterAllocationExplanationTests extends ESTestCase {
         XContentBuilder builder = XContentFactory.jsonBuilder();
         cae.toXContent(builder, ToXContent.EMPTY_PARAMS);
         final String actual = Strings.toString(builder);
-        assertThat(actual, equalTo(XContentHelper.stripWhitespace("""
-            {
-              "note": "%s",
-              "index": "idx",
-              "shard": 0,
-              "primary": true,
-              "current_state": "started",
-              "current_node": {
-                "id": "node-0",
-                "name": "",
-                "transport_address": "%s",
-                "weight_ranking": 3
-              },
-              "can_remain_on_current_node": "yes",
-              "can_rebalance_cluster": "yes",
-              "can_rebalance_to_other_node": "no",
-              "rebalance_explanation": "cannot rebalance as no target node exists that can both allocate this shard \
-            and improve the cluster balance"
-            }""".formatted(ClusterAllocationExplanation.NO_SHARD_SPECIFIED_MESSAGE, cae.getCurrentNode().getAddress()))));
+        assertThat(
+            actual,
+            equalTo(
+                XContentHelper.stripWhitespace(
+                    """
+                        {
+                          "note": "%s",
+                          "index": "idx",
+                          "shard": 0,
+                          "primary": true,
+                          "current_state": "started",
+                          "current_node": {
+                            "id": "node-0",
+                            "name": "",
+                            "transport_address": "%s",
+                            "weight_ranking": 3
+                          },
+                          "can_remain_on_current_node": "yes",
+                          "can_rebalance_cluster": "yes",
+                          "can_rebalance_to_other_node": "no",
+                          "rebalance_explanation": "%s"
+                        }""".formatted(
+                        ClusterAllocationExplanation.NO_SHARD_SPECIFIED_MESSAGE,
+                        cae.getCurrentNode().getAddress(),
+                        Explanations.Rebalance.ALREADY_BALANCED
+                    )
+                )
+            )
+        );
         assertThat(
             actual,
             allOf(

+ 20 - 22
server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocateUnassignedDecisionTests.java

@@ -19,30 +19,31 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Locale;
 import java.util.stream.Collectors;
 
 import static java.util.Collections.emptyList;
 import static java.util.Collections.emptyMap;
 import static java.util.Collections.emptySet;
-import static org.hamcrest.Matchers.startsWith;
+import static org.hamcrest.Matchers.equalTo;
 
 /**
  * Unit tests for the {@link AllocateUnassignedDecision} class.
  */
 public class AllocateUnassignedDecisionTests extends ESTestCase {
 
-    private DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
-    private DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
+    private final DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
+    private final DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
 
     public void testDecisionNotTaken() {
         AllocateUnassignedDecision allocateUnassignedDecision = AllocateUnassignedDecision.NOT_TAKEN;
         assertFalse(allocateUnassignedDecision.isDecisionTaken());
-        expectThrows(IllegalStateException.class, () -> allocateUnassignedDecision.getAllocationDecision());
-        expectThrows(IllegalStateException.class, () -> allocateUnassignedDecision.getAllocationStatus());
-        expectThrows(IllegalStateException.class, () -> allocateUnassignedDecision.getAllocationId());
-        expectThrows(IllegalStateException.class, () -> allocateUnassignedDecision.getTargetNode());
-        expectThrows(IllegalStateException.class, () -> allocateUnassignedDecision.getNodeDecisions());
-        expectThrows(IllegalStateException.class, () -> allocateUnassignedDecision.getExplanation());
+        expectThrows(IllegalStateException.class, allocateUnassignedDecision::getAllocationDecision);
+        expectThrows(IllegalStateException.class, allocateUnassignedDecision::getAllocationStatus);
+        expectThrows(IllegalStateException.class, allocateUnassignedDecision::getAllocationId);
+        expectThrows(IllegalStateException.class, allocateUnassignedDecision::getTargetNode);
+        expectThrows(IllegalStateException.class, allocateUnassignedDecision::getNodeDecisions);
+        expectThrows(IllegalStateException.class, allocateUnassignedDecision::getExplanation);
     }
 
     public void testNoDecision() {
@@ -56,14 +57,14 @@ public class AllocateUnassignedDecisionTests extends ESTestCase {
         assertEquals(AllocationDecision.fromAllocationStatus(allocationStatus), noDecision.getAllocationDecision());
         assertEquals(allocationStatus, noDecision.getAllocationStatus());
         if (allocationStatus == AllocationStatus.FETCHING_SHARD_DATA) {
-            assertEquals(
-                "cannot allocate because information about existing shard data is still being retrieved from " + "some of the nodes",
-                noDecision.getExplanation()
-            );
+            assertEquals(Explanations.Allocation.AWAITING_INFO, noDecision.getExplanation());
         } else if (allocationStatus == AllocationStatus.DELAYED_ALLOCATION) {
-            assertThat(noDecision.getExplanation(), startsWith("cannot allocate because the cluster is still waiting"));
+            assertThat(
+                noDecision.getExplanation(),
+                equalTo(String.format(Locale.ROOT, Explanations.Allocation.DELAYED_WITHOUT_ALTERNATIVE, "0s"))
+            );
         } else {
-            assertThat(noDecision.getExplanation(), startsWith("cannot allocate because a previous copy of the primary shard existed"));
+            assertThat(noDecision.getExplanation(), equalTo(Explanations.Allocation.NO_COPIES));
         }
         assertNull(noDecision.getNodeDecisions());
         assertNull(noDecision.getTargetNode());
@@ -78,12 +79,9 @@ public class AllocateUnassignedDecisionTests extends ESTestCase {
         assertEquals(AllocationDecision.NO, noDecision.getAllocationDecision());
         assertEquals(AllocationStatus.DECIDERS_NO, noDecision.getAllocationStatus());
         if (reuseStore) {
-            assertEquals(
-                "cannot allocate because allocation is not permitted to any of the nodes that hold an in-sync shard copy",
-                noDecision.getExplanation()
-            );
+            assertEquals(Explanations.Allocation.EXISTING_STORES_FORBIDDEN, noDecision.getExplanation());
         } else {
-            assertEquals("cannot allocate because allocation is not permitted to any of the nodes", noDecision.getExplanation());
+            assertEquals(Explanations.Allocation.ALL_NODES_FORBIDDEN, noDecision.getExplanation());
         }
         assertEquals(nodeDecisions.stream().sorted().collect(Collectors.toList()), noDecision.getNodeDecisions());
         // node1 should be sorted first b/c of better weight ranking
@@ -103,7 +101,7 @@ public class AllocateUnassignedDecisionTests extends ESTestCase {
         assertTrue(throttleDecision.isDecisionTaken());
         assertEquals(AllocationDecision.THROTTLED, throttleDecision.getAllocationDecision());
         assertEquals(AllocationStatus.DECIDERS_THROTTLED, throttleDecision.getAllocationStatus());
-        assertThat(throttleDecision.getExplanation(), startsWith("allocation temporarily throttled"));
+        assertThat(throttleDecision.getExplanation(), equalTo(Explanations.Allocation.THROTTLED));
         assertEquals(nodeDecisions.stream().sorted().collect(Collectors.toList()), throttleDecision.getNodeDecisions());
         // node2 should be sorted first b/c a THROTTLE is higher than a NO decision
         assertEquals("node2", throttleDecision.getNodeDecisions().iterator().next().getNode().getId());
@@ -120,7 +118,7 @@ public class AllocateUnassignedDecisionTests extends ESTestCase {
         assertTrue(yesDecision.isDecisionTaken());
         assertEquals(AllocationDecision.YES, yesDecision.getAllocationDecision());
         assertNull(yesDecision.getAllocationStatus());
-        assertEquals("can allocate the shard", yesDecision.getExplanation());
+        assertEquals(Explanations.Allocation.YES, yesDecision.getExplanation());
         assertEquals(nodeDecisions.stream().sorted().collect(Collectors.toList()), yesDecision.getNodeDecisions());
         assertEquals("node2", yesDecision.getTargetNode().getId());
         assertEquals(allocId, yesDecision.getAllocationId());

+ 4 - 14
server/src/test/java/org/elasticsearch/cluster/routing/allocation/AllocationServiceTests.java

@@ -278,25 +278,15 @@ public class AllocationServiceTests extends ESTestCase {
             equalTo(UnassignedInfo.AllocationStatus.NO_VALID_SHARD_COPY)
         );
         assertThat(shardAllocationDecision.getAllocateDecision().getAllocationDecision(), equalTo(AllocationDecision.NO_VALID_SHARD_COPY));
-        assertThat(
-            shardAllocationDecision.getAllocateDecision().getExplanation(),
-            equalTo(
-                "cannot allocate because a previous copy of "
-                    + "the primary shard existed but can no longer be found on the nodes in the cluster"
-            )
-        );
+        assertThat(shardAllocationDecision.getAllocateDecision().getExplanation(), equalTo(Explanations.Allocation.NO_COPIES));
 
         for (NodeAllocationResult nodeAllocationResult : shardAllocationDecision.getAllocateDecision().nodeDecisions) {
             assertThat(nodeAllocationResult.getNodeDecision(), equalTo(AllocationDecision.NO));
             assertThat(nodeAllocationResult.getCanAllocateDecision().type(), equalTo(Decision.Type.NO));
             assertThat(nodeAllocationResult.getCanAllocateDecision().label(), equalTo("allocator_plugin"));
-            assertThat(
-                nodeAllocationResult.getCanAllocateDecision().getExplanation(),
-                equalTo(
-                    "finding the previous copies of this shard requires an allocator called [unknown] but that allocator "
-                        + "was not found; perhaps the corresponding plugin is not installed"
-                )
-            );
+            assertThat(nodeAllocationResult.getCanAllocateDecision().getExplanation(), equalTo("""
+                finding the previous copies of this shard requires an allocator called [unknown] but that allocator was not found; \
+                perhaps the corresponding plugin is not installed"""));
         }
     }
 

+ 8 - 10
server/src/test/java/org/elasticsearch/cluster/routing/allocation/BalancedSingleShardTests.java

@@ -36,9 +36,9 @@ import java.util.Set;
 
 import static java.util.Collections.emptySet;
 import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.lessThan;
-import static org.hamcrest.Matchers.startsWith;
 
 /**
  * Tests for balancing a single shard, see {@link Balancer#decideRebalance(ShardRouting)}.
@@ -69,10 +69,7 @@ public class BalancedSingleShardTests extends ESAllocationTestCase {
         MoveDecision rebalanceDecision = allocator.decideShardAllocation(shard, routingAllocation).getMoveDecision();
         assertNotNull(rebalanceDecision.getClusterRebalanceDecision());
         assertEquals(AllocationDecision.AWAITING_INFO, rebalanceDecision.getAllocationDecision());
-        assertThat(
-            rebalanceDecision.getExplanation(),
-            startsWith("cannot rebalance as information about existing copies of this shard in the cluster is still being gathered")
-        );
+        assertThat(rebalanceDecision.getExplanation(), equalTo(Explanations.Rebalance.AWAITING_INFO));
         assertEquals(clusterState.nodes().getSize() - 1, rebalanceDecision.getNodeDecisions().size());
         assertNull(rebalanceDecision.getTargetNode());
 
@@ -99,7 +96,11 @@ public class BalancedSingleShardTests extends ESAllocationTestCase {
         assertEquals(AllocationDecision.fromDecisionType(canRebalanceDecision.type()), rebalanceDecision.getAllocationDecision());
         assertThat(
             rebalanceDecision.getExplanation(),
-            containsString(canRebalanceDecision.type() == Type.THROTTLE ? "rebalancing is throttled" : "rebalancing is not allowed")
+            containsString(
+                canRebalanceDecision.type() == Type.THROTTLE
+                    ? Explanations.Rebalance.CLUSTER_THROTTLE
+                    : Explanations.Rebalance.CANNOT_REBALANCE_CANNOT_ALLOCATE
+            )
         );
         assertNotNull(rebalanceDecision.getNodeDecisions());
         assertNull(rebalanceDecision.getTargetNode());
@@ -138,10 +139,7 @@ public class BalancedSingleShardTests extends ESAllocationTestCase {
         MoveDecision rebalanceDecision = rebalance.v2();
         assertEquals(Type.YES, rebalanceDecision.getClusterRebalanceDecision().type());
         assertEquals(AllocationDecision.NO, rebalanceDecision.getAllocationDecision());
-        assertThat(
-            rebalanceDecision.getExplanation(),
-            startsWith("cannot rebalance as no target node exists that can both allocate this shard and improve the cluster balance")
-        );
+        assertThat(rebalanceDecision.getExplanation(), equalTo(Explanations.Rebalance.ALREADY_BALANCED));
         assertEquals(clusterState.nodes().getSize() - 1, rebalanceDecision.getNodeDecisions().size());
         assertNull(rebalanceDecision.getTargetNode());
         int prevRanking = 0;