Browse Source

Add node "roles" to allocation explain response (#98550)

Report node "roles" in the /_cluster/allocation/explain response.
Nodes with limited sets of roles may affect shard distribution in ways
users did not originally consider, so it is helpful to surface this
information along with node allocation decision explanations.
Dianna Hohensee 2 years ago
parent
commit
a25e176692

+ 5 - 0
docs/changelog/98550.yaml

@@ -0,0 +1,5 @@
+pr: 98550
+summary: Report a node's "roles" setting in the /_cluster/allocation/explain response
+area: Allocation
+type: enhancement
+issues: [97859]

+ 17 - 9
docs/reference/cluster/allocation-explain.asciidoc

@@ -41,19 +41,19 @@ explanations for shard allocations in the cluster. For unassigned shards,
 the explain API provides an explanation for why the shard is unassigned.
 For assigned shards, the explain API provides an explanation for why the
 shard is remaining on its current node and has not moved or rebalanced to
-another node. This API can be very useful when attempting to diagnose why a 
-shard is unassigned or why a shard continues to remain on its current node when 
+another node. This API can be very useful when attempting to diagnose why a
+shard is unassigned or why a shard continues to remain on its current node when
 you might expect otherwise.
 
 [[cluster-allocation-explain-api-query-params]]
 ==== {api-query-parms-title}
 
 `include_disk_info`::
-    (Optional, Boolean) If `true`, returns information about disk usage and 
+    (Optional, Boolean) If `true`, returns information about disk usage and
     shard sizes. Defaults to `false`.
-    
+
 `include_yes_decisions`::
-    (Optional, Boolean) If `true`, returns 'YES' decisions in explanation. 
+    (Optional, Boolean) If `true`, returns 'YES' decisions in explanation.
     Defaults to `false`.
 
 [[cluster-allocation-explain-api-request-body]]
@@ -65,15 +65,15 @@ you might expect otherwise.
     parameter.
 
 `index`::
-    (Optional, string) Specifies the name of the index that you would like an 
+    (Optional, string) Specifies the name of the index that you would like an
     explanation for.
 
 `primary`::
-    (Optional, Boolean) If `true`, returns explanation for the primary shard 
+    (Optional, Boolean) If `true`, returns explanation for the primary shard
     for the given shard ID.
 
 `shard`::
-    (Optional, integer) Specifies the ID of the shard that you would like an 
+    (Optional, integer) Specifies the ID of the shard that you would like an
     explanation for.
 
 [[cluster-allocation-explain-api-examples]]
@@ -130,6 +130,7 @@ node.
       "node_id" : "8qt2rY-pT6KNZB3-hGfLnw",
       "node_name" : "node-0",
       "transport_address" : "127.0.0.1:9401",
+      "roles" : ["data", "data_cold", "data_content", "data_frozen", "data_hot", "data_warm", "ingest", "master", "ml", "remote_cluster_client", "transform"],
       "node_attributes" : {},
       "node_decision" : "no",                     <4>
       "weight_ranking" : 1,
@@ -147,6 +148,7 @@ node.
 // TESTRESPONSE[s/"at" : "[^"]*"/"at" : $body.$_path/]
 // TESTRESPONSE[s/"node_id" : "[^"]*"/"node_id" : $body.$_path/]
 // TESTRESPONSE[s/"transport_address" : "[^"]*"/"transport_address" : $body.$_path/]
+// TESTRESPONSE[s/"roles" : \[("[a-z_]*",)*("[a-z_]*")\]/"roles" : $body.$_path/]
 // TESTRESPONSE[s/"node_attributes" : \{\}/"node_attributes" : $body.$_path/]
 
 <1> The current state of the shard.
@@ -207,12 +209,14 @@ unassigned due to <<delayed-allocation,delayed allocation>>.
       "node_id" : "pmnHu_ooQWCPEFobZGbpWw",
       "node_name" : "node_t2",
       "transport_address" : "127.0.0.1:9402",
+      "roles" : ["data_content", "data_hot"],
       "node_decision" : "yes"
     },
     {
       "node_id" : "3sULLVJrRneSg0EfBB-2Ew",
       "node_name" : "node_t0",
       "transport_address" : "127.0.0.1:9400",
+      "roles" : ["data_content", "data_hot"],
       "node_decision" : "no",
       "store" : {                                 <3>
         "matching_size" : "4.2kb",
@@ -251,7 +255,8 @@ and must be reallocated.
   "current_node" : {
     "id" : "8lWJeJ7tSoui0bxrwuNhTA",
     "name" : "node_t1",
-    "transport_address" : "127.0.0.1:9401"
+    "transport_address" : "127.0.0.1:9401",
+    "roles" : ["data_content", "data_hot"]
   },
   "can_remain_on_current_node" : "no",            <1>
   "can_remain_decisions" : [                      <2>
@@ -268,6 +273,7 @@ and must be reallocated.
       "node_id" : "_P8olZS8Twax9u6ioN-GGA",
       "node_name" : "node_t0",
       "transport_address" : "127.0.0.1:9400",
+      "roles" : ["data_content", "data_hot"],
       "node_decision" : "no",
       "weight_ranking" : 1,
       "deciders" : [
@@ -302,6 +308,7 @@ cluster balance.
     "id" : "wLzJm4N4RymDkBYxwWoJsg",
     "name" : "node_t0",
     "transport_address" : "127.0.0.1:9400",
+    "roles" : ["data_content", "data_hot"],
     "weight_ranking" : 1
   },
   "can_remain_on_current_node" : "yes",
@@ -313,6 +320,7 @@ cluster balance.
       "node_id" : "oE3EGFc8QN-Tdi5FFEprIA",
       "node_name" : "node_t1",
       "transport_address" : "127.0.0.1:9401",
+      "roles" : ["data_content", "data_hot"],
       "node_decision" : "worse_balance",          <3>
       "weight_ranking" : 1
     }

+ 6 - 4
docs/reference/tab-widgets/troubleshooting/data/diagnose-unassigned-shards.asciidoc

@@ -20,12 +20,12 @@ In order to diagnose the unassigned shards, follow the next steps:
 . Log in to the {ess-console}[{ecloud} console].
 +
 
-. On the **Elasticsearch Service** panel, click the name of your deployment. 
+. On the **Elasticsearch Service** panel, click the name of your deployment.
 +
 
 NOTE: If the name of your deployment is disabled your {kib} instances might be
 unhealthy, in which case please contact https://support.elastic.co[Elastic Support].
-If your deployment doesn't include {kib}, all you need to do is 
+If your deployment doesn't include {kib}, all you need to do is
 {cloud}/ec-access-kibana.html[enable it first].
 
 . Open your deployment's side navigation menu (placed under the Elastic logo in the upper left corner)
@@ -106,6 +106,7 @@ The response will look like this:
       "node_id" : "8qt2rY-pT6KNZB3-hGfLnw",
       "node_name" : "node-0",
       "transport_address" : "127.0.0.1:9401",
+      "roles": ["data_content", "data_hot"],
       "node_attributes" : {},
       "node_decision" : "no",                     <4>
       "weight_ranking" : 1,
@@ -151,7 +152,7 @@ settings>> and <<cluster-update-settings,cluster update settings>> APIs to the
 correct values in order to allow the index to be allocated.
 
 For more guidance on fixing the most common causes for unassigned shards please follow
-<<fix-red-yellow-cluster-status, this guide>> or contact https://support.elastic.co[Elastic Support]. 
+<<fix-red-yellow-cluster-status, this guide>> or contact https://support.elastic.co[Elastic Support].
 
 //end::kibana-api-ex[]
 // end::cloud[]
@@ -231,6 +232,7 @@ The response will look like this:
       "node_id" : "8qt2rY-pT6KNZB3-hGfLnw",
       "node_name" : "node-0",
       "transport_address" : "127.0.0.1:9401",
+      "roles": ["data_content", "data_hot"],
       "node_attributes" : {},
       "node_decision" : "no",                     <4>
       "weight_ranking" : 1,
@@ -276,7 +278,7 @@ settings>> and <<cluster-update-settings,cluster update settings>> APIs to the
 correct values in order to allow the index to be allocated.
 
 For more guidance on fixing the most common causes for unassigned shards please follow
-<<fix-red-yellow-cluster-status, this guide>>. 
+<<fix-red-yellow-cluster-status, this guide>>.
 
 // end::self-managed[]
 

+ 18 - 0
rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.allocation_explain/10_basic.yml

@@ -92,3 +92,21 @@
   - is_true: can_rebalance_cluster
   - is_true: can_rebalance_to_other_node
   - is_true: rebalance_explanation
+
+---
+"Cluster allocation explanation response includes node's roles":
+  - skip:
+      version: " - 8.10.99"
+      reason: The roles field was introduced in 8.11.0
+
+  - do:
+      indices.create:
+        index: test
+
+  - match: { acknowledged: true }
+
+  - do:
+      cluster.allocation_explain:
+        body: { "index": "test", "shard": 0, "primary": true }
+
+  - is_true: current_node.roles

+ 69 - 0
server/src/internalClusterTest/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java

@@ -15,6 +15,7 @@ import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus;
@@ -1143,6 +1144,67 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         }
     }
 
+    public void testExplainRolesOutput() throws Exception {
+        logger.info("--> Starting first node with \"roles\": [\"master\", \"data_hot\", \"ingest\"]");
+        List<String> firstNodeRoles = List.of("data_hot", "ingest", "master");
+        Settings firstNodeSettings = Settings.builder().putList("node.roles", firstNodeRoles).build();
+        internalCluster().startNode(firstNodeSettings);
+
+        logger.info("--> Creating an index on the first node");
+        prepareIndex(1, 0);
+
+        logger.info("--> Starting a second node, which won't have the index, with \"roles\": [\"data_cold\", \"data_frozen\"]");
+        List<String> secondNodeRoles = List.of("data_cold", "data_frozen");
+        Settings secondNodeSettings = Settings.builder().putList("node.roles", secondNodeRoles).build();
+        internalCluster().startNode(secondNodeSettings);
+
+        boolean includeYesDecisions = randomBoolean();
+        boolean includeDiskInfo = randomBoolean();
+        ClusterAllocationExplanation explanation = runExplain(true, includeYesDecisions, includeDiskInfo);
+
+        assertEquals(
+            Set.of(DiscoveryNodeRole.DATA_HOT_NODE_ROLE, DiscoveryNodeRole.INGEST_ROLE, DiscoveryNodeRole.MASTER_ROLE),
+            explanation.getCurrentNode().getRoles()
+        );
+
+        try (XContentParser parser = getParser(explanation)) {
+            // Fast-forward to the "current_node" object, which contains "roles".
+            do {
+                parser.nextToken();
+                assertNotEquals(Token.END_OBJECT, parser.currentToken());
+                // START_OBJECT has a null currentName(), so check for that before de-referencing.
+            } while (parser.currentName() == null || (parser.currentName().equals("current_node")) == false);
+            assertEquals(Token.START_OBJECT, parser.nextToken());
+
+            // Fast-forward to "roles" field in the "current_node" object.
+            do {
+                parser.nextToken();
+                assertNotEquals(Token.END_OBJECT, parser.currentToken());
+            } while ((parser.currentName().equals("roles")) == false);
+
+            // Check that the "roles" reported are those explicitly set via Settings for the first node, which possesses the shard.
+            // Note: list() implicitly consumes the parser START_ARRAY and END_ARRAY tokens.
+            assertEquals(firstNodeRoles, parser.list());
+
+            // Fast-forward to the "node_allocation_decisions" array, whose per-node entries contain "roles".
+            do {
+                parser.nextToken();
+                // START_OBJECT has a null currentName(), so check for that before de-referencing.
+            } while (parser.currentName() == null || (parser.currentName().equals("node_allocation_decisions")) == false);
+            assertEquals(Token.START_ARRAY, parser.nextToken());
+            assertEquals(Token.START_OBJECT, parser.nextToken());
+
+            // Fast-forward to the "roles" field in the first entry of the "node_allocation_decisions" array.
+            do {
+                parser.nextToken();
+                assertNotEquals(Token.END_OBJECT, parser.currentToken());
+            } while ((parser.currentName().equals("roles")) == false);
+
+            // Check that the "roles" reported are those explicitly set via Settings for the second node, which does not possess the shard.
+            assertEquals(secondNodeRoles, parser.list());
+        }
+    }
+
     private void verifyClusterInfo(ClusterInfo clusterInfo, boolean includeDiskInfo, int numNodes) {
         if (includeDiskInfo) {
             assertThat(clusterInfo.getNodeMostAvailableDiskUsages().size(), greaterThanOrEqualTo(0));
@@ -1302,8 +1364,11 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
                         parser.currentName().equals("id")
                             || parser.currentName().equals("name")
                             || parser.currentName().equals("transport_address")
+                            || parser.currentName().equals("roles")
                             || parser.currentName().equals("weight_ranking")
                     );
+                } else if (token == Token.START_ARRAY || token == Token.END_ARRAY) {
+                    assertEquals("roles", parser.currentName());
                 } else {
                     assertTrue(token.isValue());
                     assertNotNull(parser.text());
@@ -1429,6 +1494,10 @@ public final class ClusterAllocationExplainIT extends ESIntegTestCase {
         parser.nextToken();
         assertNotNull(parser.text());
         parser.nextToken();
+        assertEquals("roles", parser.currentName());
+        parser.nextToken();
+        assertNotEquals(0, parser.list().size());
+        parser.nextToken();
         assertEquals("node_decision", parser.currentName());
         parser.nextToken();
         return nodeName;

+ 6 - 0
server/src/main/java/org/elasticsearch/cluster/routing/allocation/AbstractAllocationDecision.java

@@ -9,6 +9,7 @@
 package org.elasticsearch.cluster.routing.allocation;
 
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.routing.allocation.decider.Decision.Type;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -110,6 +111,11 @@ public abstract class AbstractAllocationDecision implements ToXContentFragment,
             }
             builder.endObject();
         }
+        builder.startArray("roles");
+        for (DiscoveryNodeRole role : node.getRoles()) {
+            builder.value(role.roleName());
+        }
+        builder.endArray();
         return builder;
     }
 

+ 4 - 1
server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainActionTests.java

@@ -35,6 +35,7 @@ import org.elasticsearch.xcontent.XContentFactory;
 import java.time.Instant;
 import java.util.Collections;
 import java.util.Locale;
+import java.util.stream.Collectors;
 
 import static org.elasticsearch.action.admin.cluster.allocation.TransportClusterAllocationExplainAction.findShardToExplain;
 import static org.hamcrest.Matchers.allOf;
@@ -109,6 +110,7 @@ public class ClusterAllocationExplainActionTests extends ESTestCase {
             cae.getCurrentNode().getId(),
             cae.getCurrentNode().getName(),
             cae.getCurrentNode().getAddress(),
+            cae.getCurrentNode().getRoles().stream().map(r -> '"' + r.roleName() + '"').collect(Collectors.joining(", ", "[", "]")),
             explanation };
         assertEquals(XContentHelper.stripWhitespace(Strings.format("""
             {
@@ -120,7 +122,8 @@ public class ClusterAllocationExplainActionTests extends ESTestCase {
               "current_node": {
                 "id": "%s",
                 "name": "%s",
-                "transport_address": "%s"
+                "transport_address": "%s",
+                "roles": %s
               },
               "explanation": "%s"
             }""", args)), Strings.toString(builder));

+ 2 - 0
server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplanationTests.java

@@ -93,6 +93,7 @@ public final class ClusterAllocationExplanationTests extends ESTestCase {
                 "id": "node-0",
                 "name": "",
                 "transport_address": "%s",
+                "roles": [],
                 "weight_ranking": 3
               },
               "can_remain_on_current_node": "yes",
@@ -123,6 +124,7 @@ public final class ClusterAllocationExplanationTests extends ESTestCase {
                                 "id": "node-0",
                                 "name": "",
                                 "transport_address": "%s",
+                                "roles": [],
                                 "weight_ranking": 3
                               },
                               "can_remain_on_current_node": "yes",