Преглед изворни кода

Return a default user action if no actions could be determined (#87079)

If a shard is unassigned for a reason other than NO_VALID_SHARD_COPY or the deciders returning 
a NO response, then no user actions are returned. If the deciders return no, but we cannot determine 
a root cause in the indicator we return a default user action. If we could not generate any user actions 
for a shard being unassigned, we should always add one.
James Baiera пре 3 година
родитељ
комит
12bf4b0897

+ 5 - 0
docs/changelog/87079.yaml

@@ -0,0 +1,5 @@
+pr: 87079
+summary: Return a default user action if no actions could be determined
+area: Health
+type: enhancement
+issues: []

+ 12 - 7
server/src/main/java/org/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorService.java

@@ -369,6 +369,7 @@ public class ShardsAvailabilityHealthIndicatorService implements HealthIndicator
      */
     List<UserAction.Definition> diagnoseUnassignedShardRouting(ShardRouting shardRouting, ClusterState state) {
         List<UserAction.Definition> actions = new ArrayList<>();
+        LOGGER.trace("Diagnosing unassigned shard [{}] due to reason [{}]", shardRouting.shardId(), shardRouting.unassignedInfo());
         switch (shardRouting.unassignedInfo().getLastAllocationStatus()) {
             case NO_VALID_SHARD_COPY:
                 if (UnassignedInfo.Reason.NODE_LEFT == shardRouting.unassignedInfo().getReason()) {
@@ -381,6 +382,9 @@ public class ShardsAvailabilityHealthIndicatorService implements HealthIndicator
             default:
                 break;
         }
+        if (actions.isEmpty()) {
+            actions.add(ACTION_CHECK_ALLOCATION_EXPLAIN_API);
+        }
         return actions;
     }
 
@@ -392,7 +396,7 @@ public class ShardsAvailabilityHealthIndicatorService implements HealthIndicator
      * @return a list of actions for the user to take
      */
     private List<UserAction.Definition> explainAllocationsAndDiagnoseDeciders(ShardRouting shardRouting, ClusterState state) {
-        LOGGER.trace("Diagnosing shard [{}]", shardRouting.shardId());
+        LOGGER.trace("Executing allocation explain on shard [{}]", shardRouting.shardId());
         RoutingAllocation allocation = new RoutingAllocation(
             allocationService.getAllocationDeciders(),
             state,
@@ -403,12 +407,13 @@ public class ShardsAvailabilityHealthIndicatorService implements HealthIndicator
         allocation.setDebugMode(RoutingAllocation.DebugMode.ON);
         ShardAllocationDecision shardAllocationDecision = allocationService.explainShardAllocation(shardRouting, allocation);
         AllocateUnassignedDecision allocateDecision = shardAllocationDecision.getAllocateDecision();
-        LOGGER.trace(
-            "[{}]: Obtained decision: [{}/{}]",
-            shardRouting.shardId(),
-            allocateDecision.isDecisionTaken(),
-            allocateDecision.getAllocationDecision()
-        );
+        if (LOGGER.isTraceEnabled()) {
+            if (allocateDecision.isDecisionTaken()) {
+                LOGGER.trace("[{}]: Allocation decision [{}]", shardRouting.shardId(), allocateDecision.getAllocationDecision());
+            } else {
+                LOGGER.trace("[{}]: Decision taken [false]", shardRouting.shardId());
+            }
+        }
         if (allocateDecision.isDecisionTaken() && AllocationDecision.NO == allocateDecision.getAllocationDecision()) {
             if (LOGGER.isTraceEnabled()) {
                 LOGGER.trace(

+ 43 - 40
server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorServiceTests.java

@@ -50,6 +50,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.UUID;
 import java.util.stream.Collectors;
@@ -159,7 +160,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
                             List.of(ImpactArea.SEARCH)
                         )
                     ),
-                    Collections.emptyList()
+                    List.of(new UserAction(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of("yellow-index")))
                 )
             )
         );
@@ -187,7 +188,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
                             List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
                         )
                     ),
-                    Collections.emptyList()
+                    List.of(new UserAction(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of("red-index")))
                 )
             )
         );
@@ -212,7 +213,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
                             List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
                         )
                     ),
-                    Collections.emptyList()
+                    List.of(new UserAction(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of("red-index")))
                 )
             )
         );
@@ -393,7 +394,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
                             List.of(ImpactArea.SEARCH)
                         )
                     ),
-                    Collections.emptyList()
+                    List.of(new UserAction(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of("restarting-index")))
                 )
             )
         );
@@ -470,7 +471,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
                             List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
                         )
                     ),
-                    Collections.emptyList()
+                    List.of(new UserAction(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of("restarting-index")))
                 )
             )
         );
@@ -542,10 +543,10 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
             .numberOfReplicas(0)
             .build();
 
-        // Cluster state with index, but its only shard is unassigned because the allocation deciders said none are allowed
+        // Cluster state with index, but its only shard is unassigned (Either deciders said no, or a node left)
         var clusterState = createClusterStateWith(
             List.of(indexMetadata),
-            List.of(index("red-index", new ShardAllocation(randomNodeId(), UNAVAILABLE, decidersNo()))),
+            List.of(index("red-index", new ShardAllocation(randomNodeId(), UNAVAILABLE, randomFrom(decidersNo(), nodeLeft())))),
             List.of(),
             List.of()
         );
@@ -577,22 +578,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
 
         // Get the list of user actions that are generated for this unassigned index shard
         ShardRouting shardRouting = clusterState.routingTable().index(indexMetadata.getIndex()).shard(0).primaryShard();
-        List<UserAction.Definition> actions = service.diagnoseAllocationResults(
-            shardRouting,
-            clusterState,
-            List.of(
-                new NodeAllocationResult(
-                    new DiscoveryNode(randomNodeId(), buildNewFakeTransportAddress(), Version.CURRENT),
-                    new Decision.Multi().add(Decision.single(Decision.Type.YES, EnableAllocationDecider.NAME, null))
-                        .add(Decision.single(Decision.Type.YES, "data_tier", null))
-                        .add(Decision.single(Decision.Type.YES, ShardsLimitAllocationDecider.NAME, null))
-                        .add(Decision.single(Decision.Type.YES, FilterAllocationDecider.NAME, null))
-                        .add(Decision.single(Decision.Type.YES, SameShardAllocationDecider.NAME, null))
-                        .add(Decision.single(Decision.Type.NO, AwarenessAllocationDecider.NAME, null)), // Unhandled in indicator
-                    1
-                )
-            )
-        );
+        List<UserAction.Definition> actions = service.diagnoseUnassignedShardRouting(shardRouting, clusterState);
 
         assertThat(actions, hasSize(1));
         assertThat(actions, contains(ACTION_CHECK_ALLOCATION_EXPLAIN_API));
@@ -1338,9 +1324,9 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
             shardId,
             primary,
             getSource(primary, allocation.state),
-            allocation.unassignedInfo == null ? new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, null) : allocation.unassignedInfo
+            new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, null)
         );
-        if (allocation.state == UNAVAILABLE || allocation.state == INITIALIZING) {
+        if (allocation.state == INITIALIZING) {
             return routing;
         }
         routing = routing.initialize(allocation.nodeId, null, 0);
@@ -1348,22 +1334,24 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
         if (allocation.state == AVAILABLE) {
             return routing;
         }
-        routing = routing.moveToUnassigned(
-            new UnassignedInfo(
-                UnassignedInfo.Reason.NODE_RESTARTING,
-                null,
-                null,
-                -1,
-                allocation.unassignedTimeNanos != null ? allocation.unassignedTimeNanos : 0,
-                0,
-                false,
-                UnassignedInfo.AllocationStatus.DELAYED_ALLOCATION,
-                Set.of(),
-                allocation.nodeId
-            )
-        );
+        if (allocation.state == UNAVAILABLE) {
+            return routing.moveToUnassigned(Optional.ofNullable(allocation.unassignedInfo).orElse(randomFrom(nodeLeft(), decidersNo())));
+        }
         if (allocation.state == RESTARTING) {
-            return routing;
+            return routing.moveToUnassigned(
+                new UnassignedInfo(
+                    UnassignedInfo.Reason.NODE_RESTARTING,
+                    null,
+                    null,
+                    -1,
+                    allocation.unassignedTimeNanos != null ? allocation.unassignedTimeNanos : 0,
+                    0,
+                    false,
+                    UnassignedInfo.AllocationStatus.DELAYED_ALLOCATION,
+                    Set.of(),
+                    allocation.nodeId
+                )
+            );
         }
 
         throw new AssertionError("Unexpected state [" + allocation.state + "]");
@@ -1422,6 +1410,21 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
         );
     }
 
+    private static UnassignedInfo nodeLeft() {
+        return new UnassignedInfo(
+            UnassignedInfo.Reason.NODE_LEFT,
+            null,
+            null,
+            0,
+            0,
+            0,
+            false,
+            UnassignedInfo.AllocationStatus.NO_ATTEMPT,
+            Collections.emptySet(),
+            null
+        );
+    }
+
     private static UnassignedInfo decidersNo() {
         return new UnassignedInfo(
             UnassignedInfo.Reason.ALLOCATION_FAILED,