Browse Source

Add node id to shard failure message (#28024)

This will help in the allocation explain API to figure out which node a shard was last allocated to before it failed.

Closes #28018
Yannick Welsch 7 years ago
parent
commit
2603391c00

+ 2 - 1
core/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

@@ -181,7 +181,8 @@ public class AllocationService extends AbstractComponent {
                         shardToFail.shardId(), shardToFail, failedShard);
                 }
                 int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
-                UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShardEntry.getMessage(),
+                String message = "failed shard on node [" + shardToFail.currentNodeId() + "]: " + failedShardEntry.getMessage();
+                UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, message,
                     failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false,
                     AllocationStatus.NO_ATTEMPT);
                 routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetaData, allocation.changes());

+ 2 - 2
core/src/test/java/org/elasticsearch/cluster/routing/UnassignedInfoTests.java

@@ -260,8 +260,8 @@ public class UnassignedInfoTests extends ESAllocationTestCase {
         assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(1));
         assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo(), notNullValue());
         assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
-        assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getMessage(), equalTo("test fail"));
-        assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getDetails(), equalTo("test fail"));
+        assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getMessage(), equalTo("failed shard on node [" + shardToFail.currentNodeId() + "]: test fail"));
+        assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getDetails(), equalTo("failed shard on node [" + shardToFail.currentNodeId() + "]: test fail"));
         assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo().getUnassignedTimeInMillis(), greaterThan(0L));
     }
 

+ 10 - 9
core/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java

@@ -43,6 +43,7 @@ import java.util.List;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.not;
 
@@ -98,7 +99,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
             assertEquals(routingTable.index("idx").shards().size(), 1);
             assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), INITIALIZING);
             assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), i+1);
-            assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom" + i);
+            assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom" + i));
         }
         // now we go and check that we are actually stick to unassigned on the next failure
         List<FailedShard> failedShards = Collections.singletonList(
@@ -111,7 +112,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         assertEquals(routingTable.index("idx").shards().size(), 1);
         assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations(), retries);
         assertEquals(routingTable.index("idx").shard(0).shards().get(0).state(), UNASSIGNED);
-        assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom");
+        assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
 
         // manual resetting of retry count
         newState = strategy.reroute(clusterState, new AllocationCommands(), false, true).getClusterState();
@@ -123,7 +124,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         assertEquals(routingTable.index("idx").shards().size(), 1);
         assertEquals(0, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
         assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
-        assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom");
+        assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
 
         // again fail it N-1 times
         for (int i = 0; i < retries-1; i++) {
@@ -138,7 +139,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         assertEquals(routingTable.index("idx").shards().size(), 1);
         assertEquals(i + 1, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
         assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
-        assertEquals(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), "boom");
+        assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
         }
 
         // now we go and check that we are actually stick to unassigned on the next failure
@@ -152,7 +153,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         assertEquals(routingTable.index("idx").shards().size(), 1);
         assertEquals(retries, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
         assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state());
-        assertEquals("boom", routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage());
+        assertThat(routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getMessage(), containsString("boom"));
     }
 
     public void testFailedAllocation() {
@@ -172,7 +173,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
             ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
             assertEquals(unassignedPrimary.state(), INITIALIZING);
             assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), i+1);
-            assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom" + i);
+            assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom" + i));
             // MaxRetryAllocationDecider#canForceAllocatePrimary should return YES decisions because canAllocate returns YES here
             assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
                 unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));
@@ -190,7 +191,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
             ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
             assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries);
             assertEquals(unassignedPrimary.state(), UNASSIGNED);
-            assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom");
+            assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom"));
             // MaxRetryAllocationDecider#canForceAllocatePrimary should return a NO decision because canAllocate returns NO here
             assertEquals(Decision.NO, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
                 unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));
@@ -212,7 +213,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         ShardRouting unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
         assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), retries);
         assertEquals(unassignedPrimary.state(), INITIALIZING);
-        assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "boom");
+        assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("boom"));
         // bumped up the max retry count, so canForceAllocatePrimary should return a YES decision
         assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
             routingTable.index("idx").shard(0).shards().get(0), null, new RoutingAllocation(null, null, clusterState, null, 0)));
@@ -239,7 +240,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         unassignedPrimary = routingTable.index("idx").shard(0).shards().get(0);
         assertEquals(unassignedPrimary.unassignedInfo().getNumFailedAllocations(), 1);
         assertEquals(unassignedPrimary.state(), UNASSIGNED);
-        assertEquals(unassignedPrimary.unassignedInfo().getMessage(), "ZOOOMG");
+        assertThat(unassignedPrimary.unassignedInfo().getMessage(), containsString("ZOOOMG"));
         // Counter reset, so MaxRetryAllocationDecider#canForceAllocatePrimary should return a YES decision
         assertEquals(Decision.YES, new MaxRetryAllocationDecider(Settings.EMPTY).canForceAllocatePrimary(
             unassignedPrimary, null, new RoutingAllocation(null, null, clusterState, null, 0)));