Browse Source

Limit shard relocation retries (#90296)

This change ensures that Elasticsearch does not indefinitely retry relocating a shard if the operation keeps failing.
Ievgen Degtiarenko 3 years ago
parent
commit
24cf87186d
17 changed files with 466 additions and 139 deletions
  1. 5 0
      docs/changelog/90296.yaml
  2. 28 7
      docs/reference/search/search-shards.asciidoc
  3. 27 0
      server/src/internalClusterTest/java/org/elasticsearch/indices/IndicesLifecycleListenerIT.java
  4. 56 0
      server/src/main/java/org/elasticsearch/cluster/routing/RelocationFailureInfo.java
  5. 5 0
      server/src/main/java/org/elasticsearch/cluster/routing/RoutingChangesObserver.java
  6. 40 24
      server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java
  7. 77 8
      server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java
  8. 1 1
      server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
  9. 7 0
      server/src/main/java/org/elasticsearch/cluster/routing/allocation/RoutingNodesChangedObserver.java
  10. 3 1
      server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java
  11. 41 21
      server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java
  12. 18 0
      server/src/test/java/org/elasticsearch/cluster/ClusterStateTests.java
  13. 32 0
      server/src/test/java/org/elasticsearch/cluster/routing/RelocationFailureInfoTests.java
  14. 5 0
      server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java
  15. 106 77
      server/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java
  16. 2 0
      test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java
  17. 13 0
      test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java

+ 5 - 0
docs/changelog/90296.yaml

@@ -0,0 +1,5 @@
+pr: 90296
+summary: Limit shard relocation retries
+area: Allocation
+type: enhancement
+issues: []

+ 28 - 7
docs/reference/search/search-shards.asciidoc

@@ -87,55 +87,70 @@ The API returns the following result:
       {
         "index": "my-index-000001",
         "node": "JklnKbD7Tyqi9TP3_Q_tBg",
+        "relocating_node": null,
         "primary": true,
         "shard": 0,
         "state": "STARTED",
         "allocation_id": {"id":"0TvkCyF7TAmM1wHP4a42-A"},
-        "relocating_node": null
+        "relocation_failure_info" : {
+          "failed_attempts" : 0
+        }
       }
     ],
     [
       {
         "index": "my-index-000001",
         "node": "JklnKbD7Tyqi9TP3_Q_tBg",
+        "relocating_node": null,
         "primary": true,
         "shard": 1,
         "state": "STARTED",
         "allocation_id": {"id":"fMju3hd1QHWmWrIgFnI4Ww"},
-        "relocating_node": null
+        "relocation_failure_info" : {
+          "failed_attempts" : 0
+        }
       }
     ],
     [
       {
         "index": "my-index-000001",
         "node": "JklnKbD7Tyqi9TP3_Q_tBg",
+        "relocating_node": null,
         "primary": true,
         "shard": 2,
         "state": "STARTED",
         "allocation_id": {"id":"Nwl0wbMBTHCWjEEbGYGapg"},
-        "relocating_node": null
+        "relocation_failure_info" : {
+          "failed_attempts" : 0
+        }
       }
     ],
     [
       {
         "index": "my-index-000001",
         "node": "JklnKbD7Tyqi9TP3_Q_tBg",
+        "relocating_node": null,
         "primary": true,
         "shard": 3,
         "state": "STARTED",
         "allocation_id": {"id":"bU_KLGJISbW0RejwnwDPKw"},
-        "relocating_node": null
+        "relocation_failure_info" : {
+          "failed_attempts" : 0
+        }
       }
     ],
     [
       {
         "index": "my-index-000001",
         "node": "JklnKbD7Tyqi9TP3_Q_tBg",
+        "relocating_node": null,
         "primary": true,
         "shard": 4,
         "state": "STARTED",
         "allocation_id": {"id":"DMs7_giNSwmdqVukF7UydA"},
-        "relocating_node": null
+        "relocation_failure_info" : {
+          "failed_attempts" : 0
+        }
       }
     ]
   ]
@@ -171,22 +186,28 @@ The API returns the following result:
       {
         "index": "my-index-000001",
         "node": "JklnKbD7Tyqi9TP3_Q_tBg",
+        "relocating_node": null,
         "primary": true,
         "shard": 2,
         "state": "STARTED",
         "allocation_id": {"id":"fMju3hd1QHWmWrIgFnI4Ww"},
-        "relocating_node": null
+        "relocation_failure_info" : {
+          "failed_attempts" : 0
+        }
       }
     ],
     [
       {
         "index": "my-index-000001",
         "node": "JklnKbD7Tyqi9TP3_Q_tBg",
+        "relocating_node": null,
         "primary": true,
         "shard": 3,
         "state": "STARTED",
         "allocation_id": {"id":"0TvkCyF7TAmM1wHP4a42-A"},
-        "relocating_node": null
+        "relocation_failure_info" : {
+          "failed_attempts" : 0
+        }
       }
     ]
   ]

+ 27 - 0
server/src/internalClusterTest/java/org/elasticsearch/indices/IndicesLifecycleListenerIT.java

@@ -43,6 +43,7 @@ import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_ROUTING_EXCLUDE_GROUP_PREFIX;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
 import static org.elasticsearch.index.shard.IndexShardState.CLOSED;
@@ -137,6 +138,32 @@ public class IndicesLifecycleListenerIT extends ESIntegTestCase {
         assertThat(state.nodes().resolveNode(shard.get(0).currentNodeId()).getName(), Matchers.equalTo(node1));
     }
 
+    public void testRelocationFailureNotRetriedForever() {
+        String node1 = internalCluster().startNode();
+        client().admin()
+            .indices()
+            .prepareCreate("index1")
+            .setSettings(Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 1).put(SETTING_NUMBER_OF_REPLICAS, 0))
+            .get();
+        ensureGreen("index1");
+        for (int i = 0; i < 2; i++) {
+            internalCluster().getInstance(MockIndexEventListener.TestEventListener.class, internalCluster().startNode())
+                .setNewDelegate(new IndexShardStateChangeListener() {
+                    @Override
+                    public void beforeIndexCreated(Index index, Settings indexSettings) {
+                        throw new RuntimeException("FAIL");
+                    }
+                });
+        }
+        assertAcked(
+            client().admin()
+                .indices()
+                .prepareUpdateSettings("index1")
+                .setSettings(Settings.builder().put(INDEX_ROUTING_EXCLUDE_GROUP_PREFIX + "._name", node1))
+        );
+        ensureGreen("index1");
+    }
+
     public void testIndexStateShardChanged() throws Throwable {
         // start with a single node
         String node1 = internalCluster().startNode();

+ 56 - 0
server/src/main/java/org/elasticsearch/cluster/routing/RelocationFailureInfo.java

@@ -0,0 +1,56 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.routing;
+
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.xcontent.ToXContentFragment;
+import org.elasticsearch.xcontent.XContentBuilder;
+
+import java.io.IOException;
+
+/**
+ * Holds information about a shard's failed relocation attempts (currently only the number of failed attempts).
+ */
+public record RelocationFailureInfo(int failedRelocations) implements ToXContentFragment, Writeable {
+
+    public static final RelocationFailureInfo NO_FAILURES = new RelocationFailureInfo(0);
+
+    public RelocationFailureInfo {
+        assert failedRelocations >= 0 : "Expect non-negative failures count, got: " + failedRelocations;
+    }
+
+    public static RelocationFailureInfo readFrom(StreamInput in) throws IOException {
+        int failures = in.readVInt();
+        return failures == 0 ? NO_FAILURES : new RelocationFailureInfo(failures);
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        out.writeVInt(failedRelocations);
+    }
+
+    public RelocationFailureInfo incFailedRelocations() {
+        return new RelocationFailureInfo(failedRelocations + 1);
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject("relocation_failure_info");
+        builder.field("failed_attempts", failedRelocations);
+        builder.endObject();
+        return builder;
+    }
+
+    @Override
+    public String toString() {
+        return "failed_attempts[" + failedRelocations + "]";
+    }
+}

+ 5 - 0
server/src/main/java/org/elasticsearch/cluster/routing/RoutingChangesObserver.java

@@ -32,6 +32,11 @@ public interface RoutingChangesObserver {
      */
     default void unassignedInfoUpdated(ShardRouting unassignedShard, UnassignedInfo newUnassignedInfo) {}
 
+    /**
+     * Called when a relocating shard's failure information was updated
+     */
+    default void relocationFailureInfoUpdated(ShardRouting relocatedShard, RelocationFailureInfo relocationFailureInfo) {}
+
     /**
      * Called when a shard is failed or cancelled.
      */

+ 40 - 24
server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java

@@ -990,30 +990,6 @@ public class RoutingNodes extends AbstractCollection<RoutingNode> {
             ignored.add(shard);
         }
 
-        public void resetFailedAllocationCounter(RoutingChangesObserver routingChangesObserver) {
-            final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = iterator();
-            while (unassignedIterator.hasNext()) {
-                ShardRouting shardRouting = unassignedIterator.next();
-                UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
-                unassignedIterator.updateUnassigned(
-                    new UnassignedInfo(
-                        unassignedInfo.getNumFailedAllocations() > 0 ? UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.getReason(),
-                        unassignedInfo.getMessage(),
-                        unassignedInfo.getFailure(),
-                        0,
-                        unassignedInfo.getUnassignedTimeInNanos(),
-                        unassignedInfo.getUnassignedTimeInMillis(),
-                        unassignedInfo.isDelayed(),
-                        unassignedInfo.getLastAllocationStatus(),
-                        Collections.emptySet(),
-                        unassignedInfo.getLastAllocatedNodeId()
-                    ),
-                    shardRouting.recoverySource(),
-                    routingChangesObserver
-                );
-            }
-        }
-
         public class UnassignedIterator implements Iterator<ShardRouting>, ExistingShardsAllocator.UnassignedAllocationHandler {
 
             private final ListIterator<ShardRouting> iterator;
@@ -1293,6 +1269,46 @@ public class RoutingNodes extends AbstractCollection<RoutingNode> {
         }
     }
 
+    public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
+        final var unassignedIterator = unassigned().iterator();
+        while (unassignedIterator.hasNext()) {
+            ShardRouting shardRouting = unassignedIterator.next();
+            UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
+            unassignedIterator.updateUnassigned(
+                new UnassignedInfo(
+                    unassignedInfo.getNumFailedAllocations() > 0 ? UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.getReason(),
+                    unassignedInfo.getMessage(),
+                    unassignedInfo.getFailure(),
+                    0,
+                    unassignedInfo.getUnassignedTimeInNanos(),
+                    unassignedInfo.getUnassignedTimeInMillis(),
+                    unassignedInfo.isDelayed(),
+                    unassignedInfo.getLastAllocationStatus(),
+                    Collections.emptySet(),
+                    unassignedInfo.getLastAllocatedNodeId()
+                ),
+                shardRouting.recoverySource(),
+                routingChangesObserver
+            );
+        }
+
+        for (RoutingNode routingNode : this) {
+            var shardsWithRelocationFailures = new ArrayList<ShardRouting>();
+            for (ShardRouting shardRouting : routingNode) {
+                if (shardRouting.relocationFailureInfo() != null && shardRouting.relocationFailureInfo().failedRelocations() > 0) {
+                    shardsWithRelocationFailures.add(shardRouting);
+                }
+            }
+
+            for (ShardRouting original : shardsWithRelocationFailures) {
+                ShardRouting updated = original.updateRelocationFailure(RelocationFailureInfo.NO_FAILURES);
+                routingNode.update(original, updated);
+                assignedShardsRemove(original);
+                assignedShardsAdd(updated);
+            }
+        }
+    }
+
     /**
      * Creates an iterator over shards interleaving between nodes: The iterator returns the first shard from
      * the first node, then the first shard of the second node, etc. until one shard from each node has been returned.

+ 77 - 8
server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java

@@ -36,6 +36,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
      */
     public static final long UNAVAILABLE_EXPECTED_SHARD_SIZE = -1;
     private static final Version EXPECTED_SHARD_SIZE_FOR_STARTED_VERSION = Version.V_8_5_0;
+    private static final Version RELOCATION_FAILURE_INFO_VERSION = Version.V_8_6_0;
 
     private final ShardId shardId;
     private final String currentNodeId;
@@ -49,8 +50,11 @@ public final class ShardRouting implements Writeable, ToXContentObject {
     private final String relocatingNodeId;
     private final boolean primary;
     private final ShardRoutingState state;
+    @Nullable
     private final RecoverySource recoverySource;
+    @Nullable
     private final UnassignedInfo unassignedInfo;
+    private final RelocationFailureInfo relocationFailureInfo;
     private final AllocationId allocationId;
     private final long expectedShardSize;
     @Nullable
@@ -68,6 +72,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
         ShardRoutingState state,
         RecoverySource recoverySource,
         UnassignedInfo unassignedInfo,
+        RelocationFailureInfo relocationFailureInfo,
         AllocationId allocationId,
         long expectedShardSize
     ) {
@@ -78,10 +83,12 @@ public final class ShardRouting implements Writeable, ToXContentObject {
         this.state = state;
         this.recoverySource = recoverySource;
         this.unassignedInfo = unassignedInfo;
+        this.relocationFailureInfo = relocationFailureInfo;
         this.allocationId = allocationId;
         this.expectedShardSize = expectedShardSize;
         this.targetRelocatingShard = initializeTargetRelocatingShard();
         assert (state == ShardRoutingState.UNASSIGNED && unassignedInfo == null) == false : "unassigned shard must be created with meta";
+        assert relocationFailureInfo != null : "relocation failure info must be always set";
         assert (state == ShardRoutingState.UNASSIGNED || state == ShardRoutingState.INITIALIZING) == (recoverySource != null)
             : "recovery source only available on unassigned or initializing shard but was " + state;
         assert recoverySource == null || recoverySource == PeerRecoverySource.INSTANCE || primary
@@ -103,6 +110,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
                 ShardRoutingState.INITIALIZING,
                 PeerRecoverySource.INSTANCE,
                 unassignedInfo,
+                RelocationFailureInfo.NO_FAILURES,
                 AllocationId.newTargetRelocation(allocationId),
                 expectedShardSize
             );
@@ -128,6 +136,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             ShardRoutingState.UNASSIGNED,
             recoverySource,
             unassignedInfo,
+            RelocationFailureInfo.NO_FAILURES,
             null,
             UNAVAILABLE_EXPECTED_SHARD_SIZE
         );
@@ -241,6 +250,11 @@ public final class ShardRouting implements Writeable, ToXContentObject {
         return unassignedInfo;
     }
 
+    @Nullable
+    public RelocationFailureInfo relocationFailureInfo() {
+        return relocationFailureInfo;
+    }
+
     /**
      * An id that uniquely identifies an allocation.
      */
@@ -289,16 +303,19 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             recoverySource = null;
         }
         unassignedInfo = in.readOptionalWriteable(UnassignedInfo::new);
+        if (in.getVersion().onOrAfter(RELOCATION_FAILURE_INFO_VERSION)) {
+            relocationFailureInfo = RelocationFailureInfo.readFrom(in);
+        } else {
+            relocationFailureInfo = RelocationFailureInfo.NO_FAILURES;
+        }
         allocationId = in.readOptionalWriteable(AllocationId::new);
-        final long shardSize;
         if (state == ShardRoutingState.RELOCATING
             || state == ShardRoutingState.INITIALIZING
             || (state == ShardRoutingState.STARTED && in.getVersion().onOrAfter(EXPECTED_SHARD_SIZE_FOR_STARTED_VERSION))) {
-            shardSize = in.readLong();
+            expectedShardSize = in.readLong();
         } else {
-            shardSize = UNAVAILABLE_EXPECTED_SHARD_SIZE;
+            expectedShardSize = UNAVAILABLE_EXPECTED_SHARD_SIZE;
         }
-        expectedShardSize = shardSize;
         targetRelocatingShard = initializeTargetRelocatingShard();
     }
 
@@ -321,6 +338,9 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             recoverySource.writeTo(out);
         }
         out.writeOptionalWriteable(unassignedInfo);
+        if (out.getVersion().onOrAfter(RELOCATION_FAILURE_INFO_VERSION)) {
+            relocationFailureInfo.writeTo(out);
+        }
         out.writeOptionalWriteable(allocationId);
         if (state == ShardRoutingState.RELOCATING
             || state == ShardRoutingState.INITIALIZING
@@ -336,7 +356,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
     }
 
     public ShardRouting updateUnassigned(UnassignedInfo unassignedInfo, RecoverySource recoverySource) {
-        assert this.unassignedInfo != null : "can only update unassign info if they are already set";
+        assert this.unassignedInfo != null : "can only update unassigned info if it is already set";
         assert this.unassignedInfo.isDelayed() || (unassignedInfo.isDelayed() == false) : "cannot transition from non-delayed to delayed";
         return new ShardRouting(
             shardId,
@@ -346,6 +366,23 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             state,
             recoverySource,
             unassignedInfo,
+            relocationFailureInfo,
+            allocationId,
+            expectedShardSize
+        );
+    }
+
+    public ShardRouting updateRelocationFailure(RelocationFailureInfo relocationFailureInfo) {
+        assert this.relocationFailureInfo != null : "can only update relocation failure info info if it is already set";
+        return new ShardRouting(
+            shardId,
+            currentNodeId,
+            relocatingNodeId,
+            primary,
+            state,
+            recoverySource,
+            unassignedInfo,
+            relocationFailureInfo,
             allocationId,
             expectedShardSize
         );
@@ -374,6 +411,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             ShardRoutingState.UNASSIGNED,
             recoverySource,
             unassignedInfo,
+            RelocationFailureInfo.NO_FAILURES,
             null,
             UNAVAILABLE_EXPECTED_SHARD_SIZE
         );
@@ -401,6 +439,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             ShardRoutingState.INITIALIZING,
             recoverySource,
             unassignedInfo,
+            RelocationFailureInfo.NO_FAILURES,
             allocationId,
             expectedShardSize
         );
@@ -421,6 +460,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             ShardRoutingState.RELOCATING,
             recoverySource,
             null,
+            relocationFailureInfo,
             AllocationId.newRelocation(allocationId),
             expectedShardSize
         );
@@ -442,6 +482,25 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             ShardRoutingState.STARTED,
             recoverySource,
             null,
+            relocationFailureInfo.incFailedRelocations(),
+            AllocationId.cancelRelocation(allocationId),
+            UNAVAILABLE_EXPECTED_SHARD_SIZE
+        );
+    }
+
+    public ShardRouting asBeforeRelocation() {
+        assert state == ShardRoutingState.RELOCATING : this;
+        assert assignedToNode() : this;
+        assert relocatingNodeId != null : this;
+        return new ShardRouting(
+            shardId,
+            currentNodeId,
+            null,
+            primary,
+            ShardRoutingState.STARTED,
+            recoverySource,
+            null,
+            RelocationFailureInfo.NO_FAILURES,
             AllocationId.cancelRelocation(allocationId),
             UNAVAILABLE_EXPECTED_SHARD_SIZE
         );
@@ -465,6 +524,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             state,
             recoverySource,
             unassignedInfo,
+            relocationFailureInfo,
             AllocationId.finishRelocation(allocationId),
             expectedShardSize
         );
@@ -485,6 +545,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             ShardRoutingState.INITIALIZING,
             recoverySource,
             unassignedInfo,
+            relocationFailureInfo,
             AllocationId.newInitializing(),
             expectedShardSize
         );
@@ -511,6 +572,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             ShardRoutingState.STARTED,
             null,
             null,
+            RelocationFailureInfo.NO_FAILURES,
             allocationId,
             expectedShardSize
         );
@@ -534,6 +596,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             state,
             recoverySource,
             unassignedInfo,
+            relocationFailureInfo,
             allocationId,
             expectedShardSize
         );
@@ -557,6 +620,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             state,
             PeerRecoverySource.INSTANCE,
             unassignedInfo,
+            relocationFailureInfo,
             allocationId,
             expectedShardSize
         );
@@ -699,7 +763,9 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             return false;
         }
         ShardRouting that = (ShardRouting) o;
-        return Objects.equals(unassignedInfo, that.unassignedInfo) && equalsIgnoringMetadata(that);
+        return equalsIgnoringMetadata(that)
+            && Objects.equals(unassignedInfo, that.unassignedInfo)
+            && Objects.equals(relocationFailureInfo, that.relocationFailureInfo);
     }
 
     /**
@@ -720,6 +786,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
             h = 31 * h + (recoverySource != null ? recoverySource.hashCode() : 0);
             h = 31 * h + (allocationId != null ? allocationId.hashCode() : 0);
             h = 31 * h + (unassignedInfo != null ? unassignedInfo.hashCode() : 0);
+            h = 31 * h + (relocationFailureInfo != null ? relocationFailureInfo.hashCode() : 0);
             hashCode = h;
         }
         return h;
@@ -752,9 +819,10 @@ public final class ShardRouting implements Writeable, ToXContentObject {
         if (allocationId != null) {
             sb.append(", a").append(allocationId);
         }
-        if (this.unassignedInfo != null) {
-            sb.append(", ").append(unassignedInfo.toString());
+        if (unassignedInfo != null) {
+            sb.append(", ").append(unassignedInfo);
         }
+        sb.append(", ").append(relocationFailureInfo);
         if (expectedShardSize != UNAVAILABLE_EXPECTED_SHARD_SIZE) {
             sb.append(", expected_shard_size[").append(expectedShardSize).append("]");
         }
@@ -783,6 +851,7 @@ public final class ShardRouting implements Writeable, ToXContentObject {
         if (unassignedInfo != null) {
             unassignedInfo.toXContent(builder, params);
         }
+        relocationFailureInfo.toXContent(builder, params);
         return builder.endObject();
     }
 

+ 1 - 1
server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

@@ -387,7 +387,7 @@ public class AllocationService {
         allocation.ignoreDisable(true);
 
         if (retryFailed) {
-            allocation.routingNodes().unassigned().resetFailedAllocationCounter(allocation.changes());
+            allocation.routingNodes().resetFailedCounter(allocation.changes());
         }
 
         RoutingExplanations explanations = commands.execute(allocation, explain);

+ 7 - 0
server/src/main/java/org/elasticsearch/cluster/routing/allocation/RoutingNodesChangedObserver.java

@@ -8,6 +8,7 @@
 
 package org.elasticsearch.cluster.routing.allocation;
 
+import org.elasticsearch.cluster.routing.RelocationFailureInfo;
 import org.elasticsearch.cluster.routing.RoutingChangesObserver;
 import org.elasticsearch.cluster.routing.RoutingNodes;
 import org.elasticsearch.cluster.routing.ShardRouting;
@@ -53,6 +54,12 @@ public class RoutingNodesChangedObserver implements RoutingChangesObserver {
         setChanged();
     }
 
+    @Override
+    public void relocationFailureInfoUpdated(ShardRouting relocatedShard, RelocationFailureInfo relocationFailureInfo) {
+        assert relocatedShard.active() : "expected active shard " + relocatedShard;
+        setChanged();
+    }
+
     @Override
     public void shardFailed(ShardRouting failedShard, UnassignedInfo unassignedInfo) {
         assert failedShard.assignedToNode() : "expected assigned shard " + failedShard;

+ 3 - 1
server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java

@@ -162,9 +162,11 @@ public class DiskThresholdDecider extends AllocationDecider {
         if (subtractShardsMovingAway) {
             for (ShardRouting routing : node.relocating()) {
                 String actualPath = clusterInfo.getDataPath(routing);
+                // This branch is needed as the map key contains some information that might have changed (eg shard status)
+                // It could be removed once https://github.com/elastic/elasticsearch/issues/90109 is fixed
                 if (actualPath == null) {
                     // we might know the path of this shard from before when it was relocating
-                    actualPath = clusterInfo.getDataPath(routing.cancelRelocation());
+                    actualPath = clusterInfo.getDataPath(routing.asBeforeRelocation());
                 }
                 if (dataPath.equals(actualPath)) {
                     totalSize -= getExpectedShardSize(routing, 0L, clusterInfo, null, metadata, routingTable);

+ 41 - 21
server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java

@@ -8,7 +8,7 @@
 
 package org.elasticsearch.cluster.routing.allocation.decider;
 
-import org.elasticsearch.cluster.metadata.IndexMetadata;
+import org.elasticsearch.cluster.routing.RelocationFailureInfo;
 import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.UnassignedInfo;
@@ -40,35 +40,34 @@ public class MaxRetryAllocationDecider extends AllocationDecider {
 
     @Override
     public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) {
-        final UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
+        final int maxRetries = SETTING_ALLOCATION_MAX_RETRY.get(allocation.metadata().getIndexSafe(shardRouting.index()).getSettings());
+
+        final var unassignedInfo = shardRouting.unassignedInfo();
         final int numFailedAllocations = unassignedInfo == null ? 0 : unassignedInfo.getNumFailedAllocations();
         if (numFailedAllocations > 0) {
-            return decisionWithFailures(shardRouting, allocation, unassignedInfo, numFailedAllocations);
+            final var decision = numFailedAllocations >= maxRetries ? Decision.NO : Decision.YES;
+            return allocation.debugDecision() ? debugDecision(decision, unassignedInfo, numFailedAllocations, maxRetries) : decision;
+        }
+
+        final var relocationFailureInfo = shardRouting.relocationFailureInfo();
+        final int numFailedRelocations = relocationFailureInfo == null ? 0 : relocationFailureInfo.failedRelocations();
+        if (numFailedRelocations > 0) {
+            final var decision = numFailedRelocations >= maxRetries ? Decision.NO : Decision.YES;
+            return allocation.debugDecision() ? debugDecision(decision, relocationFailureInfo, numFailedRelocations, maxRetries) : decision;
         }
-        return YES_NO_FAILURES;
-    }
 
-    private static Decision decisionWithFailures(
-        ShardRouting shardRouting,
-        RoutingAllocation allocation,
-        UnassignedInfo unassignedInfo,
-        int numFailedAllocations
-    ) {
-        final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index());
-        final int maxRetry = SETTING_ALLOCATION_MAX_RETRY.get(indexMetadata.getSettings());
-        final Decision res = numFailedAllocations >= maxRetry ? Decision.NO : Decision.YES;
-        return allocation.debugDecision() ? debugDecision(res, unassignedInfo, numFailedAllocations, maxRetry) : res;
+        return YES_NO_FAILURES;
     }
 
-    private static Decision debugDecision(Decision decision, UnassignedInfo unassignedInfo, int numFailedAllocations, int maxRetry) {
+    private static Decision debugDecision(Decision decision, UnassignedInfo info, int numFailedAllocations, int maxRetries) {
         if (decision.type() == Decision.Type.NO) {
             return Decision.single(
                 Decision.Type.NO,
                 NAME,
-                "shard has exceeded the maximum number of retries [%d] on "
-                    + "failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [%s]",
-                maxRetry,
-                unassignedInfo.toString()
+                "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - "
+                    + "manually call [/_cluster/reroute?retry_failed=true] to retry, [%s]",
+                maxRetries,
+                info.toString()
             );
         } else {
             return Decision.single(
@@ -76,7 +75,28 @@ public class MaxRetryAllocationDecider extends AllocationDecider {
                 NAME,
                 "shard has failed allocating [%d] times but [%d] retries are allowed",
                 numFailedAllocations,
-                maxRetry
+                maxRetries
+            );
+        }
+    }
+
+    /**
+     * Builds the explained form of a relocation-retry decision.
+     * <p>
+     * A {@code NO} decision is reported as "maximum number of retries exceeded on failed relocation attempts",
+     * together with {@code maxRetries}, the rendered failure info, and a pointer to the manual
+     * {@code retry_failed} reroute. Any other decision type is reported as {@code YES} along with the number of
+     * failed relocations so far and the number of retries allowed.
+     *
+     * @param decision             the plain decision already computed by the caller; only its type is inspected
+     * @param info                 relocation failure details, rendered via {@code toString()} into the NO explanation
+     * @param numFailedRelocations number of failed relocation attempts, shown in the YES explanation
+     * @param maxRetries           the retry limit, shown in both explanations
+     * @return a single decision attributed to {@code NAME} that carries a human-readable explanation
+     */
+    private static Decision debugDecision(Decision decision, RelocationFailureInfo info, int numFailedRelocations, int maxRetries) {
+        if (decision.type() == Decision.Type.NO) {
+            return Decision.single(
+                Decision.Type.NO,
+                NAME,
+                "shard has exceeded the maximum number of retries [%d] on failed relocation attempts - "
+                    + "manually call [/_cluster/reroute?retry_failed=true] to retry, [%s]",
+                maxRetries,
+                info.toString()
+            );
+        } else {
+            return Decision.single(
+                Decision.Type.YES,
+                NAME,
+                "shard has failed relocating [%d] times but [%d] retries are allowed",
+                numFailedRelocations,
+                maxRetries
             );
         }
     }

+ 18 - 0
server/src/test/java/org/elasticsearch/cluster/ClusterStateTests.java

@@ -301,6 +301,9 @@ public class ClusterStateTests extends ESTestCase {
                           "index": "index",
                           "allocation_id": {
                             "id": "%s"
+                          },
+                          "relocation_failure_info" : {
+                            "failed_attempts" : 0
                           }
                         }
                       ]
@@ -321,6 +324,9 @@ public class ClusterStateTests extends ESTestCase {
                       "index": "index",
                       "allocation_id": {
                         "id": "%s"
+                      },
+                      "relocation_failure_info" : {
+                        "failed_attempts" : 0
                       }
                     }
                   ],
@@ -505,6 +511,9 @@ public class ClusterStateTests extends ESTestCase {
                           "index" : "index",
                           "allocation_id" : {
                             "id" : "%s"
+                          },
+                          "relocation_failure_info" : {
+                            "failed_attempts" : 0
                           }
                         }
                       ]
@@ -525,6 +534,9 @@ public class ClusterStateTests extends ESTestCase {
                       "index" : "index",
                       "allocation_id" : {
                         "id" : "%s"
+                      },
+                      "relocation_failure_info" : {
+                        "failed_attempts" : 0
                       }
                     }
                   ],
@@ -716,6 +728,9 @@ public class ClusterStateTests extends ESTestCase {
                           "index" : "index",
                           "allocation_id" : {
                             "id" : "%s"
+                          },
+                          "relocation_failure_info" : {
+                            "failed_attempts" : 0
                           }
                         }
                       ]
@@ -736,6 +751,9 @@ public class ClusterStateTests extends ESTestCase {
                       "index" : "index",
                       "allocation_id" : {
                         "id" : "%s"
+                      },
+                      "relocation_failure_info" : {
+                        "failed_attempts" : 0
                       }
                     }
                   ],

+ 32 - 0
server/src/test/java/org/elasticsearch/cluster/routing/RelocationFailureInfoTests.java

@@ -0,0 +1,32 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.cluster.routing;
+
+import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.test.AbstractWireSerializingTestCase;
+
+import java.io.IOException;
+
+/**
+ * Wire-serialization test for {@link RelocationFailureInfo}, driven by {@link AbstractWireSerializingTestCase}.
+ */
+public class RelocationFailureInfoTests extends AbstractWireSerializingTestCase<RelocationFailureInfo> {
+
+    // Instances are read back from the wire through RelocationFailureInfo#readFrom.
+    @Override
+    protected Writeable.Reader<RelocationFailureInfo> instanceReader() {
+        return RelocationFailureInfo::readFrom;
+    }
+
+    // Produces an instance with between 1 and 10 failed relocations; a zero count is never generated here.
+    @Override
+    protected RelocationFailureInfo createTestInstance() {
+        return new RelocationFailureInfo(randomIntBetween(1, 10));
+    }
+
+    // Bumps the failure count by one so the mutated instance carries a different count than the input.
+    @Override
+    protected RelocationFailureInfo mutateInstance(RelocationFailureInfo instance) throws IOException {
+        return new RelocationFailureInfo(instance.failedRelocations() + 1);
+    }
+}

+ 5 - 0
server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java

@@ -136,6 +136,7 @@ public class ShardRoutingTests extends ESTestCase {
                         otherRouting.state(),
                         otherRouting.recoverySource(),
                         otherRouting.unassignedInfo(),
+                        otherRouting.relocationFailureInfo(),
                         otherRouting.allocationId(),
                         otherRouting.getExpectedShardSize()
                     );
@@ -150,6 +151,7 @@ public class ShardRoutingTests extends ESTestCase {
                         otherRouting.state(),
                         otherRouting.recoverySource(),
                         otherRouting.unassignedInfo(),
+                        otherRouting.relocationFailureInfo(),
                         otherRouting.allocationId(),
                         otherRouting.getExpectedShardSize()
                     );
@@ -167,6 +169,7 @@ public class ShardRoutingTests extends ESTestCase {
                             otherRouting.state(),
                             otherRouting.recoverySource(),
                             otherRouting.unassignedInfo(),
+                            otherRouting.relocationFailureInfo(),
                             otherRouting.allocationId(),
                             otherRouting.getExpectedShardSize()
                         );
@@ -185,6 +188,7 @@ public class ShardRoutingTests extends ESTestCase {
                             otherRouting.state(),
                             otherRouting.recoverySource(),
                             otherRouting.unassignedInfo(),
+                            otherRouting.relocationFailureInfo(),
                             otherRouting.allocationId(),
                             otherRouting.getExpectedShardSize()
                         );
@@ -208,6 +212,7 @@ public class ShardRoutingTests extends ESTestCase {
                                 new IndexId("test", UUIDs.randomBase64UUID(random()))
                             ),
                             otherRouting.unassignedInfo(),
+                            otherRouting.relocationFailureInfo(),
                             otherRouting.allocationId(),
                             otherRouting.getExpectedShardSize()
                         );

+ 106 - 77
server/src/test/java/org/elasticsearch/cluster/routing/allocation/MaxRetryAllocationDeciderTests.java

@@ -9,6 +9,7 @@
 package org.elasticsearch.cluster.routing.allocation;
 
 import org.elasticsearch.Version;
+import org.elasticsearch.cluster.ClusterInfo;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ESAllocationTestCase;
@@ -25,10 +26,12 @@ import org.elasticsearch.cluster.routing.allocation.decider.Decision;
 import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.snapshots.EmptySnapshotsInfoService;
+import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
 import org.elasticsearch.test.gateway.TestGatewayAllocator;
 
-import java.util.Collections;
 import java.util.List;
+import java.util.Objects;
+import java.util.function.Consumer;
 
 import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
@@ -39,44 +42,34 @@ import static org.hamcrest.Matchers.not;
 
 public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
 
-    private AllocationService strategy;
-
-    @Override
-    public void setUp() throws Exception {
-        super.setUp();
-        strategy = new AllocationService(
-            new AllocationDeciders(Collections.singleton(new MaxRetryAllocationDecider())),
-            new TestGatewayAllocator(),
-            new BalancedShardsAllocator(Settings.EMPTY),
-            EmptyClusterInfoService.INSTANCE,
-            EmptySnapshotsInfoService.INSTANCE
-        );
-    }
+    private final MaxRetryAllocationDecider decider = new MaxRetryAllocationDecider();
+    private final AllocationService strategy = new AllocationService(
+        new AllocationDeciders(List.of(decider)),
+        new TestGatewayAllocator(),
+        new BalancedShardsAllocator(Settings.EMPTY),
+        EmptyClusterInfoService.INSTANCE,
+        EmptySnapshotsInfoService.INSTANCE
+    );
 
     private ClusterState createInitialClusterState() {
-        Metadata.Builder metaBuilder = Metadata.builder();
-        metaBuilder.put(IndexMetadata.builder("idx").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0));
-        Metadata metadata = metaBuilder.build();
-        RoutingTable.Builder routingTableBuilder = RoutingTable.builder();
-        routingTableBuilder.addAsNew(metadata.index("idx"));
+        Metadata metadata = Metadata.builder()
+            .put(IndexMetadata.builder("idx").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0))
+            .build();
+        RoutingTable routingTable = RoutingTable.builder().addAsNew(metadata.index("idx")).build();
 
-        RoutingTable routingTable = routingTableBuilder.build();
         ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
             .metadata(metadata)
             .routingTable(routingTable)
-            .build();
-        clusterState = ClusterState.builder(clusterState)
             .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")))
             .build();
-        RoutingTable prevRoutingTable = routingTable;
-        routingTable = strategy.reroute(clusterState, "reroute").routingTable();
-        clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
 
-        assertEquals(prevRoutingTable.index("idx").size(), 1);
-        assertEquals(prevRoutingTable.index("idx").shard(0).shard(0).state(), UNASSIGNED);
+        assertEquals(clusterState.routingTable().index("idx").size(), 1);
+        assertEquals(clusterState.routingTable().index("idx").shard(0).shard(0).state(), UNASSIGNED);
 
-        assertEquals(routingTable.index("idx").size(), 1);
-        assertEquals(routingTable.index("idx").shard(0).shard(0).state(), INITIALIZING);
+        clusterState = strategy.reroute(clusterState, "reroute");
+
+        assertEquals(clusterState.routingTable().index("idx").size(), 1);
+        assertEquals(clusterState.routingTable().index("idx").shard(0).shard(0).state(), INITIALIZING);
         return clusterState;
     }
 
@@ -86,15 +79,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         final int retries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY);
         // now fail it N-1 times
         for (int i = 0; i < retries - 1; i++) {
-            List<FailedShard> failedShards = Collections.singletonList(
-                new FailedShard(
-                    routingTable.index("idx").shard(0).shard(0),
-                    "boom" + i,
-                    new UnsupportedOperationException(),
-                    randomBoolean()
-                )
-            );
-            ClusterState newState = strategy.applyFailedShards(clusterState, failedShards, List.of());
+            ClusterState newState = applyShardFailure(clusterState, routingTable.index("idx").shard(0).shard(0), "boom" + i);
             assertThat(newState, not(equalTo(clusterState)));
             clusterState = newState;
             routingTable = newState.routingTable();
@@ -104,10 +89,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
             assertThat(routingTable.index("idx").shard(0).shard(0).unassignedInfo().getMessage(), containsString("boom" + i));
         }
         // now check that the shard actually stays unassigned after the next failure
-        List<FailedShard> failedShards = Collections.singletonList(
-            new FailedShard(routingTable.index("idx").shard(0).shard(0), "boom", new UnsupportedOperationException(), randomBoolean())
-        );
-        ClusterState newState = strategy.applyFailedShards(clusterState, failedShards, List.of());
+        ClusterState newState = applyShardFailure(clusterState, routingTable.index("idx").shard(0).shard(0), "boom");
         assertThat(newState, not(equalTo(clusterState)));
         clusterState = newState;
         routingTable = newState.routingTable();
@@ -130,11 +112,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
 
         // again fail it N-1 times
         for (int i = 0; i < retries - 1; i++) {
-            failedShards = Collections.singletonList(
-                new FailedShard(routingTable.index("idx").shard(0).shard(0), "boom", new UnsupportedOperationException(), randomBoolean())
-            );
-
-            newState = strategy.applyFailedShards(clusterState, failedShards, List.of());
+            newState = applyShardFailure(clusterState, routingTable.index("idx").shard(0).shard(0), "boom");
             assertThat(newState, not(equalTo(clusterState)));
             clusterState = newState;
             routingTable = newState.routingTable();
@@ -145,10 +123,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         }
 
         // now check that the shard actually stays unassigned after the next failure
-        failedShards = Collections.singletonList(
-            new FailedShard(routingTable.index("idx").shard(0).shard(0), "boom", new UnsupportedOperationException(), randomBoolean())
-        );
-        newState = strategy.applyFailedShards(clusterState, failedShards, List.of());
+        newState = applyShardFailure(clusterState, routingTable.index("idx").shard(0).shard(0), "boom");
         assertThat(newState, not(equalTo(clusterState)));
         clusterState = newState;
         routingTable = newState.routingTable();
@@ -164,15 +139,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         final int retries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY);
         // now fail it N-1 times
         for (int i = 0; i < retries - 1; i++) {
-            List<FailedShard> failedShards = Collections.singletonList(
-                new FailedShard(
-                    routingTable.index("idx").shard(0).shard(0),
-                    "boom" + i,
-                    new UnsupportedOperationException(),
-                    randomBoolean()
-                )
-            );
-            ClusterState newState = strategy.applyFailedShards(clusterState, failedShards, List.of());
+            ClusterState newState = applyShardFailure(clusterState, routingTable.index("idx").shard(0).shard(0), "boom" + i);
             assertThat(newState, not(equalTo(clusterState)));
             clusterState = newState;
             routingTable = newState.routingTable();
@@ -184,15 +151,12 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
             // MaxRetryAllocationDecider#canForceAllocatePrimary should return YES decisions because canAllocate returns YES here
             assertEquals(
                 Decision.Type.YES,
-                new MaxRetryAllocationDecider().canForceAllocatePrimary(unassignedPrimary, null, newRoutingAllocation(clusterState)).type()
+                decider.canForceAllocatePrimary(unassignedPrimary, null, newRoutingAllocation(clusterState)).type()
             );
         }
         // now check that the shard actually stays unassigned after the next failure
         {
-            List<FailedShard> failedShards = Collections.singletonList(
-                new FailedShard(routingTable.index("idx").shard(0).shard(0), "boom", new UnsupportedOperationException(), randomBoolean())
-            );
-            ClusterState newState = strategy.applyFailedShards(clusterState, failedShards, List.of());
+            ClusterState newState = applyShardFailure(clusterState, routingTable.index("idx").shard(0).shard(0), "boom");
             assertThat(newState, not(equalTo(clusterState)));
             clusterState = newState;
             routingTable = newState.routingTable();
@@ -204,7 +168,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
             // MaxRetryAllocationDecider#canForceAllocatePrimary should return a NO decision because canAllocate returns NO here
             assertEquals(
                 Decision.Type.NO,
-                new MaxRetryAllocationDecider().canForceAllocatePrimary(unassignedPrimary, null, newRoutingAllocation(clusterState)).type()
+                decider.canForceAllocatePrimary(unassignedPrimary, null, newRoutingAllocation(clusterState)).type()
             );
         }
 
@@ -240,11 +204,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         // bumped up the max retry count, so canForceAllocatePrimary should return a YES decision
         assertEquals(
             Decision.Type.YES,
-            new MaxRetryAllocationDecider().canForceAllocatePrimary(
-                routingTable.index("idx").shard(0).shard(0),
-                null,
-                newRoutingAllocation(clusterState)
-            ).type()
+            decider.canForceAllocatePrimary(routingTable.index("idx").shard(0).shard(0), null, newRoutingAllocation(clusterState)).type()
         );
 
         // now we start the shard
@@ -257,10 +217,7 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         assertEquals(routingTable.index("idx").shard(0).shard(0).state(), STARTED);
 
         // now fail again and see if it has a new counter
-        List<FailedShard> failedShards = Collections.singletonList(
-            new FailedShard(routingTable.index("idx").shard(0).shard(0), "ZOOOMG", new UnsupportedOperationException(), randomBoolean())
-        );
-        newState = strategy.applyFailedShards(clusterState, failedShards, List.of());
+        newState = applyShardFailure(clusterState, routingTable.index("idx").shard(0).shard(0), "ZOOOMG");
         assertThat(newState, not(equalTo(clusterState)));
         clusterState = newState;
         routingTable = newState.routingTable();
@@ -272,10 +229,83 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         // Counter reset, so MaxRetryAllocationDecider#canForceAllocatePrimary should return a YES decision
         assertEquals(
             Decision.Type.YES,
-            new MaxRetryAllocationDecider().canForceAllocatePrimary(unassignedPrimary, null, newRoutingAllocation(clusterState)).type()
+            decider.canForceAllocatePrimary(unassignedPrimary, null, newRoutingAllocation(clusterState)).type()
+        );
+    }
+
+    /**
+     * Verifies that failed relocations are counted per shard and capped by the max-retry setting.
+     * <p>
+     * For each allowed retry the test checks that the decider still answers YES, relocates the shard to the other
+     * node, fails the relocation target, and asserts that the source copy's failed-relocation counter grew by one.
+     * Once the counter has reached the limit the decider must answer NO; after the reroute call that resets the
+     * retry count it must answer YES again.
+     */
+    public void testFailedRelocation() {
+        ClusterState clusterState = createInitialClusterState();
+        clusterState = startInitializingShardsAndReroute(strategy, clusterState);
+
+        int retries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY);
+
+        // shard could be relocated while retries are not exhausted
+        for (int i = 0; i < retries; i++) {
+            clusterState = withRoutingAllocation(clusterState, allocation -> {
+                var source = allocation.routingTable().index("idx").shard(0).shard(0);
+                var targetNodeId = Objects.equals(source.currentNodeId(), "node1") ? "node2" : "node1";
+                assertThat(decider.canAllocate(source, allocation).type(), equalTo(Decision.Type.YES));
+                allocation.routingNodes().relocateShard(source, targetNodeId, 0, allocation.changes());
+            });
+            clusterState = applyShardFailure(
+                clusterState,
+                clusterState.getRoutingTable().index("idx").shard(0).shard(0).getTargetRelocatingShard(),
+                "boom" + i
+            );
+
+            var relocationFailureInfo = clusterState.getRoutingTable().index("idx").shard(0).shard(0).relocationFailureInfo();
+            assertThat(relocationFailureInfo.failedRelocations(), equalTo(i + 1));
+        }
+
+        // shard could not be relocated when retries are exhausted
+        withRoutingAllocation(clusterState, allocation -> {
+            var source = allocation.routingTable().index("idx").shard(0).shard(0);
+            assertThat(decider.canAllocate(source, allocation).type(), equalTo(Decision.Type.NO));
+        });
+
+        // manually reset retry count
+        clusterState = strategy.reroute(clusterState, new AllocationCommands(), false, true).clusterState();
+
+        // shard could be relocated again
+        withRoutingAllocation(clusterState, allocation -> {
+            var source = allocation.routingTable().index("idx").shard(0).shard(0);
+            assertThat(decider.canAllocate(source, allocation).type(), equalTo(Decision.Type.YES));
+        });
+    }
+
+    /**
+     * Fails a single shard copy through the allocation service under test and returns the resulting cluster state.
+     * The failure is reported with {@code message}, a placeholder {@link RuntimeException}, and a randomly chosen
+     * value for the final boolean argument of {@link FailedShard}; no stale shards are passed.
+     */
+    private ClusterState applyShardFailure(ClusterState clusterState, ShardRouting shardRouting, String message) {
+        return strategy.applyFailedShards(
+            clusterState,
+            List.of(new FailedShard(shardRouting, message, new RuntimeException("test"), randomBoolean())),
+            List.of()
         );
     }
 
+    /**
+     * Runs {@code block} against a fresh {@link RoutingAllocation} and returns the cluster state that results from it.
+     * The allocation is built over the mutable routing nodes obtained from {@code clusterState}, with empty cluster
+     * info and empty snapshot shard sizes; any routing changes made by the block are folded back into a cluster
+     * state by {@code updateClusterState}.
+     */
+    private static ClusterState withRoutingAllocation(ClusterState clusterState, Consumer<RoutingAllocation> block) {
+        RoutingAllocation allocation = new RoutingAllocation(
+            null,
+            clusterState.mutableRoutingNodes(),
+            clusterState,
+            ClusterInfo.EMPTY,
+            SnapshotShardSizeInfo.EMPTY,
+            0L
+        );
+        block.accept(allocation);
+        return updateClusterState(clusterState, allocation);
+    }
+
+    /**
+     * Converts the outcome of {@code allocation} into a cluster state.
+     * Returns {@code state} itself when the allocation reports no routing-node changes. Otherwise a routing table is
+     * built from the allocation's routing nodes (passing the current table version), the allocation derives updated
+     * metadata from it, and a copy of {@code state} with both replaced is returned.
+     */
+    private static ClusterState updateClusterState(ClusterState state, RoutingAllocation allocation) {
+        assert allocation.metadata() == state.metadata();
+        if (allocation.routingNodesChanged() == false) {
+            return state;
+        }
+        final RoutingTable newRoutingTable = RoutingTable.of(state.routingTable().version(), allocation.routingNodes());
+        final Metadata newMetadata = allocation.updateMetadataWithRoutingChanges(newRoutingTable);
+        assert newRoutingTable.validate(newMetadata);
+
+        return state.copyAndUpdate(builder -> builder.routingTable(newRoutingTable).metadata(newMetadata));
+    }
+
     private RoutingAllocation newRoutingAllocation(ClusterState clusterState) {
         final var routingAllocation = new RoutingAllocation(null, clusterState, null, null, 0);
         if (randomBoolean()) {
@@ -283,5 +313,4 @@ public class MaxRetryAllocationDeciderTests extends ESAllocationTestCase {
         }
         return routingAllocation;
     }
-
 }

+ 2 - 0
test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java

@@ -48,6 +48,7 @@ public class ShardRoutingHelper {
             ShardRoutingState.INITIALIZING,
             recoverySource,
             new UnassignedInfo(UnassignedInfo.Reason.REINITIALIZED, null),
+            RelocationFailureInfo.NO_FAILURES,
             copy.allocationId(),
             copy.getExpectedShardSize()
         );
@@ -66,6 +67,7 @@ public class ShardRoutingHelper {
             routing.state(),
             recoverySource,
             routing.unassignedInfo(),
+            routing.relocationFailureInfo(),
             routing.allocationId(),
             routing.getExpectedShardSize()
         );

+ 13 - 0
test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java

@@ -22,6 +22,7 @@ import static org.apache.lucene.tests.util.LuceneTestCase.random;
 import static org.elasticsearch.test.ESTestCase.randomAlphaOfLength;
 import static org.elasticsearch.test.ESTestCase.randomBoolean;
 import static org.elasticsearch.test.ESTestCase.randomFrom;
+import static org.elasticsearch.test.ESTestCase.randomIntBetween;
 
 /**
  * A helper that allows to create shard routing instances within tests, while not requiring to expose
@@ -42,6 +43,7 @@ public class TestShardRouting {
             state,
             buildRecoveryTarget(primary, state),
             buildUnassignedInfo(state),
+            buildRelocationFailureInfo(state),
             buildAllocationId(state),
             -1
         );
@@ -62,6 +64,7 @@ public class TestShardRouting {
             state,
             recoverySource,
             buildUnassignedInfo(state),
+            buildRelocationFailureInfo(state),
             buildAllocationId(state),
             -1
         );
@@ -99,6 +102,7 @@ public class TestShardRouting {
             state,
             buildRecoveryTarget(primary, state),
             buildUnassignedInfo(state),
+            buildRelocationFailureInfo(state),
             buildAllocationId(state),
             -1
         );
@@ -139,6 +143,7 @@ public class TestShardRouting {
             state,
             buildRecoveryTarget(primary, state),
             buildUnassignedInfo(state),
+            buildRelocationFailureInfo(state),
             allocationId,
             -1
         );
@@ -179,6 +184,7 @@ public class TestShardRouting {
             state,
             buildRecoveryTarget(primary, state),
             unassignedInfo,
+            buildRelocationFailureInfo(state),
             buildAllocationId(state),
             -1
         );
@@ -230,6 +236,13 @@ public class TestShardRouting {
         };
     }
 
+    /**
+     * Picks relocation failure info for a test shard in the given state: unassigned, initializing and started shards
+     * always get {@link RelocationFailureInfo#NO_FAILURES}, while relocating shards randomly get either no failures
+     * or between 1 and 10 failed attempts.
+     */
+    private static RelocationFailureInfo buildRelocationFailureInfo(ShardRoutingState state) {
+        return switch (state) {
+            case UNASSIGNED, INITIALIZING, STARTED -> RelocationFailureInfo.NO_FAILURES;
+            case RELOCATING -> randomBoolean() ? RelocationFailureInfo.NO_FAILURES : new RelocationFailureInfo(randomIntBetween(1, 10));
+        };
+    }
+
     public static UnassignedInfo randomUnassignedInfo(String message) {
         UnassignedInfo.Reason reason = randomFrom(UnassignedInfo.Reason.values());
         String lastAllocatedNodeId = null;