浏览代码

ILM Make the `check-rollover-ready` step retryable (#48256)

This adds the infrastructure to be able to retry the execution of retryable
steps and makes the `check-rollover-ready` retryable as an initial step to
make the rollover action more resilient to transient errors.
Andrei Dan 6 年之前
父节点
当前提交
454020ac8a
共有 17 个文件被更改,包括 536 次插入95 次删除
  1. 16 7
      docs/reference/ilm/apis/explain.asciidoc
  2. 51 9
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/IndexLifecycleExplainResponse.java
  3. 49 7
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/LifecycleExecutionState.java
  4. 1 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/LifecycleSettings.java
  5. 7 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/Step.java
  6. 5 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForRolloverReadyStep.java
  7. 12 2
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/IndexLifecycleExplainResponseTests.java
  8. 1 1
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/RolloverActionTests.java
  9. 62 16
      x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java
  10. 1 1
      x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTask.java
  11. 105 28
      x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java
  12. 2 2
      x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleService.java
  13. 7 2
      x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTask.java
  14. 2 0
      x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/action/TransportExplainLifecycleAction.java
  15. 1 1
      x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/action/TransportRetryAction.java
  16. 203 13
      x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java
  17. 11 6
      x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTaskTests.java

+ 16 - 7
docs/reference/ilm/apis/explain.asciidoc

@@ -239,8 +239,11 @@ information for the step that's being performed on the index.
 
 
 If the index is in the ERROR step, something went wrong while executing a
 If the index is in the ERROR step, something went wrong while executing a
 step in the policy and you will need to take action for the index to proceed
 step in the policy and you will need to take action for the index to proceed
-to the next step. To help you diagnose the problem, the explain response shows
-the step that failed and the step info provides information about the error.
+to the next step. Some steps are safe to automatically be retried in certain
+circumstances. To help you diagnose the problem, the explain response shows
+the step that failed, the step info which provides information about the error,
+and information about the retry attempts executed for the failed step if it's
+the case.
 
 
 [source,console-result]
 [source,console-result]
 --------------------------------------------------
 --------------------------------------------------
@@ -262,10 +265,12 @@ the step that failed and the step info provides information about the error.
       "step": "ERROR",
       "step": "ERROR",
       "step_time_millis": 1538475653317,
       "step_time_millis": 1538475653317,
       "step_time": "2018-10-15T13:45:22.577Z",
       "step_time": "2018-10-15T13:45:22.577Z",
-      "failed_step": "attempt-rollover", <1>
-      "step_info": { <2>
-        "type": "resource_already_exists_exception",
-        "reason": "index [test-000057/H7lF9n36Rzqa-KfKcnGQMg] already exists",
+      "failed_step": "check-rollover-ready", <1>
+      "is_auto_retryable_error": true, <2>
+      "failed_step_retry_count": 1, <3>
+      "step_info": { <4>
+        "type": "cluster_block_exception",
+        "reason": "index [test-000057/H7lF9n36Rzqa-KfKcnGQMg] blocked by: [FORBIDDEN/5/index read-only (api)",
         "index_uuid": "H7lF9n36Rzqa-KfKcnGQMg",
         "index_uuid": "H7lF9n36Rzqa-KfKcnGQMg",
         "index": "test-000057"
         "index": "test-000057"
       },
       },
@@ -290,4 +295,8 @@ the step that failed and the step info provides information about the error.
 // TESTRESPONSE[skip:not possible to get the cluster into this state in a docs test]
 // TESTRESPONSE[skip:not possible to get the cluster into this state in a docs test]
 
 
 <1> The step that caused the error
 <1> The step that caused the error
-<2> What went wrong
+<2> Indicates if retrying the failed step can overcome the error. If this
+is true, ILM will retry the failed step automatically.
+<3> Shows the number of attempted automatic retries to execute the failed
+step.
+<4> What went wrong

+ 51 - 9
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/IndexLifecycleExplainResponse.java

@@ -6,6 +6,7 @@
 
 
 package org.elasticsearch.xpack.core.ilm;
 package org.elasticsearch.xpack.core.ilm;
 
 
+import org.elasticsearch.Version;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.bytes.BytesReference;
@@ -34,6 +35,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
     private static final ParseField ACTION_FIELD = new ParseField("action");
     private static final ParseField ACTION_FIELD = new ParseField("action");
     private static final ParseField STEP_FIELD = new ParseField("step");
     private static final ParseField STEP_FIELD = new ParseField("step");
     private static final ParseField FAILED_STEP_FIELD = new ParseField("failed_step");
     private static final ParseField FAILED_STEP_FIELD = new ParseField("failed_step");
+    private static final ParseField IS_AUTO_RETRYABLE_ERROR_FIELD = new ParseField("is_auto_retryable_error");
+    private static final ParseField FAILED_STEP_RETRY_COUNT_FIELD = new ParseField("failed_step_retry_count");
     private static final ParseField PHASE_TIME_MILLIS_FIELD = new ParseField("phase_time_millis");
     private static final ParseField PHASE_TIME_MILLIS_FIELD = new ParseField("phase_time_millis");
     private static final ParseField PHASE_TIME_FIELD = new ParseField("phase_time");
     private static final ParseField PHASE_TIME_FIELD = new ParseField("phase_time");
     private static final ParseField ACTION_TIME_MILLIS_FIELD = new ParseField("action_time_millis");
     private static final ParseField ACTION_TIME_MILLIS_FIELD = new ParseField("action_time_millis");
@@ -55,6 +58,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
                     (String) a[5],
                     (String) a[5],
                     (String) a[6],
                     (String) a[6],
                     (String) a[7],
                     (String) a[7],
+                    (Boolean) a[14],
+                    (Integer) a[15],
                     (Long) (a[8]),
                     (Long) (a[8]),
                     (Long) (a[9]),
                     (Long) (a[9]),
                     (Long) (a[10]),
                     (Long) (a[10]),
@@ -82,6 +87,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
         PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> PhaseExecutionInfo.parse(p, ""),
         PARSER.declareObject(ConstructingObjectParser.optionalConstructorArg(), (p, c) -> PhaseExecutionInfo.parse(p, ""),
             PHASE_EXECUTION_INFO);
             PHASE_EXECUTION_INFO);
         PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), AGE_FIELD);
         PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), AGE_FIELD);
+        PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), IS_AUTO_RETRYABLE_ERROR_FIELD);
+        PARSER.declareInt(ConstructingObjectParser.optionalConstructorArg(), FAILED_STEP_RETRY_COUNT_FIELD);
     }
     }
 
 
     private final String index;
     private final String index;
@@ -97,21 +104,25 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
     private final boolean managedByILM;
     private final boolean managedByILM;
     private final BytesReference stepInfo;
     private final BytesReference stepInfo;
     private final PhaseExecutionInfo phaseExecutionInfo;
     private final PhaseExecutionInfo phaseExecutionInfo;
+    private final Boolean isAutoRetryableError;
+    private final Integer failedStepRetryCount;
 
 
     public static IndexLifecycleExplainResponse newManagedIndexResponse(String index, String policyName, Long lifecycleDate,
     public static IndexLifecycleExplainResponse newManagedIndexResponse(String index, String policyName, Long lifecycleDate,
-            String phase, String action, String step, String failedStep, Long phaseTime, Long actionTime, Long stepTime,
-            BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
-        return new IndexLifecycleExplainResponse(index, true, policyName, lifecycleDate, phase, action, step, failedStep, phaseTime,
-                actionTime, stepTime, stepInfo, phaseExecutionInfo);
+            String phase, String action, String step, String failedStep, Boolean isAutoRetryableError, Integer failedStepRetryCount,
+            Long phaseTime, Long actionTime, Long stepTime, BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
+        return new IndexLifecycleExplainResponse(index, true, policyName, lifecycleDate, phase, action, step, failedStep,
+            isAutoRetryableError, failedStepRetryCount, phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
     }
     }
 
 
     public static IndexLifecycleExplainResponse newUnmanagedIndexResponse(String index) {
     public static IndexLifecycleExplainResponse newUnmanagedIndexResponse(String index) {
-        return new IndexLifecycleExplainResponse(index, false, null, null, null, null, null, null, null, null, null, null, null);
+        return new IndexLifecycleExplainResponse(index, false, null, null, null, null, null, null, null, null, null, null, null, null,
+            null);
     }
     }
 
 
     private IndexLifecycleExplainResponse(String index, boolean managedByILM, String policyName, Long lifecycleDate,
     private IndexLifecycleExplainResponse(String index, boolean managedByILM, String policyName, Long lifecycleDate,
-                                          String phase, String action, String step, String failedStep, Long phaseTime, Long actionTime,
-                                          Long stepTime, BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
+                                          String phase, String action, String step, String failedStep, Boolean isAutoRetryableError,
+                                          Integer failedStepRetryCount, Long phaseTime, Long actionTime, Long stepTime,
+                                          BytesReference stepInfo, PhaseExecutionInfo phaseExecutionInfo) {
         if (managedByILM) {
         if (managedByILM) {
             if (policyName == null) {
             if (policyName == null) {
                 throw new IllegalArgumentException("[" + POLICY_NAME_FIELD.getPreferredName() + "] cannot be null for managed index");
                 throw new IllegalArgumentException("[" + POLICY_NAME_FIELD.getPreferredName() + "] cannot be null for managed index");
@@ -142,6 +153,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
         this.actionTime = actionTime;
         this.actionTime = actionTime;
         this.stepTime = stepTime;
         this.stepTime = stepTime;
         this.failedStep = failedStep;
         this.failedStep = failedStep;
+        this.isAutoRetryableError = isAutoRetryableError;
+        this.failedStepRetryCount = failedStepRetryCount;
         this.stepInfo = stepInfo;
         this.stepInfo = stepInfo;
         this.phaseExecutionInfo = phaseExecutionInfo;
         this.phaseExecutionInfo = phaseExecutionInfo;
     }
     }
@@ -161,6 +174,13 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
             stepTime = in.readOptionalLong();
             stepTime = in.readOptionalLong();
             stepInfo = in.readOptionalBytesReference();
             stepInfo = in.readOptionalBytesReference();
             phaseExecutionInfo = in.readOptionalWriteable(PhaseExecutionInfo::new);
             phaseExecutionInfo = in.readOptionalWriteable(PhaseExecutionInfo::new);
+            if (in.getVersion().onOrAfter(Version.V_8_0_0)) {
+                isAutoRetryableError = in.readOptionalBoolean();
+                failedStepRetryCount = in.readOptionalVInt();
+            } else {
+                isAutoRetryableError = null;
+                failedStepRetryCount = null;
+            }
         } else {
         } else {
             policyName = null;
             policyName = null;
             lifecycleDate = null;
             lifecycleDate = null;
@@ -168,6 +188,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
             action = null;
             action = null;
             step = null;
             step = null;
             failedStep = null;
             failedStep = null;
+            isAutoRetryableError = null;
+            failedStepRetryCount = null;
             phaseTime = null;
             phaseTime = null;
             actionTime = null;
             actionTime = null;
             stepTime = null;
             stepTime = null;
@@ -192,6 +214,10 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
             out.writeOptionalLong(stepTime);
             out.writeOptionalLong(stepTime);
             out.writeOptionalBytesReference(stepInfo);
             out.writeOptionalBytesReference(stepInfo);
             out.writeOptionalWriteable(phaseExecutionInfo);
             out.writeOptionalWriteable(phaseExecutionInfo);
+            if (out.getVersion().onOrAfter(Version.V_8_0_0)) {
+                out.writeOptionalBoolean(isAutoRetryableError);
+                out.writeOptionalVInt(failedStepRetryCount);
+            }
         }
         }
     }
     }
 
 
@@ -247,6 +273,14 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
         return phaseExecutionInfo;
         return phaseExecutionInfo;
     }
     }
 
 
+    public Boolean isAutoRetryableError() {
+        return isAutoRetryableError;
+    }
+
+    public Integer getFailedStepRetryCount() {
+        return failedStepRetryCount;
+    }
+
     public TimeValue getAge() {
     public TimeValue getAge() {
         if (lifecycleDate == null) {
         if (lifecycleDate == null) {
             return TimeValue.MINUS_ONE;
             return TimeValue.MINUS_ONE;
@@ -287,6 +321,12 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
             if (Strings.hasLength(failedStep)) {
             if (Strings.hasLength(failedStep)) {
                 builder.field(FAILED_STEP_FIELD.getPreferredName(), failedStep);
                 builder.field(FAILED_STEP_FIELD.getPreferredName(), failedStep);
             }
             }
+            if (isAutoRetryableError != null) {
+                builder.field(IS_AUTO_RETRYABLE_ERROR_FIELD.getPreferredName(), isAutoRetryableError);
+            }
+            if (failedStepRetryCount != null) {
+                builder.field(FAILED_STEP_RETRY_COUNT_FIELD.getPreferredName(), failedStepRetryCount);
+            }
             if (stepInfo != null && stepInfo.length() > 0) {
             if (stepInfo != null && stepInfo.length() > 0) {
                 builder.rawField(STEP_INFO_FIELD.getPreferredName(), stepInfo.streamInput(), XContentType.JSON);
                 builder.rawField(STEP_INFO_FIELD.getPreferredName(), stepInfo.streamInput(), XContentType.JSON);
             }
             }
@@ -300,8 +340,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
 
 
     @Override
     @Override
     public int hashCode() {
     public int hashCode() {
-        return Objects.hash(index, managedByILM, policyName, lifecycleDate, phase, action, step, failedStep, phaseTime, actionTime,
-                stepTime, stepInfo, phaseExecutionInfo);
+        return Objects.hash(index, managedByILM, policyName, lifecycleDate, phase, action, step, failedStep, isAutoRetryableError,
+            failedStepRetryCount, phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
     }
     }
 
 
     @Override
     @Override
@@ -321,6 +361,8 @@ public class IndexLifecycleExplainResponse implements ToXContentObject, Writeabl
                 Objects.equals(action, other.action) &&
                 Objects.equals(action, other.action) &&
                 Objects.equals(step, other.step) &&
                 Objects.equals(step, other.step) &&
                 Objects.equals(failedStep, other.failedStep) &&
                 Objects.equals(failedStep, other.failedStep) &&
+                Objects.equals(isAutoRetryableError, other.isAutoRetryableError) &&
+                Objects.equals(failedStepRetryCount, other.failedStepRetryCount) &&
                 Objects.equals(phaseTime, other.phaseTime) &&
                 Objects.equals(phaseTime, other.phaseTime) &&
                 Objects.equals(actionTime, other.actionTime) &&
                 Objects.equals(actionTime, other.actionTime) &&
                 Objects.equals(stepTime, other.stepTime) &&
                 Objects.equals(stepTime, other.stepTime) &&

+ 49 - 7
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/LifecycleExecutionState.java

@@ -30,6 +30,8 @@ public class LifecycleExecutionState {
     private static final String ACTION_TIME = "action_time";
     private static final String ACTION_TIME = "action_time";
     private static final String STEP_TIME = "step_time";
     private static final String STEP_TIME = "step_time";
     private static final String FAILED_STEP = "failed_step";
     private static final String FAILED_STEP = "failed_step";
+    private static final String IS_AUTO_RETRYABLE_ERROR = "is_auto_retryable_error";
+    private static final String FAILED_STEP_RETRY_COUNT = "failed_step_retry_count";
     private static final String STEP_INFO = "step_info";
     private static final String STEP_INFO = "step_info";
     private static final String PHASE_DEFINITION = "phase_definition";
     private static final String PHASE_DEFINITION = "phase_definition";
 
 
@@ -37,6 +39,8 @@ public class LifecycleExecutionState {
     private final String action;
     private final String action;
     private final String step;
     private final String step;
     private final String failedStep;
     private final String failedStep;
+    private final Boolean isAutoRetryableError;
+    private final Integer failedStepRetryCount;
     private final String stepInfo;
     private final String stepInfo;
     private final String phaseDefinition;
     private final String phaseDefinition;
     private final Long lifecycleDate;
     private final Long lifecycleDate;
@@ -44,13 +48,15 @@ public class LifecycleExecutionState {
     private final Long actionTime;
     private final Long actionTime;
     private final Long stepTime;
     private final Long stepTime;
 
 
-    private LifecycleExecutionState(String phase, String action, String step, String failedStep,
-                                    String stepInfo, String phaseDefinition, Long lifecycleDate,
+    private LifecycleExecutionState(String phase, String action, String step, String failedStep, Boolean isAutoRetryableError,
+                                    Integer failedStepRetryCount, String stepInfo, String phaseDefinition, Long lifecycleDate,
                                     Long phaseTime, Long actionTime, Long stepTime) {
                                     Long phaseTime, Long actionTime, Long stepTime) {
         this.phase = phase;
         this.phase = phase;
         this.action = action;
         this.action = action;
         this.step = step;
         this.step = step;
         this.failedStep = failedStep;
         this.failedStep = failedStep;
+        this.isAutoRetryableError = isAutoRetryableError;
+        this.failedStepRetryCount = failedStepRetryCount;
         this.stepInfo = stepInfo;
         this.stepInfo = stepInfo;
         this.phaseDefinition = phaseDefinition;
         this.phaseDefinition = phaseDefinition;
         this.lifecycleDate = lifecycleDate;
         this.lifecycleDate = lifecycleDate;
@@ -82,6 +88,8 @@ public class LifecycleExecutionState {
             .setAction(state.action)
             .setAction(state.action)
             .setStep(state.step)
             .setStep(state.step)
             .setFailedStep(state.failedStep)
             .setFailedStep(state.failedStep)
+            .setIsAutoRetryableError(state.isAutoRetryableError)
+            .setFailedStepRetryCount(state.failedStepRetryCount)
             .setStepInfo(state.stepInfo)
             .setStepInfo(state.stepInfo)
             .setPhaseDefinition(state.phaseDefinition)
             .setPhaseDefinition(state.phaseDefinition)
             .setIndexCreationDate(state.lifecycleDate)
             .setIndexCreationDate(state.lifecycleDate)
@@ -104,6 +112,12 @@ public class LifecycleExecutionState {
         if (customData.containsKey(FAILED_STEP)) {
         if (customData.containsKey(FAILED_STEP)) {
             builder.setFailedStep(customData.get(FAILED_STEP));
             builder.setFailedStep(customData.get(FAILED_STEP));
         }
         }
+        if (customData.containsKey(IS_AUTO_RETRYABLE_ERROR)) {
+            builder.setIsAutoRetryableError(Boolean.parseBoolean(customData.get(IS_AUTO_RETRYABLE_ERROR)));
+        }
+        if (customData.containsKey(FAILED_STEP_RETRY_COUNT)) {
+            builder.setFailedStepRetryCount(Integer.parseInt(customData.get(FAILED_STEP_RETRY_COUNT)));
+        }
         if (customData.containsKey(STEP_INFO)) {
         if (customData.containsKey(STEP_INFO)) {
             builder.setStepInfo(customData.get(STEP_INFO));
             builder.setStepInfo(customData.get(STEP_INFO));
         }
         }
@@ -164,6 +178,12 @@ public class LifecycleExecutionState {
         if (failedStep != null) {
         if (failedStep != null) {
             result.put(FAILED_STEP, failedStep);
             result.put(FAILED_STEP, failedStep);
         }
         }
+        if (isAutoRetryableError != null) {
+            result.put(IS_AUTO_RETRYABLE_ERROR, String.valueOf(isAutoRetryableError));
+        }
+        if (failedStepRetryCount != null) {
+            result.put(FAILED_STEP_RETRY_COUNT, String.valueOf(failedStepRetryCount));
+        }
         if (stepInfo != null) {
         if (stepInfo != null) {
             result.put(STEP_INFO, stepInfo);
             result.put(STEP_INFO, stepInfo);
         }
         }
@@ -201,6 +221,14 @@ public class LifecycleExecutionState {
         return failedStep;
         return failedStep;
     }
     }
 
 
+    public Boolean isAutoRetryableError() {
+        return isAutoRetryableError;
+    }
+
+    public Integer getFailedStepRetryCount() {
+        return failedStepRetryCount;
+    }
+
     public String getStepInfo() {
     public String getStepInfo() {
         return stepInfo;
         return stepInfo;
     }
     }
@@ -230,7 +258,7 @@ public class LifecycleExecutionState {
         if (this == o) return true;
         if (this == o) return true;
         if (o == null || getClass() != o.getClass()) return false;
         if (o == null || getClass() != o.getClass()) return false;
         LifecycleExecutionState that = (LifecycleExecutionState) o;
         LifecycleExecutionState that = (LifecycleExecutionState) o;
-        return Objects.equals(getLifecycleDate(),that.getLifecycleDate()) &&
+        return Objects.equals(getLifecycleDate(), that.getLifecycleDate()) &&
             Objects.equals(getPhaseTime(), that.getPhaseTime()) &&
             Objects.equals(getPhaseTime(), that.getPhaseTime()) &&
             Objects.equals(getActionTime(), that.getActionTime()) &&
             Objects.equals(getActionTime(), that.getActionTime()) &&
             Objects.equals(getStepTime(), that.getStepTime()) &&
             Objects.equals(getStepTime(), that.getStepTime()) &&
@@ -238,14 +266,16 @@ public class LifecycleExecutionState {
             Objects.equals(getAction(), that.getAction()) &&
             Objects.equals(getAction(), that.getAction()) &&
             Objects.equals(getStep(), that.getStep()) &&
             Objects.equals(getStep(), that.getStep()) &&
             Objects.equals(getFailedStep(), that.getFailedStep()) &&
             Objects.equals(getFailedStep(), that.getFailedStep()) &&
+            Objects.equals(isAutoRetryableError(), that.isAutoRetryableError()) &&
+            Objects.equals(getFailedStepRetryCount(), that.getFailedStepRetryCount()) &&
             Objects.equals(getStepInfo(), that.getStepInfo()) &&
             Objects.equals(getStepInfo(), that.getStepInfo()) &&
             Objects.equals(getPhaseDefinition(), that.getPhaseDefinition());
             Objects.equals(getPhaseDefinition(), that.getPhaseDefinition());
     }
     }
 
 
     @Override
     @Override
     public int hashCode() {
     public int hashCode() {
-        return Objects.hash(getPhase(), getAction(), getStep(), getFailedStep(), getStepInfo(), getPhaseDefinition(),
-            getLifecycleDate(), getPhaseTime(), getActionTime(), getStepTime());
+        return Objects.hash(getPhase(), getAction(), getStep(), getFailedStep(), isAutoRetryableError(), getFailedStepRetryCount(),
+            getStepInfo(), getPhaseDefinition(), getLifecycleDate(), getPhaseTime(), getActionTime(), getStepTime());
     }
     }
 
 
     public static class Builder {
     public static class Builder {
@@ -259,6 +289,8 @@ public class LifecycleExecutionState {
         private Long phaseTime;
         private Long phaseTime;
         private Long actionTime;
         private Long actionTime;
         private Long stepTime;
         private Long stepTime;
+        private Boolean isAutoRetryableError;
+        private Integer failedStepRetryCount;
 
 
         public Builder setPhase(String phase) {
         public Builder setPhase(String phase) {
             this.phase = phase;
             this.phase = phase;
@@ -310,9 +342,19 @@ public class LifecycleExecutionState {
             return this;
             return this;
         }
         }
 
 
+        public Builder setIsAutoRetryableError(Boolean isAutoRetryableError) {
+            this.isAutoRetryableError = isAutoRetryableError;
+            return this;
+        }
+
+        public Builder setFailedStepRetryCount(Integer failedStepRetryCount) {
+            this.failedStepRetryCount = failedStepRetryCount;
+            return this;
+        }
+
         public LifecycleExecutionState build() {
         public LifecycleExecutionState build() {
-            return new LifecycleExecutionState(phase, action, step, failedStep, stepInfo, phaseDefinition, indexCreationDate,
-                phaseTime, actionTime, stepTime);
+            return new LifecycleExecutionState(phase, action, step, failedStep, isAutoRetryableError, failedStepRetryCount, stepInfo,
+                phaseDefinition, indexCreationDate, phaseTime, actionTime, stepTime);
         }
         }
     }
     }
 
 

+ 1 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/LifecycleSettings.java

@@ -36,6 +36,7 @@ public class LifecycleSettings {
     public static final Setting<Boolean> LIFECYCLE_PARSE_ORIGINATION_DATE_SETTING = Setting.boolSetting(LIFECYCLE_PARSE_ORIGINATION_DATE,
     public static final Setting<Boolean> LIFECYCLE_PARSE_ORIGINATION_DATE_SETTING = Setting.boolSetting(LIFECYCLE_PARSE_ORIGINATION_DATE,
         false, Setting.Property.Dynamic, Setting.Property.IndexScope);
         false, Setting.Property.Dynamic, Setting.Property.IndexScope);
 
 
+
     public static final Setting<Boolean> SLM_HISTORY_INDEX_ENABLED_SETTING = Setting.boolSetting(SLM_HISTORY_INDEX_ENABLED, true,
     public static final Setting<Boolean> SLM_HISTORY_INDEX_ENABLED_SETTING = Setting.boolSetting(SLM_HISTORY_INDEX_ENABLED, true,
         Setting.Property.NodeScope);
         Setting.Property.NodeScope);
     public static final Setting<String> SLM_RETENTION_SCHEDULE_SETTING = Setting.simpleString(SLM_RETENTION_SCHEDULE,
     public static final Setting<String> SLM_RETENTION_SCHEDULE_SETTING = Setting.simpleString(SLM_RETENTION_SCHEDULE,

+ 7 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/Step.java

@@ -38,6 +38,13 @@ public abstract class Step {
         return nextStepKey;
         return nextStepKey;
     }
     }
 
 
+    /**
+     * Indicates if the step can be automatically retried when it encounters an execution error.
+     */
+    public boolean isRetryable() {
+        return false;
+    }
+
     @Override
     @Override
     public int hashCode() {
     public int hashCode() {
         return Objects.hash(key, nextStepKey);
         return Objects.hash(key, nextStepKey);

+ 5 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/WaitForRolloverReadyStep.java

@@ -42,6 +42,11 @@ public class WaitForRolloverReadyStep extends AsyncWaitStep {
         this.maxDocs = maxDocs;
         this.maxDocs = maxDocs;
     }
     }
 
 
+    @Override
+    public boolean isRetryable() {
+        return true;
+    }
+
     @Override
     @Override
     public void evaluateCondition(IndexMetaData indexMetaData, Listener listener) {
     public void evaluateCondition(IndexMetaData indexMetaData, Listener listener) {
         String rolloverAlias = RolloverAction.LIFECYCLE_ROLLOVER_ALIAS_SETTING.get(indexMetaData.getSettings());
         String rolloverAlias = RolloverAction.LIFECYCLE_ROLLOVER_ALIAS_SETTING.get(indexMetaData.getSettings());

+ 12 - 2
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/IndexLifecycleExplainResponseTests.java

@@ -52,6 +52,8 @@ public class IndexLifecycleExplainResponseTests extends AbstractSerializingTestC
             stepNull ? null : randomAlphaOfLength(10),
             stepNull ? null : randomAlphaOfLength(10),
             stepNull ? null : randomAlphaOfLength(10),
             stepNull ? null : randomAlphaOfLength(10),
             randomBoolean() ? null : randomAlphaOfLength(10),
             randomBoolean() ? null : randomAlphaOfLength(10),
+            stepNull ? null : randomBoolean(),
+            stepNull ? null : randomInt(10),
             stepNull ? null : randomNonNegativeLong(),
             stepNull ? null : randomNonNegativeLong(),
             stepNull ? null : randomNonNegativeLong(),
             stepNull ? null : randomNonNegativeLong(),
             stepNull ? null : randomNonNegativeLong(),
             stepNull ? null : randomNonNegativeLong(),
@@ -69,6 +71,8 @@ public class IndexLifecycleExplainResponseTests extends AbstractSerializingTestC
                 (numNull == 2) ? null : randomAlphaOfLength(10),
                 (numNull == 2) ? null : randomAlphaOfLength(10),
                 (numNull == 3) ? null : randomAlphaOfLength(10),
                 (numNull == 3) ? null : randomAlphaOfLength(10),
                 randomBoolean() ? null : randomAlphaOfLength(10),
                 randomBoolean() ? null : randomAlphaOfLength(10),
+                randomBoolean() ? null : randomBoolean(),
+                randomBoolean() ? null : randomInt(10),
                 randomBoolean() ? null : randomNonNegativeLong(),
                 randomBoolean() ? null : randomNonNegativeLong(),
                 randomBoolean() ? null : randomNonNegativeLong(),
                 randomBoolean() ? null : randomNonNegativeLong(),
                 randomBoolean() ? null : randomNonNegativeLong(),
                 randomBoolean() ? null : randomNonNegativeLong(),
@@ -106,6 +110,8 @@ public class IndexLifecycleExplainResponseTests extends AbstractSerializingTestC
         String action = instance.getAction();
         String action = instance.getAction();
         String step = instance.getStep();
         String step = instance.getStep();
         String failedStep = instance.getFailedStep();
         String failedStep = instance.getFailedStep();
+        Boolean isAutoRetryableError = instance.isAutoRetryableError();
+        Integer failedStepRetryCount = instance.getFailedStepRetryCount();
         Long policyTime = instance.getLifecycleDate();
         Long policyTime = instance.getLifecycleDate();
         Long phaseTime = instance.getPhaseTime();
         Long phaseTime = instance.getPhaseTime();
         Long actionTime = instance.getActionTime();
         Long actionTime = instance.getActionTime();
@@ -114,7 +120,7 @@ public class IndexLifecycleExplainResponseTests extends AbstractSerializingTestC
         BytesReference stepInfo = instance.getStepInfo();
         BytesReference stepInfo = instance.getStepInfo();
         PhaseExecutionInfo phaseExecutionInfo = instance.getPhaseExecutionInfo();
         PhaseExecutionInfo phaseExecutionInfo = instance.getPhaseExecutionInfo();
         if (managed) {
         if (managed) {
-            switch (between(0, 10)) {
+            switch (between(0, 11)) {
             case 0:
             case 0:
                 index = index + randomAlphaOfLengthBetween(1, 5);
                 index = index + randomAlphaOfLengthBetween(1, 5);
                 break;
                 break;
@@ -162,11 +168,15 @@ public class IndexLifecycleExplainResponseTests extends AbstractSerializingTestC
                 break;
                 break;
             case 10:
             case 10:
                 return IndexLifecycleExplainResponse.newUnmanagedIndexResponse(index);
                 return IndexLifecycleExplainResponse.newUnmanagedIndexResponse(index);
+            case 11:
+                isAutoRetryableError = true;
+                failedStepRetryCount = randomValueOtherThan(failedStepRetryCount, () -> randomInt(10));
+                break;
             default:
             default:
                 throw new AssertionError("Illegal randomisation branch");
                 throw new AssertionError("Illegal randomisation branch");
             }
             }
             return IndexLifecycleExplainResponse.newManagedIndexResponse(index, policy, policyTime, phase, action, step, failedStep,
             return IndexLifecycleExplainResponse.newManagedIndexResponse(index, policy, policyTime, phase, action, step, failedStep,
-                    phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
+                isAutoRetryableError, failedStepRetryCount, phaseTime, actionTime, stepTime, stepInfo, phaseExecutionInfo);
         } else {
         } else {
             switch (between(0, 1)) {
             switch (between(0, 1)) {
             case 0:
             case 0:

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/RolloverActionTests.java

@@ -27,7 +27,7 @@ public class RolloverActionTests extends AbstractActionTestCase<RolloverAction>
         return randomInstance();
         return randomInstance();
     }
     }
 
 
-    static RolloverAction randomInstance() {
+    public static RolloverAction randomInstance() {
         ByteSizeUnit maxSizeUnit = randomFrom(ByteSizeUnit.values());
         ByteSizeUnit maxSizeUnit = randomFrom(ByteSizeUnit.values());
         ByteSizeValue maxSize = randomBoolean() ? null : new ByteSizeValue(randomNonNegativeLong() / maxSizeUnit.toBytes(1), maxSizeUnit);
         ByteSizeValue maxSize = randomBoolean() ? null : new ByteSizeValue(randomNonNegativeLong() / maxSizeUnit.toBytes(1), maxSizeUnit);
         Long maxDocs = randomBoolean() ? null : randomNonNegativeLong();
         Long maxDocs = randomBoolean() ? null : randomNonNegativeLong();

+ 62 - 16
x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java

@@ -60,16 +60,20 @@ import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.hasKey;
 import static org.hamcrest.Matchers.hasKey;
+import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.nullValue;
 import static org.hamcrest.Matchers.nullValue;
 
 
 public class TimeSeriesLifecycleActionsIT extends ESRestTestCase {
 public class TimeSeriesLifecycleActionsIT extends ESRestTestCase {
+    private static final Logger logger = LogManager.getLogger(TimeSeriesLifecycleActionsIT.class);
+    private static final String FAILED_STEP_RETRY_COUNT_FIELD = "failed_step_retry_count";
+    private static final String IS_AUTO_RETRYABLE_ERROR_FIELD = "is_auto_retryable_error";
+
     private String index;
     private String index;
     private String policy;
     private String policy;
 
 
-    private static final Logger logger = LogManager.getLogger(TimeSeriesLifecycleActionsIT.class);
-
     @Before
     @Before
     public void refreshIndex() {
     public void refreshIndex() {
         index = randomAlphaOfLength(10).toLowerCase(Locale.ROOT);
         index = randomAlphaOfLength(10).toLowerCase(Locale.ROOT);
@@ -77,7 +81,6 @@ public class TimeSeriesLifecycleActionsIT extends ESRestTestCase {
     }
     }
 
 
     public static void updatePolicy(String indexName, String policy) throws IOException {
     public static void updatePolicy(String indexName, String policy) throws IOException {
-
         Request changePolicyRequest = new Request("PUT", "/" + indexName + "/_settings");
         Request changePolicyRequest = new Request("PUT", "/" + indexName + "/_settings");
         final StringEntity changePolicyEntity = new StringEntity("{ \"index.lifecycle.name\": \"" + policy + "\" }",
         final StringEntity changePolicyEntity = new StringEntity("{ \"index.lifecycle.name\": \"" + policy + "\" }",
                 ContentType.APPLICATION_JSON);
                 ContentType.APPLICATION_JSON);
@@ -292,7 +295,9 @@ public class TimeSeriesLifecycleActionsIT extends ESRestTestCase {
         String secondIndex = index + "-000002";
         String secondIndex = index + "-000002";
         createIndexWithSettings(originalIndex, Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
         createIndexWithSettings(originalIndex, Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-            .put(RolloverAction.LIFECYCLE_ROLLOVER_ALIAS, "alias"));
+            .put(RolloverAction.LIFECYCLE_ROLLOVER_ALIAS, "alias")
+        );
+
         // create policy
         // create policy
         createNewSingletonPolicy("hot", new RolloverAction(null, null, 1L));
         createNewSingletonPolicy("hot", new RolloverAction(null, null, 1L));
         // update policy on index
         // update policy on index
@@ -833,10 +838,12 @@ public class TimeSeriesLifecycleActionsIT extends ESRestTestCase {
         createIndexWithSettings(goodIndex, Settings.builder()
         createIndexWithSettings(goodIndex, Settings.builder()
             .put(RolloverAction.LIFECYCLE_ROLLOVER_ALIAS, "alias")
             .put(RolloverAction.LIFECYCLE_ROLLOVER_ALIAS, "alias")
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-            .put(LifecycleSettings.LIFECYCLE_NAME, policy));
+            .put(LifecycleSettings.LIFECYCLE_NAME, policy)
+        );
         createIndexWithSettingsNoAlias(errorIndex, Settings.builder()
         createIndexWithSettingsNoAlias(errorIndex, Settings.builder()
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
-            .put(LifecycleSettings.LIFECYCLE_NAME, policy));
+            .put(LifecycleSettings.LIFECYCLE_NAME, policy)
+        );
         createIndexWithSettingsNoAlias(nonexistantPolicyIndex, Settings.builder()
         createIndexWithSettingsNoAlias(nonexistantPolicyIndex, Settings.builder()
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
             .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
             .put(LifecycleSettings.LIFECYCLE_NAME, randomValueOtherThan(policy, () -> randomAlphaOfLengthBetween(3,10))));
             .put(LifecycleSettings.LIFECYCLE_NAME, randomValueOtherThan(policy, () -> randomAlphaOfLengthBetween(3,10))));
@@ -854,27 +861,66 @@ public class TimeSeriesLifecycleActionsIT extends ESRestTestCase {
             assertThat(onlyManagedResponse, allOf(hasKey(goodIndex), hasKey(errorIndex), hasKey(nonexistantPolicyIndex)));
             assertThat(onlyManagedResponse, allOf(hasKey(goodIndex), hasKey(errorIndex), hasKey(nonexistantPolicyIndex)));
             assertThat(onlyManagedResponse, not(hasKey(unmanagedIndex)));
             assertThat(onlyManagedResponse, not(hasKey(unmanagedIndex)));
 
 
-            Map<String, Map<String, Object>> onlyErrorsResponse = explain(index + "*", true, randomBoolean());
+            Map<String, Map<String, Object>> onlyErrorsResponse = explain(index + "*", true, true);
             assertNotNull(onlyErrorsResponse);
             assertNotNull(onlyErrorsResponse);
             assertThat(onlyErrorsResponse, allOf(hasKey(errorIndex), hasKey(nonexistantPolicyIndex)));
             assertThat(onlyErrorsResponse, allOf(hasKey(errorIndex), hasKey(nonexistantPolicyIndex)));
             assertThat(onlyErrorsResponse, allOf(not(hasKey(goodIndex)), not(hasKey(unmanagedIndex))));
             assertThat(onlyErrorsResponse, allOf(not(hasKey(goodIndex)), not(hasKey(unmanagedIndex))));
         });
         });
     }
     }
 
 
+    public void testExplainIndexContainsAutomaticRetriesInformation() throws Exception {
+        createFullPolicy(TimeValue.ZERO);
+
+        // create index without alias so the rollover action fails and is retried
+        createIndexWithSettingsNoAlias(index, Settings.builder()
+            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
+            .put(LifecycleSettings.LIFECYCLE_NAME, policy)
+        );
+
+        assertBusy(() -> {
+            Map<String, Object> explainIndex = explainIndex(index);
+            assertThat((Integer) explainIndex.get(FAILED_STEP_RETRY_COUNT_FIELD), greaterThanOrEqualTo(1));
+            assertThat(explainIndex.get(IS_AUTO_RETRYABLE_ERROR_FIELD), is(true));
+        });
+    }
+
+    public void testILMRolloverRetriesOnReadOnlyBlock() throws Exception {
+        String firstIndex = index + "-000001";
+
+        createNewSingletonPolicy("hot", new RolloverAction(null, TimeValue.timeValueSeconds(1), null));
+
+        // create the index as readonly and associate the ILM policy to it
+        createIndexWithSettings(
+            firstIndex,
+            Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
+                .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0)
+                .put(LifecycleSettings.LIFECYCLE_NAME, policy)
+                .put(RolloverAction.LIFECYCLE_ROLLOVER_ALIAS, "alias")
+                .put("index.blocks.read_only", true),
+            true
+        );
+
+        // wait for ILM to start retrying the step
+        assertBusy(() -> assertThat((Integer) explainIndex(firstIndex).get(FAILED_STEP_RETRY_COUNT_FIELD), greaterThanOrEqualTo(1)));
+
+        // remove the read only block
+        Request allowWritesOnIndexSettingUpdate = new Request("PUT", firstIndex + "/_settings");
+        allowWritesOnIndexSettingUpdate.setJsonEntity("{" +
+            "  \"index\": {\n" +
+            "     \"blocks.read_only\" : \"false\" \n" +
+            "  }\n" +
+            "}");
+        client().performRequest(allowWritesOnIndexSettingUpdate);
+
+        // index is not readonly so the ILM should complete successfully
+        assertBusy(() -> assertThat(getStepKeyForIndex(firstIndex), equalTo(TerminalPolicyStep.KEY)));
+    }
+
    public void testILMRolloverOnManuallyRolledIndex() throws Exception {
    public void testILMRolloverOnManuallyRolledIndex() throws Exception {
        String originalIndex = index + "-000001";
        String originalIndex = index + "-000001";
        String secondIndex = index + "-000002";
        String secondIndex = index + "-000002";
        String thirdIndex = index + "-000003";
        String thirdIndex = index + "-000003";
 
 
-       // Configure ILM to run every second
-       Request updateLifecylePollSetting = new Request("PUT", "_cluster/settings");
-       updateLifecylePollSetting.setJsonEntity("{" +
-           "  \"transient\": {\n" +
-                "\"indices.lifecycle.poll_interval\" : \"1s\" \n" +
-           "  }\n" +
-           "}");
-       client().performRequest(updateLifecylePollSetting);
-
        // Set up a policy with rollover
        // Set up a policy with rollover
        createNewSingletonPolicy("hot", new RolloverAction(null, null, 2L));
        createNewSingletonPolicy("hot", new RolloverAction(null, null, 2L));
        Request createIndexTemplate = new Request("PUT", "_template/rolling_indexes");
        Request createIndexTemplate = new Request("PUT", "_template/rolling_indexes");

+ 1 - 1
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTask.java

@@ -192,7 +192,7 @@ public class ExecuteStepsUpdateTask extends ClusterStateUpdateTask {
         logger.error("policy [{}] for index [{}] failed on cluster state step [{}]. Moving to ERROR step", policy, index.getName(),
         logger.error("policy [{}] for index [{}] failed on cluster state step [{}]. Moving to ERROR step", policy, index.getName(),
             currentStepKey);
             currentStepKey);
         MoveToErrorStepUpdateTask moveToErrorStepUpdateTask = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause,
         MoveToErrorStepUpdateTask moveToErrorStepUpdateTask = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause,
-            nowSupplier);
+            nowSupplier, policyStepsRegistry::getStep);
         return moveToErrorStepUpdateTask.execute(state);
         return moveToErrorStepUpdateTask.execute(state);
     }
     }
 }
 }

+ 105 - 28
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java

@@ -12,6 +12,7 @@ import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.support.TransportAction;
 import org.elasticsearch.action.support.TransportAction;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ClusterStateObserver;
 import org.elasticsearch.cluster.ClusterStateObserver;
+import org.elasticsearch.cluster.ClusterStateUpdateTask;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.metadata.MetaData;
 import org.elasticsearch.cluster.metadata.MetaData;
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.cluster.service.ClusterService;
@@ -46,6 +47,7 @@ import org.elasticsearch.xpack.core.ilm.TerminalPolicyStep;
 import java.io.IOException;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.Collections;
 import java.util.List;
 import java.util.List;
+import java.util.function.BiFunction;
 import java.util.function.LongSupplier;
 import java.util.function.LongSupplier;
 
 
 import static org.elasticsearch.ElasticsearchException.REST_EXCEPTION_SKIP_STACK_TRACE;
 import static org.elasticsearch.ElasticsearchException.REST_EXCEPTION_SKIP_STACK_TRACE;
@@ -118,7 +120,7 @@ public class IndexLifecycleRunner {
             logger.debug("policy [{}] for index [{}] complete, skipping execution", policy, index);
             logger.debug("policy [{}] for index [{}] complete, skipping execution", policy, index);
             return;
             return;
         } else if (currentStep instanceof ErrorStep) {
         } else if (currentStep instanceof ErrorStep) {
-            logger.debug("policy [{}] for index [{}] on an error step, skipping execution", policy, index);
+            onErrorMaybeRetryFailedStep(policy, indexMetaData);
             return;
             return;
         }
         }
 
 
@@ -154,6 +156,38 @@ public class IndexLifecycleRunner {
         }
         }
     }
     }
 
 
+    private void onErrorMaybeRetryFailedStep(String policy, IndexMetaData indexMetaData) {
+        String index = indexMetaData.getIndex().getName();
+        LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
+        Step failedStep = stepRegistry.getStep(indexMetaData, new StepKey(lifecycleState.getPhase(), lifecycleState.getAction(),
+            lifecycleState.getFailedStep()));
+        if (failedStep == null) {
+            logger.warn("failed step [{}] for index [{}] is not part of policy [{}] anymore, or it is invalid. skipping execution",
+                lifecycleState.getFailedStep(), index, policy);
+            return;
+        }
+
+        if (lifecycleState.isAutoRetryableError() != null && lifecycleState.isAutoRetryableError()) {
+            int currentRetryAttempt = lifecycleState.getFailedStepRetryCount() == null ? 1 : 1 + lifecycleState.getFailedStepRetryCount();
+            logger.info("policy [{}] for index [{}] on an error step due to a transitive error, moving back to the failed " +
+                "step [{}] for execution. retry attempt [{}]", policy, index, lifecycleState.getFailedStep(), currentRetryAttempt);
+            clusterService.submitStateUpdateTask("ilm-retry-failed-step", new ClusterStateUpdateTask() {
+                @Override
+                public ClusterState execute(ClusterState currentState) {
+                    return moveClusterStateToPreviouslyFailedStep(currentState, index, true);
+                }
+
+                @Override
+                public void onFailure(String source, Exception e) {
+                    logger.error(new ParameterizedMessage("retry execution of step [{}] for index [{}] failed",
+                        failedStep.getKey().getName(), index), e);
+                }
+            });
+        } else {
+            logger.debug("policy [{}] for index [{}] on an error step after a terminal error, skipping execution", policy, index);
+        }
+    }
+
     /**
     /**
      * If the current step (matching the expected step key) is an asynchronous action step, run it
      * If the current step (matching the expected step key) is an asynchronous action step, run it
      */
      */
@@ -291,6 +325,19 @@ public class IndexLifecycleRunner {
                                                StepKey nextStepKey, LongSupplier nowSupplier,
                                                StepKey nextStepKey, LongSupplier nowSupplier,
                                                PolicyStepsRegistry stepRegistry, boolean forcePhaseDefinitionRefresh) {
                                                PolicyStepsRegistry stepRegistry, boolean forcePhaseDefinitionRefresh) {
         IndexMetaData idxMeta = currentState.getMetaData().index(indexName);
         IndexMetaData idxMeta = currentState.getMetaData().index(indexName);
+        validateTransition(idxMeta, currentStepKey, nextStepKey, stepRegistry);
+
+        Settings indexSettings = idxMeta.getSettings();
+        String policy = LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexSettings);
+        logger.info("moving index [{}] from [{}] to [{}] in policy [{}]",
+            indexName, currentStepKey, nextStepKey, policy);
+
+        return IndexLifecycleRunner.moveClusterStateToNextStep(idxMeta.getIndex(), currentState, currentStepKey,
+            nextStepKey, nowSupplier, forcePhaseDefinitionRefresh);
+    }
+
+    static void validateTransition(IndexMetaData idxMeta, StepKey currentStepKey, StepKey nextStepKey, PolicyStepsRegistry stepRegistry) {
+        String indexName = idxMeta.getIndex().getName();
         Settings indexSettings = idxMeta.getSettings();
         Settings indexSettings = idxMeta.getSettings();
         String indexPolicySetting = LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexSettings);
         String indexPolicySetting = LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexSettings);
 
 
@@ -308,12 +355,6 @@ public class IndexLifecycleRunner {
             throw new IllegalArgumentException("step [" + nextStepKey + "] for index [" + idxMeta.getIndex().getName() +
             throw new IllegalArgumentException("step [" + nextStepKey + "] for index [" + idxMeta.getIndex().getName() +
                 "] with policy [" + indexPolicySetting + "] does not exist");
                 "] with policy [" + indexPolicySetting + "] does not exist");
         }
         }
-
-        logger.info("moving index [{}] from [{}] to [{}] in policy [{}]",
-            indexName, currentStepKey, nextStepKey, indexPolicySetting);
-
-        return IndexLifecycleRunner.moveClusterStateToNextStep(idxMeta.getIndex(), currentState, currentStepKey,
-            nextStepKey, nowSupplier, forcePhaseDefinitionRefresh);
     }
     }
 
 
     static ClusterState moveClusterStateToNextStep(Index index, ClusterState clusterState, StepKey currentStep, StepKey nextStep,
     static ClusterState moveClusterStateToNextStep(Index index, ClusterState clusterState, StepKey currentStep, StepKey nextStep,
@@ -331,7 +372,8 @@ public class IndexLifecycleRunner {
     }
     }
 
 
     static ClusterState moveClusterStateToErrorStep(Index index, ClusterState clusterState, StepKey currentStep, Exception cause,
     static ClusterState moveClusterStateToErrorStep(Index index, ClusterState clusterState, StepKey currentStep, Exception cause,
-                                                    LongSupplier nowSupplier) throws IOException {
+                                                    LongSupplier nowSupplier,
+                                                    BiFunction<IndexMetaData, StepKey, Step> stepLookupFunction) throws IOException {
         IndexMetaData idxMeta = clusterState.getMetaData().index(index);
         IndexMetaData idxMeta = clusterState.getMetaData().index(index);
         IndexLifecycleMetadata ilmMeta = clusterState.metaData().custom(IndexLifecycleMetadata.TYPE);
         IndexLifecycleMetadata ilmMeta = clusterState.metaData().custom(IndexLifecycleMetadata.TYPE);
         LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas()
         LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas()
@@ -340,34 +382,67 @@ public class IndexLifecycleRunner {
         causeXContentBuilder.startObject();
         causeXContentBuilder.startObject();
         ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
         ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
         causeXContentBuilder.endObject();
         causeXContentBuilder.endObject();
-        LifecycleExecutionState nextStepState = moveExecutionStateToNextStep(policyMetadata,
-            LifecycleExecutionState.fromIndexMetadata(idxMeta), currentStep, new StepKey(currentStep.getPhase(),
-                currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);
+        LifecycleExecutionState currentState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
+        LifecycleExecutionState nextStepState = moveExecutionStateToNextStep(policyMetadata, currentState, currentStep,
+            new StepKey(currentStep.getPhase(), currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);
         LifecycleExecutionState.Builder failedState = LifecycleExecutionState.builder(nextStepState);
         LifecycleExecutionState.Builder failedState = LifecycleExecutionState.builder(nextStepState);
         failedState.setFailedStep(currentStep.getName());
         failedState.setFailedStep(currentStep.getName());
         failedState.setStepInfo(BytesReference.bytes(causeXContentBuilder).utf8ToString());
         failedState.setStepInfo(BytesReference.bytes(causeXContentBuilder).utf8ToString());
+        Step failedStep = stepLookupFunction.apply(idxMeta, currentStep);
+        if (failedStep != null) {
+            // as an initial step we'll mark the failed step as auto retryable without actually looking at the cause to determine
+            // if the error is transient/recoverable from
+            failedState.setIsAutoRetryableError(failedStep.isRetryable());
+            // maintain the retry count of the failed step as it will be cleared after a successful execution
+            failedState.setFailedStepRetryCount(currentState.getFailedStepRetryCount());
+        } else {
+            logger.warn("failed step [{}] for index [{}] is not part of policy [{}] anymore, or it is invalid",
+                currentStep.getName(), index, policyMetadata.getName());
+        }
+
         ClusterState.Builder newClusterStateBuilder = newClusterStateWithLifecycleState(index, clusterState, failedState.build());
         ClusterState.Builder newClusterStateBuilder = newClusterStateWithLifecycleState(index, clusterState, failedState.build());
         return newClusterStateBuilder.build();
         return newClusterStateBuilder.build();
     }
     }
 
 
-    ClusterState moveClusterStateToFailedStep(ClusterState currentState, String[] indices) {
-        ClusterState newState = currentState;
-        for (String index : indices) {
-            IndexMetaData indexMetaData = currentState.metaData().index(index);
-            if (indexMetaData == null) {
-                throw new IllegalArgumentException("index [" + index + "] does not exist");
-            }
-            LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
-            StepKey currentStepKey = IndexLifecycleRunner.getCurrentStepKey(lifecycleState);
-            String failedStep = lifecycleState.getFailedStep();
-            if (currentStepKey != null && ErrorStep.NAME.equals(currentStepKey.getName())
-                    && Strings.isNullOrEmpty(failedStep) == false) {
-                StepKey nextStepKey = new StepKey(currentStepKey.getPhase(), currentStepKey.getAction(), failedStep);
-                newState = moveClusterStateToStep(index, currentState, currentStepKey, nextStepKey, nowSupplier, stepRegistry, true);
+    ClusterState moveClusterStateToPreviouslyFailedStep(ClusterState currentState, String index, boolean isAutomaticRetry) {
+        ClusterState newState;
+        IndexMetaData indexMetaData = currentState.metaData().index(index);
+        if (indexMetaData == null) {
+            throw new IllegalArgumentException("index [" + index + "] does not exist");
+        }
+        LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData);
+        StepKey currentStepKey = IndexLifecycleRunner.getCurrentStepKey(lifecycleState);
+        String failedStep = lifecycleState.getFailedStep();
+        if (currentStepKey != null && ErrorStep.NAME.equals(currentStepKey.getName()) && Strings.isNullOrEmpty(failedStep) == false) {
+            StepKey nextStepKey = new StepKey(currentStepKey.getPhase(), currentStepKey.getAction(), failedStep);
+            validateTransition(indexMetaData, currentStepKey, nextStepKey, stepRegistry);
+            IndexLifecycleMetadata ilmMeta = currentState.metaData().custom(IndexLifecycleMetadata.TYPE);
+
+            LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas()
+                .get(LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexMetaData.getSettings()));
+            LifecycleExecutionState nextStepState = moveExecutionStateToNextStep(policyMetadata,
+                lifecycleState, currentStepKey, nextStepKey, nowSupplier, true);
+            LifecycleExecutionState.Builder retryStepState = LifecycleExecutionState.builder(nextStepState);
+            retryStepState.setIsAutoRetryableError(lifecycleState.isAutoRetryableError());
+            Integer currentRetryCount = lifecycleState.getFailedStepRetryCount();
+            if (isAutomaticRetry) {
+                retryStepState.setFailedStepRetryCount(currentRetryCount == null ? 1 : ++currentRetryCount);
             } else {
             } else {
-                throw new IllegalArgumentException("cannot retry an action for an index ["
-                    + index + "] that has not encountered an error when running a Lifecycle Policy");
+                // manual retries don't update the retry count
+                retryStepState.setFailedStepRetryCount(lifecycleState.getFailedStepRetryCount());
             }
             }
+            newState = newClusterStateWithLifecycleState(indexMetaData.getIndex(), currentState, retryStepState.build()).build();
+        } else {
+            throw new IllegalArgumentException("cannot retry an action for an index ["
+                + index + "] that has not encountered an error when running a Lifecycle Policy");
+        }
+        return newState;
+    }
+
+    ClusterState moveClusterStateToPreviouslyFailedStep(ClusterState currentState, String[] indices) {
+        ClusterState newState = currentState;
+        for (String index : indices) {
+            newState = moveClusterStateToPreviouslyFailedStep(newState, index, false);
         }
         }
         return newState;
         return newState;
     }
     }
@@ -387,6 +462,8 @@ public class IndexLifecycleRunner {
         // clear any step info or error-related settings from the current step
         // clear any step info or error-related settings from the current step
         updatedState.setFailedStep(null);
         updatedState.setFailedStep(null);
         updatedState.setStepInfo(null);
         updatedState.setStepInfo(null);
+        updatedState.setIsAutoRetryableError(null);
+        updatedState.setFailedStepRetryCount(null);
 
 
         if (currentStep.getPhase().equals(nextStep.getPhase()) == false || forcePhaseDefinitionRefresh) {
         if (currentStep.getPhase().equals(nextStep.getPhase()) == false || forcePhaseDefinitionRefresh) {
             final String newPhaseDefinition;
             final String newPhaseDefinition;
@@ -473,7 +550,7 @@ public class IndexLifecycleRunner {
         logger.error(new ParameterizedMessage("policy [{}] for index [{}] failed on step [{}]. Moving to ERROR step",
         logger.error(new ParameterizedMessage("policy [{}] for index [{}] failed on step [{}]. Moving to ERROR step",
                 policy, index.getName(), currentStepKey), e);
                 policy, index.getName(), currentStepKey), e);
         clusterService.submitStateUpdateTask("ilm-move-to-error-step",
         clusterService.submitStateUpdateTask("ilm-move-to-error-step",
-            new MoveToErrorStepUpdateTask(index, policy, currentStepKey, e, nowSupplier));
+            new MoveToErrorStepUpdateTask(index, policy, currentStepKey, e, nowSupplier, stepRegistry::getStep));
     }
     }
 
 
     private void setStepInfo(Index index, String policy, StepKey currentStepKey, ToXContentObject stepInfo) {
     private void setStepInfo(Index index, String policy, StepKey currentStepKey, ToXContentObject stepInfo) {

+ 2 - 2
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleService.java

@@ -91,8 +91,8 @@ public class IndexLifecycleService
             nowSupplier, policyRegistry, false);
             nowSupplier, policyRegistry, false);
     }
     }
 
 
-    public ClusterState moveClusterStateToFailedStep(ClusterState currentState, String[] indices) {
-        return lifecycleRunner.moveClusterStateToFailedStep(currentState, indices);
+    public ClusterState moveClusterStateToPreviouslyFailedStep(ClusterState currentState, String[] indices) {
+        return lifecycleRunner.moveClusterStateToPreviouslyFailedStep(currentState, indices);
     }
     }
 
 
     @Override
     @Override

+ 7 - 2
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTask.java

@@ -16,21 +16,25 @@ import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
 import org.elasticsearch.xpack.core.ilm.Step;
 import org.elasticsearch.xpack.core.ilm.Step;
 
 
 import java.io.IOException;
 import java.io.IOException;
+import java.util.function.BiFunction;
 import java.util.function.LongSupplier;
 import java.util.function.LongSupplier;
 
 
 public class MoveToErrorStepUpdateTask extends ClusterStateUpdateTask {
 public class MoveToErrorStepUpdateTask extends ClusterStateUpdateTask {
     private final Index index;
     private final Index index;
     private final String policy;
     private final String policy;
     private final Step.StepKey currentStepKey;
     private final Step.StepKey currentStepKey;
+    private final BiFunction<IndexMetaData, Step.StepKey, Step> stepLookupFunction;
     private LongSupplier nowSupplier;
     private LongSupplier nowSupplier;
     private Exception cause;
     private Exception cause;
 
 
-    public MoveToErrorStepUpdateTask(Index index, String policy, Step.StepKey currentStepKey, Exception cause, LongSupplier nowSupplier) {
+    public MoveToErrorStepUpdateTask(Index index, String policy, Step.StepKey currentStepKey, Exception cause, LongSupplier nowSupplier,
+                                     BiFunction<IndexMetaData, Step.StepKey, Step> stepLookupFunction) {
         this.index = index;
         this.index = index;
         this.policy = policy;
         this.policy = policy;
         this.currentStepKey = currentStepKey;
         this.currentStepKey = currentStepKey;
         this.cause = cause;
         this.cause = cause;
         this.nowSupplier = nowSupplier;
         this.nowSupplier = nowSupplier;
+        this.stepLookupFunction = stepLookupFunction;
     }
     }
 
 
     Index getIndex() {
     Index getIndex() {
@@ -60,7 +64,8 @@ public class MoveToErrorStepUpdateTask extends ClusterStateUpdateTask {
         LifecycleExecutionState indexILMData = LifecycleExecutionState.fromIndexMetadata(idxMeta);
         LifecycleExecutionState indexILMData = LifecycleExecutionState.fromIndexMetadata(idxMeta);
         if (policy.equals(LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexSettings))
         if (policy.equals(LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexSettings))
             && currentStepKey.equals(IndexLifecycleRunner.getCurrentStepKey(indexILMData))) {
             && currentStepKey.equals(IndexLifecycleRunner.getCurrentStepKey(indexILMData))) {
-            return IndexLifecycleRunner.moveClusterStateToErrorStep(index, currentState, currentStepKey, cause, nowSupplier);
+            return IndexLifecycleRunner.moveClusterStateToErrorStep(index, currentState, currentStepKey, cause, nowSupplier,
+                stepLookupFunction);
         } else {
         } else {
             // either the policy has changed or the step is now
             // either the policy has changed or the step is now
             // not the same as when we submitted the update task. In
             // not the same as when we submitted the update task. In

+ 2 - 0
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/action/TransportExplainLifecycleAction.java

@@ -116,6 +116,8 @@ public class TransportExplainLifecycleAction
                         lifecycleState.getAction(),
                         lifecycleState.getAction(),
                         lifecycleState.getStep(),
                         lifecycleState.getStep(),
                         lifecycleState.getFailedStep(),
                         lifecycleState.getFailedStep(),
+                        lifecycleState.isAutoRetryableError(),
+                        lifecycleState.getFailedStepRetryCount(),
                         lifecycleState.getPhaseTime(),
                         lifecycleState.getPhaseTime(),
                         lifecycleState.getActionTime(),
                         lifecycleState.getActionTime(),
                         lifecycleState.getStepTime(),
                         lifecycleState.getStepTime(),

+ 1 - 1
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/action/TransportRetryAction.java

@@ -62,7 +62,7 @@ public class TransportRetryAction extends TransportMasterNodeAction<Request, Res
             new AckedClusterStateUpdateTask<Response>(request, listener) {
             new AckedClusterStateUpdateTask<Response>(request, listener) {
                 @Override
                 @Override
                 public ClusterState execute(ClusterState currentState) {
                 public ClusterState execute(ClusterState currentState) {
-                    return indexLifecycleService.moveClusterStateToFailedStep(currentState, request.indices());
+                    return indexLifecycleService.moveClusterStateToPreviouslyFailedStep(currentState, request.indices());
                 }
                 }
 
 
                 @Override
                 @Override

+ 203 - 13
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java

@@ -29,6 +29,7 @@ import org.elasticsearch.common.xcontent.json.JsonXContent;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.test.ClusterServiceUtils;
 import org.elasticsearch.test.ClusterServiceUtils;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.test.client.NoOpClient;
 import org.elasticsearch.threadpool.TestThreadPool;
 import org.elasticsearch.threadpool.TestThreadPool;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.xpack.core.ilm.AbstractStepTestCase;
 import org.elasticsearch.xpack.core.ilm.AbstractStepTestCase;
@@ -50,9 +51,11 @@ import org.elasticsearch.xpack.core.ilm.OperationMode;
 import org.elasticsearch.xpack.core.ilm.Phase;
 import org.elasticsearch.xpack.core.ilm.Phase;
 import org.elasticsearch.xpack.core.ilm.PhaseExecutionInfo;
 import org.elasticsearch.xpack.core.ilm.PhaseExecutionInfo;
 import org.elasticsearch.xpack.core.ilm.RolloverAction;
 import org.elasticsearch.xpack.core.ilm.RolloverAction;
+import org.elasticsearch.xpack.core.ilm.RolloverActionTests;
 import org.elasticsearch.xpack.core.ilm.Step;
 import org.elasticsearch.xpack.core.ilm.Step;
 import org.elasticsearch.xpack.core.ilm.Step.StepKey;
 import org.elasticsearch.xpack.core.ilm.Step.StepKey;
 import org.elasticsearch.xpack.core.ilm.TerminalPolicyStep;
 import org.elasticsearch.xpack.core.ilm.TerminalPolicyStep;
+import org.elasticsearch.xpack.core.ilm.WaitForRolloverReadyStep;
 import org.junit.After;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Before;
 import org.mockito.ArgumentMatcher;
 import org.mockito.ArgumentMatcher;
@@ -74,12 +77,18 @@ import java.util.function.BiFunction;
 import java.util.function.Function;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Collectors;
 
 
+import static java.util.stream.Collectors.toList;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.awaitLatch;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.awaitLatch;
 import static org.elasticsearch.xpack.core.ilm.LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY;
 import static org.elasticsearch.xpack.core.ilm.LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY;
 import static org.elasticsearch.xpack.core.ilm.LifecyclePolicyTestsUtils.newTestLifecyclePolicy;
 import static org.elasticsearch.xpack.core.ilm.LifecyclePolicyTestsUtils.newTestLifecyclePolicy;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.nullValue;
+import static org.mockito.Matchers.any;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.when;
 
 
 public class IndexLifecycleRunnerTests extends ESTestCase {
 public class IndexLifecycleRunnerTests extends ESTestCase {
     private static final NamedXContentRegistry REGISTRY;
     private static final NamedXContentRegistry REGISTRY;
@@ -147,7 +156,7 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
         Index index = new Index(indexName, indexName + "uuid");
         Index index = new Index(indexName, indexName + "uuid");
         indexSteps.put(index, steps);
         indexSteps.put(index, steps);
         Client client = mock(Client.class);
         Client client = mock(Client.class);
-        Mockito.when(client.settings()).thenReturn(Settings.EMPTY);
+        when(client.settings()).thenReturn(Settings.EMPTY);
         return new MockPolicyStepsRegistry(lifecyclePolicyMap, firstStepMap, stepMap, REGISTRY, client);
         return new MockPolicyStepsRegistry(lifecyclePolicyMap, firstStepMap, stepMap, REGISTRY, client);
     }
     }
 
 
@@ -167,24 +176,77 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
 
 
     public void testRunPolicyErrorStep() {
     public void testRunPolicyErrorStep() {
         String policyName = "async_action_policy";
         String policyName = "async_action_policy";
-        StepKey stepKey = new StepKey("phase", "action", "cluster_state_action_step");
-        MockClusterStateWaitStep step = new MockClusterStateWaitStep(stepKey, null);
+        LifecyclePolicy policy = LifecyclePolicyTests.randomTimeseriesLifecyclePolicyWithAllPhases(policyName);
+        String phaseName = randomFrom(policy.getPhases().keySet());
+        Phase phase = policy.getPhases().get(phaseName);
+        PhaseExecutionInfo phaseExecutionInfo = new PhaseExecutionInfo(policy.getName(), phase, 1, randomNonNegativeLong());
+        String phaseJson = Strings.toString(phaseExecutionInfo);
+        LifecycleAction action = randomFrom(phase.getActions().values());
+        Step step = randomFrom(action.toSteps(new NoOpClient(threadPool), phaseName, null));
+        StepKey stepKey = step.getKey();
+
         PolicyStepsRegistry stepRegistry = createOneStepPolicyStepRegistry(policyName, step);
         PolicyStepsRegistry stepRegistry = createOneStepPolicyStepRegistry(policyName, step);
         ClusterService clusterService = mock(ClusterService.class);
         ClusterService clusterService = mock(ClusterService.class);
         IndexLifecycleRunner runner = new IndexLifecycleRunner(stepRegistry, clusterService, threadPool, () -> 0L);
         IndexLifecycleRunner runner = new IndexLifecycleRunner(stepRegistry, clusterService, threadPool, () -> 0L);
         LifecycleExecutionState.Builder newState = LifecycleExecutionState.builder();
         LifecycleExecutionState.Builder newState = LifecycleExecutionState.builder();
+        newState.setFailedStep(stepKey.getName());
+        newState.setIsAutoRetryableError(false);
         newState.setPhase(stepKey.getPhase());
         newState.setPhase(stepKey.getPhase());
         newState.setAction(stepKey.getAction());
         newState.setAction(stepKey.getAction());
         newState.setStep(ErrorStep.NAME);
         newState.setStep(ErrorStep.NAME);
-        IndexMetaData indexMetaData = IndexMetaData.builder("my_index").settings(settings(Version.CURRENT))
+        newState.setPhaseDefinition(phaseJson);
+        IndexMetaData indexMetaData = IndexMetaData.builder("test")
+            .settings(settings(Version.CURRENT).put(LifecycleSettings.LIFECYCLE_NAME, policyName))
             .putCustom(ILM_CUSTOM_METADATA_KEY, newState.build().asMap())
             .putCustom(ILM_CUSTOM_METADATA_KEY, newState.build().asMap())
-            .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5)).build();
+            .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5))
+            .build();
 
 
         runner.runPolicyAfterStateChange(policyName, indexMetaData);
         runner.runPolicyAfterStateChange(policyName, indexMetaData);
 
 
         Mockito.verifyZeroInteractions(clusterService);
         Mockito.verifyZeroInteractions(clusterService);
     }
     }
 
 
+    public void testRunPolicyErrorStepOnRetryableFailedStep() {
+        String policyName = "rollover_policy";
+        String phaseName = "hot";
+        TimeValue after = TimeValue.parseTimeValue(randomTimeValue(0, 1000000000, "s", "m", "h", "d"), "test_after");
+        Map<String, LifecycleAction> actions = new HashMap<>();
+        RolloverAction action = RolloverActionTests.randomInstance();
+        actions.put(RolloverAction.NAME, action);
+        Phase phase = new Phase(phaseName, after, actions);
+        Map<String, Phase> phases = Map.of(phaseName, phase);
+        PhaseExecutionInfo phaseExecutionInfo = new PhaseExecutionInfo(policyName, phase, 1, randomNonNegativeLong());
+        String phaseJson = Strings.toString(phaseExecutionInfo);
+        NoOpClient client = new NoOpClient(threadPool);
+        List<Step> waitForRolloverStepList =
+            action.toSteps(client, phaseName, null).stream()
+                .filter(s -> s.getKey().getName().equals(WaitForRolloverReadyStep.NAME))
+                .collect(toList());
+        assertThat(waitForRolloverStepList.size(), is(1));
+        Step waitForRolloverStep = waitForRolloverStepList.get(0);
+        StepKey stepKey = waitForRolloverStep.getKey();
+
+        PolicyStepsRegistry stepRegistry = createOneStepPolicyStepRegistry(policyName, waitForRolloverStep);
+        ClusterService clusterService = mock(ClusterService.class);
+        IndexLifecycleRunner runner = new IndexLifecycleRunner(stepRegistry, clusterService, threadPool, () -> 0L);
+        LifecycleExecutionState.Builder newState = LifecycleExecutionState.builder();
+        newState.setFailedStep(stepKey.getName());
+        newState.setIsAutoRetryableError(true);
+        newState.setPhase(stepKey.getPhase());
+        newState.setAction(stepKey.getAction());
+        newState.setStep(ErrorStep.NAME);
+        newState.setPhaseDefinition(phaseJson);
+        IndexMetaData indexMetaData = IndexMetaData.builder("test")
+            .settings(settings(Version.CURRENT).put(LifecycleSettings.LIFECYCLE_NAME, policyName))
+            .putCustom(ILM_CUSTOM_METADATA_KEY, newState.build().asMap())
+            .numberOfShards(randomIntBetween(1, 5)).numberOfReplicas(randomIntBetween(0, 5))
+            .build();
+
+        runner.runPeriodicStep(policyName, indexMetaData);
+
+        Mockito.verify(clusterService, times(1)).submitStateUpdateTask(any(), any());
+    }
+
     public void testRunStateChangePolicyWithNoNextStep() throws Exception {
     public void testRunStateChangePolicyWithNoNextStep() throws Exception {
         String policyName = "foo";
         String policyName = "foo";
         StepKey stepKey = new StepKey("phase", "action", "cluster_state_action_step");
         StepKey stepKey = new StepKey("phase", "action", "cluster_state_action_step");
@@ -627,7 +689,7 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
 
 
         Step.StepKey MOCK_STEP_KEY = new Step.StepKey("mock", "mock", "mock");
         Step.StepKey MOCK_STEP_KEY = new Step.StepKey("mock", "mock", "mock");
         Client client = mock(Client.class);
         Client client = mock(Client.class);
-        Mockito.when(client.settings()).thenReturn(Settings.EMPTY);
+        when(client.settings()).thenReturn(Settings.EMPTY);
         LifecyclePolicy policy = LifecyclePolicyTests.randomTimeseriesLifecyclePolicyWithAllPhases(policyName);
         LifecyclePolicy policy = LifecyclePolicyTests.randomTimeseriesLifecyclePolicyWithAllPhases(policyName);
         LifecyclePolicyMetadata policyMetadata = new LifecyclePolicyMetadata(policy, Collections.emptyMap(), 1, randomNonNegativeLong());
         LifecyclePolicyMetadata policyMetadata = new LifecyclePolicyMetadata(policy, Collections.emptyMap(), 1, randomNonNegativeLong());
         String phaseName = randomFrom(policy.getPhases().keySet());
         String phaseName = randomFrom(policy.getPhases().keySet());
@@ -855,6 +917,7 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
     public void testMoveClusterStateToErrorStep() throws IOException {
     public void testMoveClusterStateToErrorStep() throws IOException {
         String indexName = "my_index";
         String indexName = "my_index";
         StepKey currentStep = new StepKey("current_phase", "current_action", "current_step");
         StepKey currentStep = new StepKey("current_phase", "current_action", "current_step");
+        StepKey nextStepKey = new StepKey("next_phase", "next_action", "next_step");
         long now = randomNonNegativeLong();
         long now = randomNonNegativeLong();
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
 
 
@@ -865,12 +928,14 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
         ClusterState clusterState = buildClusterState(indexName, Settings.builder(), lifecycleState.build(), Collections.emptyList());
         ClusterState clusterState = buildClusterState(indexName, Settings.builder(), lifecycleState.build(), Collections.emptyList());
         Index index = clusterState.metaData().index(indexName).getIndex();
         Index index = clusterState.metaData().index(indexName).getIndex();
 
 
-        ClusterState newClusterState = IndexLifecycleRunner.moveClusterStateToErrorStep(index, clusterState, currentStep, cause, () -> now);
+        ClusterState newClusterState = IndexLifecycleRunner.moveClusterStateToErrorStep(index, clusterState, currentStep, cause, () -> now,
+            (idxMeta, stepKey) -> new MockStep(stepKey, nextStepKey));
         assertClusterStateOnErrorStep(clusterState, index, currentStep, newClusterState, now,
         assertClusterStateOnErrorStep(clusterState, index, currentStep, newClusterState, now,
             "{\"type\":\"exception\",\"reason\":\"THIS IS AN EXPECTED CAUSE\"");
             "{\"type\":\"exception\",\"reason\":\"THIS IS AN EXPECTED CAUSE\"");
 
 
         cause = new IllegalArgumentException("non elasticsearch-exception");
         cause = new IllegalArgumentException("non elasticsearch-exception");
-        newClusterState = IndexLifecycleRunner.moveClusterStateToErrorStep(index, clusterState, currentStep, cause, () -> now);
+        newClusterState = IndexLifecycleRunner.moveClusterStateToErrorStep(index, clusterState, currentStep, cause, () -> now,
+            (idxMeta, stepKey) -> new MockStep(stepKey, nextStepKey));
         assertClusterStateOnErrorStep(clusterState, index, currentStep, newClusterState, now,
         assertClusterStateOnErrorStep(clusterState, index, currentStep, newClusterState, now,
             "{\"type\":\"illegal_argument_exception\",\"reason\":\"non elasticsearch-exception\",\"stack_trace\":\"");
             "{\"type\":\"illegal_argument_exception\",\"reason\":\"non elasticsearch-exception\",\"stack_trace\":\"");
     }
     }
@@ -902,9 +967,11 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
             Collections.singletonList(policyMetadata));
             Collections.singletonList(policyMetadata));
         Index index = clusterState.metaData().index(indexName).getIndex();
         Index index = clusterState.metaData().index(indexName).getIndex();
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
-        ClusterState nextClusterState = runner.moveClusterStateToFailedStep(clusterState, indices);
+        ClusterState nextClusterState = runner.moveClusterStateToPreviouslyFailedStep(clusterState, indices);
         IndexLifecycleRunnerTests.assertClusterStateOnNextStep(clusterState, index, errorStepKey, failedStepKey,
         IndexLifecycleRunnerTests.assertClusterStateOnNextStep(clusterState, index, errorStepKey, failedStepKey,
             nextClusterState, now);
             nextClusterState, now);
+        LifecycleExecutionState executionState = LifecycleExecutionState.fromIndexMetadata(nextClusterState.metaData().index(indexName));
+        assertThat("manual move to failed step should not count as a retry", executionState.getFailedStepRetryCount(), is(nullValue()));
     }
     }
 
 
     public void testMoveClusterStateToFailedStepWithUnknownStep() {
     public void testMoveClusterStateToFailedStepWithUnknownStep() {
@@ -937,7 +1004,7 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
             Collections.singletonList(policyMetadata));
             Collections.singletonList(policyMetadata));
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
-            () -> runner.moveClusterStateToFailedStep(clusterState, indices));
+            () -> runner.moveClusterStateToPreviouslyFailedStep(clusterState, indices));
         assertThat(exception.getMessage(), equalTo("step [" + failedStepKey
         assertThat(exception.getMessage(), equalTo("step [" + failedStepKey
             + "] for index [my_index] with policy [my_policy] does not exist"));
             + "] for index [my_index] with policy [my_policy] does not exist"));
     }
     }
@@ -949,7 +1016,7 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
             Collections.emptyList());
             Collections.emptyList());
         IndexLifecycleRunner runner = new IndexLifecycleRunner(null, null, threadPool, () -> 0L);
         IndexLifecycleRunner runner = new IndexLifecycleRunner(null, null, threadPool, () -> 0L);
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
-            () -> runner.moveClusterStateToFailedStep(clusterState, new String[] { invalidIndexName }));
+            () -> runner.moveClusterStateToPreviouslyFailedStep(clusterState, new String[] { invalidIndexName }));
         assertThat(exception.getMessage(), equalTo("index [" + invalidIndexName + "] does not exist"));
         assertThat(exception.getMessage(), equalTo("index [" + invalidIndexName + "] does not exist"));
     }
     }
 
 
@@ -972,7 +1039,7 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
         ClusterState clusterState = buildClusterState(indexName, indexSettingsBuilder, lifecycleState.build(), Collections.emptyList());
         ClusterState clusterState = buildClusterState(indexName, indexSettingsBuilder, lifecycleState.build(), Collections.emptyList());
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
-            () -> runner.moveClusterStateToFailedStep(clusterState, indices));
+            () -> runner.moveClusterStateToPreviouslyFailedStep(clusterState, indices));
         assertThat(exception.getMessage(), equalTo("index [" + indexName + "] is not associated with an Index Lifecycle Policy"));
         assertThat(exception.getMessage(), equalTo("index [" + indexName + "] is not associated with an Index Lifecycle Policy"));
     }
     }
 
 
@@ -993,11 +1060,44 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
         ClusterState clusterState = buildClusterState(indexName, indexSettingsBuilder, lifecycleState.build(), Collections.emptyList());
         ClusterState clusterState = buildClusterState(indexName, indexSettingsBuilder, lifecycleState.build(), Collections.emptyList());
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
         IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
         IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
-            () -> runner.moveClusterStateToFailedStep(clusterState, indices));
+            () -> runner.moveClusterStateToPreviouslyFailedStep(clusterState, indices));
         assertThat(exception.getMessage(), equalTo("cannot retry an action for an index [" + indices[0]
         assertThat(exception.getMessage(), equalTo("cannot retry an action for an index [" + indices[0]
             + "] that has not encountered an error when running a Lifecycle Policy"));
             + "] that has not encountered an error when running a Lifecycle Policy"));
     }
     }
 
 
+    public void testMoveClusterStateToPreviouslyFailedStepAsAutomaticRetry() {
+        String indexName = "my_index";
+        String policyName = "my_policy";
+        long now = randomNonNegativeLong();
+        StepKey failedStepKey = new StepKey("current_phase", MockAction.NAME, "current_step");
+        StepKey errorStepKey = new StepKey(failedStepKey.getPhase(), failedStepKey.getAction(), ErrorStep.NAME);
+        Step retryableStep = new RetryableMockStep(failedStepKey, null);
+        LifecyclePolicy policy = createPolicy(policyName, failedStepKey, null);
+        LifecyclePolicyMetadata policyMetadata = new LifecyclePolicyMetadata(policy, Collections.emptyMap(),
+            randomNonNegativeLong(), randomNonNegativeLong());
+
+        PolicyStepsRegistry policyRegistry = createOneStepPolicyStepRegistry(policyName, retryableStep, indexName);
+        Settings.Builder indexSettingsBuilder = Settings.builder()
+                .put(LifecycleSettings.LIFECYCLE_NAME, policyName);
+        LifecycleExecutionState.Builder lifecycleState = LifecycleExecutionState.builder();
+        lifecycleState.setPhase(errorStepKey.getPhase());
+        lifecycleState.setPhaseTime(now);
+        lifecycleState.setAction(errorStepKey.getAction());
+        lifecycleState.setActionTime(now);
+        lifecycleState.setStep(errorStepKey.getName());
+        lifecycleState.setStepTime(now);
+        lifecycleState.setFailedStep(failedStepKey.getName());
+        ClusterState clusterState = buildClusterState(indexName, indexSettingsBuilder, lifecycleState.build(),
+            Collections.singletonList(policyMetadata));
+        Index index = clusterState.metaData().index(indexName).getIndex();
+        IndexLifecycleRunner runner = new IndexLifecycleRunner(policyRegistry, null, threadPool, () -> now);
+        ClusterState nextClusterState = runner.moveClusterStateToPreviouslyFailedStep(clusterState, indexName, true);
+        IndexLifecycleRunnerTests.assertClusterStateOnNextStep(clusterState, index, errorStepKey, failedStepKey,
+            nextClusterState, now);
+        LifecycleExecutionState executionState = LifecycleExecutionState.fromIndexMetadata(nextClusterState.metaData().index(indexName));
+        assertThat(executionState.getFailedStepRetryCount(), is(1));
+    }
+
     public void testAddStepInfoToClusterState() throws IOException {
     public void testAddStepInfoToClusterState() throws IOException {
         String indexName = "my_index";
         String indexName = "my_index";
         StepKey currentStep = new StepKey("current_phase", "current_action", "current_step");
         StepKey currentStep = new StepKey("current_phase", "current_action", "current_step");
@@ -1227,6 +1327,75 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
             runner.isReadyToTransitionToThisPhase(policyName, indexMetaData, "phase"));
             runner.isReadyToTransitionToThisPhase(policyName, indexMetaData, "phase"));
     }
     }
 
 
+   public void testValidateTransitionThrowsExceptionForMissingIndexPolicy() {
+       IndexMetaData indexMetaData = IndexMetaData.builder("index").settings(settings(Version.CURRENT))
+           .numberOfShards(randomIntBetween(1, 5))
+           .numberOfReplicas(randomIntBetween(0, 5))
+           .build();
+
+       StepKey currentStepKey = new StepKey("hot", "action", "firstStep");
+       StepKey nextStepKey = new StepKey("hot", "action", "secondStep");
+       Step currentStep = new MockStep(currentStepKey, nextStepKey);
+       MockPolicyStepsRegistry policyRegistry = createOneStepPolicyStepRegistry("policy", currentStep);
+
+       expectThrows(IllegalArgumentException.class,
+           () -> IndexLifecycleRunner.validateTransition(indexMetaData, currentStepKey, nextStepKey, policyRegistry));
+   }
+
+   public void testValidateTransitionThrowsExceptionIfTheCurrentStepIsIncorrect() {
+       LifecycleExecutionState.Builder lifecycleState = LifecycleExecutionState.builder();
+       lifecycleState.setPhase("hot");
+       lifecycleState.setAction("action");
+       lifecycleState.setStep("another_step");
+       String policy = "policy";
+       IndexMetaData indexMetaData = buildIndexMetadata(policy, lifecycleState);
+
+       StepKey currentStepKey = new StepKey("hot", "action", "firstStep");
+       StepKey nextStepKey = new StepKey("hot", "action", "secondStep");
+       Step currentStep = new MockStep(currentStepKey, nextStepKey);
+       MockPolicyStepsRegistry policyRegistry = createOneStepPolicyStepRegistry(policy, currentStep);
+
+       expectThrows(IllegalArgumentException.class,
+           () -> IndexLifecycleRunner.validateTransition(indexMetaData, currentStepKey, nextStepKey, policyRegistry));
+   }
+
+   public void testValidateTransitionThrowsExceptionIfNextStepDoesNotExist() {
+       LifecycleExecutionState.Builder lifecycleState = LifecycleExecutionState.builder();
+       lifecycleState.setPhase("hot");
+       lifecycleState.setAction("action");
+       lifecycleState.setStep("firstStep");
+       String policy = "policy";
+       IndexMetaData indexMetaData = buildIndexMetadata(policy, lifecycleState);
+
+       StepKey currentStepKey = new StepKey("hot", "action", "firstStep");
+       StepKey nextStepKey = new StepKey("hot", "action", "secondStep");
+       Step currentStep = new MockStep(currentStepKey, nextStepKey);
+       MockPolicyStepsRegistry policyRegistry = createOneStepPolicyStepRegistry(policy, currentStep);
+
+       expectThrows(IllegalArgumentException.class,
+           () -> IndexLifecycleRunner.validateTransition(indexMetaData, currentStepKey, nextStepKey, policyRegistry));
+   }
+
+    public void testValidateValidTransition() {
+        LifecycleExecutionState.Builder lifecycleState = LifecycleExecutionState.builder();
+        lifecycleState.setPhase("hot");
+        lifecycleState.setAction("action");
+        lifecycleState.setStep("firstStep");
+        String policy = "policy";
+        IndexMetaData indexMetaData = buildIndexMetadata(policy, lifecycleState);
+
+        StepKey currentStepKey = new StepKey("hot", "action", "firstStep");
+        StepKey nextStepKey = new StepKey("hot", "action", "secondStep");
+        Step finalStep = new MockStep(nextStepKey, new StepKey("hot", "action", "completed"));
+        MockPolicyStepsRegistry policyRegistry = createOneStepPolicyStepRegistry(policy, finalStep);
+
+        try {
+            IndexLifecycleRunner.validateTransition(indexMetaData, currentStepKey, nextStepKey, policyRegistry);
+        } catch (Exception e) {
+            logger.error(e);
+            fail("validateTransition should not throw exception on valid transitions");
+        }
+    }
 
 
     public static void assertIndexNotManagedByILM(ClusterState clusterState, Index index) {
     public static void assertIndexNotManagedByILM(ClusterState clusterState, Index index) {
         MetaData metadata = clusterState.metaData();
         MetaData metadata = clusterState.metaData();
@@ -1304,6 +1473,15 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
         assertEquals(null, newLifecycleState.getStepInfo());
         assertEquals(null, newLifecycleState.getStepInfo());
     }
     }
 
 
+    private IndexMetaData buildIndexMetadata(String policy, LifecycleExecutionState.Builder lifecycleState) {
+        return IndexMetaData.builder("index")
+            .settings(settings(Version.CURRENT).put(LifecycleSettings.LIFECYCLE_NAME, policy))
+            .numberOfShards(randomIntBetween(1, 5))
+            .numberOfReplicas(randomIntBetween(0, 5))
+            .putCustom(ILM_CUSTOM_METADATA_KEY, lifecycleState.build().asMap())
+            .build();
+    }
+
     private void assertClusterStateOnErrorStep(ClusterState oldClusterState, Index index, StepKey currentStep,
     private void assertClusterStateOnErrorStep(ClusterState oldClusterState, Index index, StepKey currentStep,
                                                ClusterState newClusterState, long now, String expectedCauseValue) throws IOException {
                                                ClusterState newClusterState, long now, String expectedCauseValue) throws IOException {
         assertNotSame(oldClusterState, newClusterState);
         assertNotSame(oldClusterState, newClusterState);
@@ -1573,4 +1751,16 @@ public class IndexLifecycleRunnerTests extends ESTestCase {
         }
         }
 
 
     }
     }
+
+    private static final class RetryableMockStep extends MockStep {
+
+        RetryableMockStep(StepKey stepKey, StepKey nextStepKey) {
+            super(stepKey, nextStepKey);
+        }
+
+        @Override
+        public boolean isRetryable() {
+            return true;
+        }
+    }
 }
 }

+ 11 - 6
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTaskTests.java

@@ -19,12 +19,13 @@ import org.elasticsearch.common.xcontent.json.JsonXContent;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.xpack.core.ilm.ErrorStep;
 import org.elasticsearch.xpack.core.ilm.ErrorStep;
-import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState;
 import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
 import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
+import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState;
 import org.elasticsearch.xpack.core.ilm.LifecyclePolicy;
 import org.elasticsearch.xpack.core.ilm.LifecyclePolicy;
 import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata;
 import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata;
 import org.elasticsearch.xpack.core.ilm.LifecyclePolicyTests;
 import org.elasticsearch.xpack.core.ilm.LifecyclePolicyTests;
 import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
 import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
+import org.elasticsearch.xpack.core.ilm.MockStep;
 import org.elasticsearch.xpack.core.ilm.OperationMode;
 import org.elasticsearch.xpack.core.ilm.OperationMode;
 import org.elasticsearch.xpack.core.ilm.Step.StepKey;
 import org.elasticsearch.xpack.core.ilm.Step.StepKey;
 import org.junit.Before;
 import org.junit.Before;
@@ -67,12 +68,14 @@ public class MoveToErrorStepUpdateTaskTests extends ESTestCase {
 
 
     public void testExecuteSuccessfullyMoved() throws IOException {
     public void testExecuteSuccessfullyMoved() throws IOException {
         StepKey currentStepKey = new StepKey("current-phase", "current-action", "current-name");
         StepKey currentStepKey = new StepKey("current-phase", "current-action", "current-name");
+        StepKey nextStepKey = new StepKey("next-phase", "next-action", "next-step-name");
         long now = randomNonNegativeLong();
         long now = randomNonNegativeLong();
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
 
 
         setStateToKey(currentStepKey);
         setStateToKey(currentStepKey);
 
 
-        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now);
+        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now,
+            (idxMeta, stepKey) -> new MockStep(stepKey, nextStepKey));
         ClusterState newState = task.execute(clusterState);
         ClusterState newState = task.execute(clusterState);
         LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(newState.getMetaData().index(index));
         LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(newState.getMetaData().index(index));
         StepKey actualKey = IndexLifecycleRunner.getCurrentStepKey(lifecycleState);
         StepKey actualKey = IndexLifecycleRunner.getCurrentStepKey(lifecycleState);
@@ -97,7 +100,8 @@ public class MoveToErrorStepUpdateTaskTests extends ESTestCase {
         long now = randomNonNegativeLong();
         long now = randomNonNegativeLong();
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
         setStateToKey(notCurrentStepKey);
         setStateToKey(notCurrentStepKey);
-        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now);
+        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now,
+            (idxMeta, stepKey) -> new MockStep(stepKey, new StepKey("next-phase", "action", "step")));
         ClusterState newState = task.execute(clusterState);
         ClusterState newState = task.execute(clusterState);
         assertThat(newState, sameInstance(clusterState));
         assertThat(newState, sameInstance(clusterState));
     }
     }
@@ -108,7 +112,8 @@ public class MoveToErrorStepUpdateTaskTests extends ESTestCase {
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
         Exception cause = new ElasticsearchException("THIS IS AN EXPECTED CAUSE");
         setStateToKey(currentStepKey);
         setStateToKey(currentStepKey);
         setStatePolicy("not-" + policy);
         setStatePolicy("not-" + policy);
-        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now);
+        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now,
+            (idxMeta, stepKey) -> new MockStep(stepKey, new StepKey("next-phase", "action", "step")));
         ClusterState newState = task.execute(clusterState);
         ClusterState newState = task.execute(clusterState);
         assertThat(newState, sameInstance(clusterState));
         assertThat(newState, sameInstance(clusterState));
     }
     }
@@ -120,7 +125,8 @@ public class MoveToErrorStepUpdateTaskTests extends ESTestCase {
 
 
         setStateToKey(currentStepKey);
         setStateToKey(currentStepKey);
 
 
-        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now);
+        MoveToErrorStepUpdateTask task = new MoveToErrorStepUpdateTask(index, policy, currentStepKey, cause, () -> now,
+            (idxMeta, stepKey) -> new MockStep(stepKey, new StepKey("next-phase", "action", "step")));
         Exception expectedException = new RuntimeException();
         Exception expectedException = new RuntimeException();
         ElasticsearchException exception = expectThrows(ElasticsearchException.class,
         ElasticsearchException exception = expectThrows(ElasticsearchException.class,
                 () -> task.onFailure(randomAlphaOfLength(10), expectedException));
                 () -> task.onFailure(randomAlphaOfLength(10), expectedException));
@@ -134,7 +140,6 @@ public class MoveToErrorStepUpdateTaskTests extends ESTestCase {
             .metaData(MetaData.builder(clusterState.metaData())
             .metaData(MetaData.builder(clusterState.metaData())
                 .updateSettings(Settings.builder()
                 .updateSettings(Settings.builder()
                     .put(LifecycleSettings.LIFECYCLE_NAME, policy).build(), index.getName())).build();
                     .put(LifecycleSettings.LIFECYCLE_NAME, policy).build(), index.getName())).build();
-
     }
     }
     private void setStateToKey(StepKey stepKey) {
     private void setStateToKey(StepKey stepKey) {
         LifecycleExecutionState.Builder lifecycleState = LifecycleExecutionState.builder(
         LifecycleExecutionState.Builder lifecycleState = LifecycleExecutionState.builder(