Browse Source

Add health user action for unhealthy SLM policy failure counts (#88523)

This PR adds a user action to the SLM health indicator. The indicator checks each SLM policy's
"invocations since last success" counter and reports degraded health (YELLOW) when any policy is at
or above the failure threshold (default: 5 consecutive failures).
James Baiera 3 years ago
parent
commit
6ce5f73e97

+ 5 - 0
docs/changelog/88523.yaml

@@ -0,0 +1,5 @@
+pr: 88523
+summary: Add health user action for unhealthy SLM policy failure counts
+area: Health
+type: enhancement
+issues: []

+ 7 - 0
docs/reference/settings/snapshot-settings.asciidoc

@@ -38,6 +38,13 @@ Defaults to daily at 1:30am UTC: `0 30 1 * * ?`.
 Limits how long {slm-init} should spend deleting old snapshots.
 Defaults to one hour: `1h`.
 
+[[slm-health-failed-snapshot-warn-threshold]]
+`slm.health.failed_snapshot_warn_threshold`::
+(<<dynamic-cluster-setting,Dynamic>>, Long)
+The number of failed invocations since the last successful snapshot at which
+the health API reports a problem with the policy.
+Defaults to a health API warning after five repeated failures: `5`.
+
 [[repositories-url-allowed]]
 // tag::repositories-url-allowed[]
 `repositories.url.allowed_urls` {ess-icon}::

+ 12 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/LifecycleSettings.java

@@ -29,6 +29,7 @@ public class LifecycleSettings {
     public static final String SLM_RETENTION_SCHEDULE = "slm.retention_schedule";
     public static final String SLM_RETENTION_DURATION = "slm.retention_duration";
     public static final String SLM_MINIMUM_INTERVAL = "slm.minimum_interval";
+    public static final String SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD = "slm.health.failed_snapshot_warn_threshold";
 
     // This is not a setting configuring ILM per se, but certain ILM actions need to validate the managed index is not
     // already mounted as a searchable snapshot. Those ILM actions will check if the index has this setting name configured.
@@ -128,4 +129,15 @@ public class LifecycleSettings {
         Setting.Property.Dynamic,
         Setting.Property.NodeScope
     );
+    /**
+     * The number of failed invocations since the last successful SLM snapshot at which a health warning is surfaced in the
+     * health API (the SLM health indicator compares the policy's counter against this value with {@code >=}). Registered
+     * under the key {@link #SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD}; dynamic, node-scoped, default {@code 5}. The
+     * {@code 1L} argument is presumably the minimum accepted value - TODO confirm against {@code Setting.longSetting}.
+     */
+    public static final Setting<Long> SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD_SETTING = Setting.longSetting(
+        SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD,
+        5L,
+        1L,
+        Setting.Property.Dynamic,
+        Setting.Property.NodeScope
+    );
 }

+ 2 - 1
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycle.java

@@ -187,7 +187,8 @@ public class IndexLifecycle extends Plugin implements ActionPlugin, HealthPlugin
             LifecycleSettings.SLM_HISTORY_INDEX_ENABLED_SETTING,
             LifecycleSettings.SLM_RETENTION_SCHEDULE_SETTING,
             LifecycleSettings.SLM_RETENTION_DURATION_SETTING,
-            LifecycleSettings.SLM_MINIMUM_INTERVAL_SETTING
+            LifecycleSettings.SLM_MINIMUM_INTERVAL_SETTING,
+            LifecycleSettings.SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD_SETTING
         );
     }
 

+ 98 - 7
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/slm/SlmHealthIndicatorService.java

@@ -16,14 +16,20 @@ import org.elasticsearch.health.HealthIndicatorService;
 import org.elasticsearch.health.ImpactArea;
 import org.elasticsearch.health.SimpleHealthIndicatorDetails;
 import org.elasticsearch.xpack.core.ilm.OperationMode;
+import org.elasticsearch.xpack.core.slm.SnapshotInvocationRecord;
 import org.elasticsearch.xpack.core.slm.SnapshotLifecycleMetadata;
+import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicyMetadata;
 
+import java.util.Collection;
 import java.util.Collections;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 
 import static org.elasticsearch.health.HealthStatus.GREEN;
 import static org.elasticsearch.health.HealthStatus.YELLOW;
+import static org.elasticsearch.xpack.core.ilm.LifecycleSettings.SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD_SETTING;
 
 /**
  * This indicator reports health for snapshot lifecycle management component.
@@ -48,10 +54,26 @@ public class SlmHealthIndicatorService implements HealthIndicatorService {
         null
     );
 
+    public static final String ACTION_CHECK_RECENTLY_FAILED_SNAPSHOTS_HELP_URL = "https://ela.st/fix-recent-snapshot-failures";
+    public static final Diagnosis.Definition ACTION_CHECK_RECENTLY_FAILED_SNAPSHOTS = new Diagnosis.Definition(
+        "check_recent_snapshot_failures",
+        "The following snapshot lifecycle policies have exceeded the warning threshold for repeat failures without a successful execution.",
+        "Check the snapshot lifecycle policies [/_slm/policy/<policy_name>?human] for detailed failure info.",
+        ACTION_CHECK_RECENTLY_FAILED_SNAPSHOTS_HELP_URL
+    );
+
     private final ClusterService clusterService;
+    private volatile long failedSnapshotWarnThreshold;
 
     public SlmHealthIndicatorService(ClusterService clusterService) {
         this.clusterService = clusterService;
+        this.failedSnapshotWarnThreshold = clusterService.getClusterSettings().get(SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD_SETTING); // initial value
+        clusterService.getClusterSettings() // register a consumer so later updates of the setting reach the volatile field
+            .addSettingsUpdateConsumer(SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD_SETTING, this::setFailedSnapshotWarnThreshold);
+    }
+
+    public void setFailedSnapshotWarnThreshold(long value) { // passed as the settings update consumer in the constructor
+        this.failedSnapshotWarnThreshold = value; // the field is volatile
     }
 
     @Override
@@ -66,7 +88,7 @@ public class SlmHealthIndicatorService implements HealthIndicatorService {
             return createIndicator(
                 GREEN,
                 "No Snapshot Lifecycle Management policies configured",
-                createDetails(explain, slmMetadata),
+                createDetails(explain, Collections.emptyList(), slmMetadata),
                 Collections.emptyList(),
                 Collections.emptyList()
             );
@@ -81,26 +103,95 @@ public class SlmHealthIndicatorService implements HealthIndicatorService {
             return createIndicator(
                 YELLOW,
                 "Snapshot Lifecycle Management is not running",
-                createDetails(explain, slmMetadata),
+                createDetails(explain, Collections.emptyList(), slmMetadata),
                 impacts,
                 List.of(SLM_NOT_RUNNING)
             );
         } else {
+            List<SnapshotLifecyclePolicyMetadata> unhealthyPolicies = slmMetadata.getSnapshotConfigurations()
+                .values()
+                .stream()
+                .filter(metadata -> snapshotFailuresExceedWarningCount(failedSnapshotWarnThreshold, metadata))
+                .toList();
+
+            if (unhealthyPolicies.size() > 0) {
+                List<HealthIndicatorImpact> impacts = Collections.singletonList(
+                    new HealthIndicatorImpact(
+                        2,
+                        "Some automated snapshots have not had a successful execution recently. Indices restored from affected "
+                            + "snapshots may not contain recent changes.",
+                        List.of(ImpactArea.BACKUP)
+                    )
+                );
+
+                return createIndicator(
+                    YELLOW,
+                    "Encountered [" + unhealthyPolicies.size() + "] unhealthy snapshot lifecycle management policies.",
+                    createDetails(explain, unhealthyPolicies, slmMetadata),
+                    impacts,
+                    List.of(
+                        new Diagnosis(
+                            ACTION_CHECK_RECENTLY_FAILED_SNAPSHOTS,
+                            unhealthyPolicies.stream().map(SnapshotLifecyclePolicyMetadata::getName).toList()
+                        )
+                    )
+                );
+            }
+
             return createIndicator(
                 GREEN,
                 "Snapshot Lifecycle Management is running",
-                createDetails(explain, slmMetadata),
+                createDetails(explain, Collections.emptyList(), slmMetadata),
                 Collections.emptyList(),
                 Collections.emptyList()
             );
         }
     }
 
-    private static HealthIndicatorDetails createDetails(boolean explain, SnapshotLifecycleMetadata metadata) {
+    static boolean snapshotFailuresExceedWarningCount(long failedSnapshotWarnThreshold, SnapshotLifecyclePolicyMetadata policyMetadata) {
+        SnapshotInvocationRecord lastFailure = policyMetadata.getLastFailure();
+        if (lastFailure == null) {
+            // No failure has been recorded for this policy, so there is nothing to warn about
+            return false;
+        }
+
+        // Determine if the most recent snapshot is a failure by comparing the start timestamps of the last success and last failure
+        SnapshotInvocationRecord lastSuccess = policyMetadata.getLastSuccess();
+        if (lastSuccess != null && lastSuccess.getSnapshotStartTimestamp() > lastFailure.getSnapshotStartTimestamp()) {
+            // Success started strictly later than the last failure; equal timestamps fall through to the count check below
+            return false;
+        }
+
+        return policyMetadata.getInvocationsSinceLastSuccess() >= failedSnapshotWarnThreshold; // ">=": reaching the threshold already counts, despite "Exceed" in the name
+    }
+
+    private static HealthIndicatorDetails createDetails(
+        boolean explain,
+        Collection<SnapshotLifecyclePolicyMetadata> unhealthyPolicies,
+        SnapshotLifecycleMetadata metadata
+    ) {
         if (explain) {
-            return new SimpleHealthIndicatorDetails(
-                Map.of("slm_status", metadata.getOperationMode(), "policies", metadata.getSnapshotConfigurations().size())
-            );
+            Map<String, Object> details = new LinkedHashMap<>();
+            details.put("slm_status", metadata.getOperationMode());
+            details.put("policies", metadata.getSnapshotConfigurations().size());
+            if (unhealthyPolicies.size() > 0) {
+                details.put(
+                    "unhealthy_policies",
+                    Map.of(
+                        "count",
+                        unhealthyPolicies.size(),
+                        "invocations_since_last_success",
+                        unhealthyPolicies.stream()
+                            .collect(
+                                Collectors.toMap(
+                                    SnapshotLifecyclePolicyMetadata::getName,
+                                    SnapshotLifecyclePolicyMetadata::getInvocationsSinceLastSuccess
+                                )
+                            )
+                    )
+                );
+            }
+            return new SimpleHealthIndicatorDetails(details);
         } else {
             return HealthIndicatorDetails.EMPTY;
         }

+ 158 - 0
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/slm/SlmHealthIndicatorServiceTests.java

@@ -11,11 +11,16 @@ import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.Metadata;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.settings.ClusterSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.health.Diagnosis;
 import org.elasticsearch.health.HealthIndicatorImpact;
 import org.elasticsearch.health.HealthIndicatorResult;
 import org.elasticsearch.health.ImpactArea;
 import org.elasticsearch.health.SimpleHealthIndicatorDetails;
 import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xpack.core.ilm.LifecycleSettings;
+import org.elasticsearch.xpack.core.slm.SnapshotInvocationRecord;
 import org.elasticsearch.xpack.core.slm.SnapshotLifecycleMetadata;
 import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
 import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicyMetadata;
@@ -23,6 +28,8 @@ import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicyMetadata;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import static org.elasticsearch.health.HealthStatus.GREEN;
 import static org.elasticsearch.health.HealthStatus.YELLOW;
@@ -32,6 +39,7 @@ import static org.elasticsearch.xpack.core.ilm.OperationMode.STOPPING;
 import static org.elasticsearch.xpack.slm.SlmHealthIndicatorService.NAME;
 import static org.elasticsearch.xpack.slm.SlmHealthIndicatorService.SLM_NOT_RUNNING;
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
@@ -121,6 +129,136 @@ public class SlmHealthIndicatorServiceTests extends ESTestCase {
         );
     }
 
+    public void testIsGreenWhenPoliciesHaveFailedForLessThanWarningThreshold() {
+        long execTime = System.currentTimeMillis();
+        long window = TimeUnit.HOURS.toMillis(24) - 5000L; // Just under 24 hours. NOTE(review): the predicate only compares timestamp order, not this gap.
+        var clusterState = createClusterStateWith(
+            new SnapshotLifecycleMetadata(
+                createSlmPolicyWithInvocations(
+                    snapshotInvocation(execTime, execTime + 1000L), // last success
+                    snapshotInvocation(execTime + window, execTime + window + 1000L), // last failure, started after the success
+                    randomLongBetween(0, 4) // invocations since last success: always below the default threshold of 5
+                ),
+                RUNNING,
+                null
+            )
+        );
+        var service = createSlmHealthIndicatorService(clusterState);
+
+        assertThat(
+            service.calculate(true),
+            equalTo(
+                new HealthIndicatorResult(
+                    NAME,
+                    GREEN,
+                    "Snapshot Lifecycle Management is running",
+                    new SimpleHealthIndicatorDetails(Map.of("slm_status", RUNNING, "policies", 1)), // no "unhealthy_policies" entry expected
+                    Collections.emptyList(),
+                    Collections.emptyList()
+                )
+            )
+        );
+    }
+
+    public void testIsYellowWhenPoliciesHaveFailedForMoreThanWarningThreshold() {
+        long execTime = System.currentTimeMillis();
+        long window = TimeUnit.HOURS.toMillis(24) + 5000L; // 24 hours and some extra room. NOTE(review): only timestamp order matters to the predicate.
+        long failedInvocations = randomLongBetween(5L, Long.MAX_VALUE); // at or above the default threshold of 5
+        var clusterState = createClusterStateWith(
+            new SnapshotLifecycleMetadata(
+                createSlmPolicyWithInvocations(
+                    snapshotInvocation(execTime, execTime + 1000L), // last success
+                    snapshotInvocation(execTime + window, execTime + window + 1000L), // last failure, started after the success
+                    failedInvocations
+                ),
+                RUNNING,
+                null
+            )
+        );
+        var service = createSlmHealthIndicatorService(clusterState);
+
+        HealthIndicatorResult calculate = service.calculate(true);
+        assertThat(
+            calculate,
+            equalTo(
+                new HealthIndicatorResult(
+                    NAME,
+                    YELLOW,
+                    "Encountered [1] unhealthy snapshot lifecycle management policies.",
+                    new SimpleHealthIndicatorDetails(
+                        Map.of(
+                            "slm_status",
+                            RUNNING,
+                            "policies",
+                            1,
+                            "unhealthy_policies",
+                            Map.of("count", 1, "invocations_since_last_success", Map.of("test-policy", failedInvocations))
+                        )
+                    ),
+                    Collections.singletonList(
+                        new HealthIndicatorImpact(
+                            2,
+                            "Some automated snapshots have not had a successful execution recently. Indices restored from affected "
+                                + "snapshots may not contain recent changes.",
+                            List.of(ImpactArea.BACKUP)
+                        )
+                    ),
+                    List.of(new Diagnosis(SlmHealthIndicatorService.ACTION_CHECK_RECENTLY_FAILED_SNAPSHOTS, List.of("test-policy")))
+                )
+            )
+        );
+    }
+
+    public void testSnapshotPolicyExceedsWarningThresholdPredicate() {
+        SnapshotLifecyclePolicyMetadata slmPolicyMetadata = SnapshotLifecyclePolicyMetadata.builder()
+            .setPolicy(new SnapshotLifecyclePolicy("id", "test-policy", "", "test-repository", null, null))
+            .setVersion(1L)
+            .setModifiedDate(System.currentTimeMillis())
+            .build(); // Scenario 1: neither a last success nor a last failure is set
+
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(15L, slmPolicyMetadata), is(false));
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(5L, slmPolicyMetadata), is(false));
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(1L, slmPolicyMetadata), is(false));
+
+        slmPolicyMetadata = SnapshotLifecyclePolicyMetadata.builder()
+            .setPolicy(new SnapshotLifecyclePolicy("id", "test-policy", "", "test-repository", null, null))
+            .setVersion(1L)
+            .setModifiedDate(System.currentTimeMillis())
+            .setLastSuccess(snapshotInvocation(1000L, 2000L))
+            .setInvocationsSinceLastSuccess(0L)
+            .build(); // Scenario 2: only a last success is set
+
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(15L, slmPolicyMetadata), is(false));
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(5L, slmPolicyMetadata), is(false));
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(1L, slmPolicyMetadata), is(false));
+
+        slmPolicyMetadata = SnapshotLifecyclePolicyMetadata.builder()
+            .setPolicy(new SnapshotLifecyclePolicy("id", "test-policy", "", "test-repository", null, null))
+            .setVersion(1L)
+            .setModifiedDate(System.currentTimeMillis())
+            .setLastSuccess(snapshotInvocation(1000L, 2000L))
+            .setLastFailure(snapshotInvocation(8000L, 9000L))
+            .setInvocationsSinceLastSuccess(randomLongBetween(5L, 10L))
+            .build(); // Scenario 3: the failure started after the success, with 5 to 10 invocations since the last success
+
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(15L, slmPolicyMetadata), is(false)); // 5..10 never reaches 15
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(5L, slmPolicyMetadata), is(true)); // a count of exactly 5 also passes (>=)
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(1L, slmPolicyMetadata), is(true));
+
+        slmPolicyMetadata = SnapshotLifecyclePolicyMetadata.builder()
+            .setPolicy(new SnapshotLifecyclePolicy("id", "test-policy", "", "test-repository", null, null))
+            .setVersion(1L)
+            .setModifiedDate(System.currentTimeMillis())
+            .setLastSuccess(snapshotInvocation(8000L, 9000L))
+            .setLastFailure(snapshotInvocation(1000L, 2000L))
+            .setInvocationsSinceLastSuccess(0L)
+            .build(); // Scenario 4: the success started after the failure
+
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(15L, slmPolicyMetadata), is(false));
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(5L, slmPolicyMetadata), is(false));
+        assertThat(SlmHealthIndicatorService.snapshotFailuresExceedWarningCount(1L, slmPolicyMetadata), is(false));
+    }
+
     private static ClusterState createClusterStateWith(SnapshotLifecycleMetadata metadata) {
         var builder = new ClusterState.Builder(new ClusterName("test-cluster"));
         if (metadata != null) {
@@ -130,19 +268,39 @@ public class SlmHealthIndicatorServiceTests extends ESTestCase {
     }
 
     private static Map<String, SnapshotLifecyclePolicyMetadata> createSlmPolicy() {
+        return createSlmPolicyWithInvocations(null, null, 0L); // no last success, no last failure, zero invocations since last success
+    }
+
+    private static Map<String, SnapshotLifecyclePolicyMetadata> createSlmPolicyWithInvocations(
+        SnapshotInvocationRecord lastSuccess,
+        SnapshotInvocationRecord lastFailure,
+        long invocationsSinceLastSuccess
+    ) {
         return Map.of(
             "test-policy", // single-entry map keyed by "test-policy"
             SnapshotLifecyclePolicyMetadata.builder()
                 .setPolicy(new SnapshotLifecyclePolicy("id", "test-policy", "", "test-repository", null, null))
                 .setVersion(1L)
                 .setModifiedDate(System.currentTimeMillis())
+                .setLastSuccess(lastSuccess) // may be null (createSlmPolicy passes null)
+                .setLastFailure(lastFailure) // may be null (createSlmPolicy passes null)
+                .setInvocationsSinceLastSuccess(invocationsSinceLastSuccess)
                 .build()
         );
     }
 
+    private static SnapshotInvocationRecord snapshotInvocation(long startTime, long stopTime) {
+        return new SnapshotInvocationRecord("test-policy-snapshot", startTime, stopTime, null); // last argument presumably carries failure details - TODO confirm
+    }
+
     private static SlmHealthIndicatorService createSlmHealthIndicatorService(ClusterState clusterState) {
         var clusterService = mock(ClusterService.class);
         when(clusterService.state()).thenReturn(clusterState);
+        ClusterSettings clusterSettings = new ClusterSettings(
+            Settings.EMPTY, // no overrides, so the warn threshold resolves to its default
+            Set.of(LifecycleSettings.SLM_HEALTH_FAILED_SNAPSHOT_WARN_THRESHOLD_SETTING)
+        );
+        when(clusterService.getClusterSettings()).thenReturn(clusterSettings); // the service constructor reads the threshold and registers an update consumer
         return new SlmHealthIndicatorService(clusterService);
     }
 }