
Add repeated snapshot failure troubleshooting guide (#89762)

This troubleshooting guide is returned by the SLM health indicator when an SLM policy has suffered too many repeated
failures without a successful execution.
James Baiera, 3 years ago
commit db73aa0498

+ 40 - 0
docs/reference/tab-widgets/troubleshooting/snapshot/repeated-snapshot-failures-widget.asciidoc

@@ -0,0 +1,40 @@
+++++
+<div class="tabs" data-tab-group="host">
+  <div role="tablist" aria-label="Addressing repeated snapshot policy failures">
+    <button role="tab"
+            aria-selected="true"
+            aria-controls="cloud-tab-repeated-snapshot-failures"
+            id="cloud-repeated-snapshot-failures">
+      Elasticsearch Service
+    </button>
+    <button role="tab"
+            aria-selected="false"
+            aria-controls="self-managed-tab-repeated-snapshot-failures"
+            id="self-managed-repeated-snapshot-failures"
+            tabindex="-1">
+      Self-managed
+    </button>
+  </div>
+  <div tabindex="0"
+       role="tabpanel"
+       id="cloud-tab-repeated-snapshot-failures"
+       aria-labelledby="cloud-repeated-snapshot-failures">
+++++
+
+include::repeated-snapshot-failures.asciidoc[tag=cloud]
+
+++++
+  </div>
+  <div tabindex="0"
+       role="tabpanel"
+       id="self-managed-tab-repeated-snapshot-failures"
+       aria-labelledby="self-managed-repeated-snapshot-failures"
+       hidden="">
+++++
+
+include::repeated-snapshot-failures.asciidoc[tag=self-managed]
+
+++++
+  </div>
+</div>
+++++

+ 172 - 0
docs/reference/tab-widgets/troubleshooting/snapshot/repeated-snapshot-failures.asciidoc

@@ -0,0 +1,172 @@
+// tag::cloud[]
+To check the status of failing {slm} policies, go to {kib} and retrieve the
+<<slm-api-get-policy, Snapshot Lifecycle Policy information>>.
+
+**Use {kib}**
+
+//tag::kibana-api-ex[]
+. Log in to the {ess-console}[{ecloud} console].
++
+
+. On the **Elasticsearch Service** panel, click the name of your deployment.
++
+
+NOTE: If the name of your deployment is disabled, your {kib} instances might be
+unhealthy, in which case contact https://support.elastic.co[Elastic Support].
+If your deployment doesn't include {kib}, you need to
+{cloud}/ec-access-kibana.html[enable it] first.
+
+. Open your deployment's side navigation menu (under the Elastic logo in the upper-left corner)
+and go to **Dev Tools > Console**.
++
+[role="screenshot"]
+image::images/kibana-console.png[{kib} Console,align="center"]
+
+. <<slm-api-get-policy, Retrieve>> the {slm} policy:
++
+[source,console]
+----
+GET _slm/policy/<affected-policy-name>
+----
+// TEST[skip:These policies do not exist]
++
+The response will look like this:
++
+[source,console-result]
+----
+{
+  "affected-policy-name": { <1>
+    "version": 1,
+    "modified_date": "2099-05-06T01:30:00.000Z",
+    "modified_date_millis": 4081757400000,
+    "policy" : {
+      "schedule": "0 30 1 * * ?",
+      "name": "<daily-snap-{now/d}>",
+      "repository": "my_repository",
+      "config": {
+        "indices": ["data-*", "important"],
+        "ignore_unavailable": false,
+        "include_global_state": false
+      },
+      "retention": {
+        "expire_after": "30d",
+        "min_count": 5,
+        "max_count": 50
+      }
+    },
+    "last_success" : {
+      "snapshot_name" : "daily-snap-2099.05.30-tme_ivjqswgkpryvnao2lg",
+      "start_time" : 4083782400000,
+      "time" : 4083782400000
+    },
+    "last_failure" : { <2>
+      "snapshot_name" : "daily-snap-2099.06.16-ywe-kgh5rfqfrpnchvsujq",
+      "time" : 4085251200000, <3>
+      "details" : """{"type":"snapshot_exception","reason":"[daily-snap-2099.06.16-ywe-kgh5rfqfrpnchvsujq] failed to create snapshot successfully, 5 out of 149 total shards failed"}""" <4>
+    },
+    "stats": {
+      "policy": "daily-snapshots",
+      "snapshots_taken": 0,
+      "snapshots_failed": 0,
+      "snapshots_deleted": 0,
+      "snapshot_deletion_failures": 0
+    },
+    "next_execution": "2099-06-17T01:30:00.000Z",
+    "next_execution_millis": 4085343000000
+  }
+}
+----
+// TESTRESPONSE[skip:the result is for illustrating purposes only]
+<1> The affected snapshot lifecycle policy.
+<2> The information about the last failure for the policy.
+<3> The time at which the failure occurred, in milliseconds since the epoch. Use the `human=true` request parameter to see a formatted timestamp.
+<4> Error details containing the reason for the snapshot failure.
++
+
+Snapshots can fail for a variety of reasons. If the failures are due to configuration errors, consult the
+documentation for the repository that the automated snapshots are using. If you are using an {ece} deployment, refer to
+the https://www.elastic.co/guide/en/cloud-enterprise/current/ece-manage-repositories.html[guide on managing repositories in ECE].
+
+One common failure scenario is repository corruption. This occurs most often when multiple instances of {es} write to
+the same repository location. There is a <<add-repository, separate troubleshooting guide>> to fix this problem.
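+
+If you suspect a repository-level problem, one quick check (using `my_repository` from the example response above; substitute
+your own repository name) is to verify that the repository is reachable and writable from all nodes:
+
+[source,console]
+----
+POST _snapshot/my_repository/_verify
+----
+// TEST[skip:illustration only]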
+
+If snapshots are failing for other reasons, check the logs on the elected master node during the snapshot
+execution period for more information.
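+
+The logs of interest are those of the elected master node. To identify which node currently holds that role, you can,
+for example, use the cat master API:
+
+[source,console]
+----
+GET _cat/master?v=true
+----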
+
+
+//end::kibana-api-ex[]
+// end::cloud[]
+
+// tag::self-managed[]
+<<slm-api-get-policy, Retrieve>> the {slm} policy:
+
+[source,console]
+----
+GET _slm/policy/<affected-policy-name>
+----
+// TEST[skip:These policies do not exist]
+
+The response will look like this:
+
+[source,console-result]
+----
+{
+  "affected-policy-name": { <1>
+    "version": 1,
+    "modified_date": "2099-05-06T01:30:00.000Z",
+    "modified_date_millis": 4081757400000,
+    "policy" : {
+      "schedule": "0 30 1 * * ?",
+      "name": "<daily-snap-{now/d}>",
+      "repository": "my_repository",
+      "config": {
+        "indices": ["data-*", "important"],
+        "ignore_unavailable": false,
+        "include_global_state": false
+      },
+      "retention": {
+        "expire_after": "30d",
+        "min_count": 5,
+        "max_count": 50
+      }
+    },
+    "last_success" : {
+      "snapshot_name" : "daily-snap-2099.05.30-tme_ivjqswgkpryvnao2lg",
+      "start_time" : 4083782400000,
+      "time" : 4083782400000
+    },
+    "last_failure" : { <2>
+      "snapshot_name" : "daily-snap-2099.06.16-ywe-kgh5rfqfrpnchvsujq",
+      "time" : 4085251200000, <3>
+      "details" : """{"type":"snapshot_exception","reason":"[daily-snap-2099.06.16-ywe-kgh5rfqfrpnchvsujq] failed to create snapshot successfully, 5 out of 149 total shards failed"}""" <4>
+    },
+    "stats": {
+      "policy": "daily-snapshots",
+      "snapshots_taken": 0,
+      "snapshots_failed": 0,
+      "snapshots_deleted": 0,
+      "snapshot_deletion_failures": 0
+    },
+    "next_execution": "2099-06-17T01:30:00.000Z",
+    "next_execution_millis": 4085343000000
+  }
+}
+----
+// TESTRESPONSE[skip:the result is for illustrating purposes only]
+<1> The affected snapshot lifecycle policy.
+<2> The information about the last failure for the policy.
+<3> The time at which the failure occurred, in milliseconds since the epoch. Use the `human=true` request parameter to see a formatted timestamp.
+<4> Error details containing the reason for the snapshot failure.
+
+Snapshots can fail for a variety of reasons. If the failures are due to configuration errors, consult the
+documentation for the repository that the automated snapshots are using.
+
+One common failure scenario is repository corruption. This occurs most often when multiple instances of {es} write to
+the same repository location. There is a <<add-repository, separate troubleshooting guide>> to fix this problem.
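+
+If you suspect a repository-level problem, one quick check (using `my_repository` from the example response above; substitute
+your own repository name) is to verify that the repository is reachable and writable from all nodes:
+
+[source,console]
+----
+POST _snapshot/my_repository/_verify
+----
+// TEST[skip:illustration only]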
+
+If snapshots are failing for other reasons, check the logs on the elected master node during the snapshot
+execution period for more information.
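+
+After addressing the underlying problem, you can trigger the policy manually instead of waiting for its next scheduled
+run. For example, using the policy name from the example above:
+
+[source,console]
+----
+POST _slm/policy/<affected-policy-name>/_execute
+----
+// TEST[skip:illustration only]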
+
+// end::self-managed[]
+

+ 4 - 1
docs/reference/troubleshooting.asciidoc

@@ -36,6 +36,7 @@ fix problems that an {es} deployment might encounter.
 === Snapshot and restore
 * <<restore-from-snapshot,Restore data from snapshot>>
 * <<add-repository,Multiple deployments writing to the same snapshot repository>>
+* <<repeated-snapshot-failures,Troubleshooting repeated snapshot failures>>
 
 [discrete]
 [[troubleshooting-others]]
@@ -97,6 +98,8 @@ include::troubleshooting/data/restore-from-snapshot.asciidoc[]
 
 include::troubleshooting/snapshot/add-repository.asciidoc[]
 
+include::troubleshooting/snapshot/repeated-snapshot-failures.asciidoc[]
+
 include::troubleshooting/discovery-issues.asciidoc[]
 
 include::monitoring/troubleshooting.asciidoc[]
@@ -105,4 +108,4 @@ include::transform/troubleshooting.asciidoc[leveloffset=+1]
 
 include::../../x-pack/docs/en/watcher/troubleshooting.asciidoc[]
 
-include::troubleshooting/troubleshooting-searches.asciidoc[]
+include::troubleshooting/troubleshooting-searches.asciidoc[]

+ 18 - 0
docs/reference/troubleshooting/snapshot/repeated-snapshot-failures.asciidoc

@@ -0,0 +1,18 @@
+[[repeated-snapshot-failures]]
+== Addressing repeated snapshot policy failures
+
+Repeated snapshot failures are usually an indicator of a problem with your deployment. Continuous failures of automated
+snapshots can leave a deployment without recovery options in cases of data loss or outages.
+
+{es} keeps track of the number of repeated failures when executing automated snapshots. If an automated
+snapshot fails too many times without a successful execution, the health API will report a warning. The number of
+repeated failures before reporting a warning is controlled by the
+<<slm-health-failed-snapshot-warn-threshold,`slm.health.failed_snapshot_warn_threshold`>> setting.
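+
+If the default threshold does not fit a policy's schedule, it can be adjusted via the cluster update settings API
+(assuming the setting is dynamically updatable on your deployment; the value `10` here is purely illustrative):
+
+[source,console]
+----
+PUT _cluster/settings
+{
+  "persistent": {
+    "slm.health.failed_snapshot_warn_threshold": 10
+  }
+}
+----
+// TEST[skip:illustration only]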
+
+In the event that an automated {slm} policy execution is experiencing repeated failures, follow these steps to get more
+information about the problem:
+
+include::{es-repo-dir}/tab-widgets/troubleshooting/snapshot/repeated-snapshot-failures-widget.asciidoc[]
+
+
+