Procházet zdrojové kódy

Report impact and diagnosis data nodes w/o disk space and no blocked indices (#90772)

This adds a dedicated impact and a diagnosis for the case where data
nodes are out of disk space but not read-only blocked indices are in the
cluster.
Andrei Dan před 3 roky
rodič
revize
dac276902b

+ 6 - 0
docs/changelog/90772.yaml

@@ -0,0 +1,6 @@
+pr: 90772
+summary: Report impact and diagnosis data nodes w/o disk space and no blocked indices
+area: Health
+type: bug
+issues:
+ - 90442

+ 58 - 21
server/src/main/java/org/elasticsearch/health/node/DiskHealthIndicatorService.java

@@ -68,6 +68,7 @@ public class DiskHealthIndicatorService implements HealthIndicatorService {
     private static final String IMPACT_INGEST_AT_RISK_ID = "ingest_capability_at_risk";
     private static final String IMPACT_CLUSTER_STABILITY_AT_RISK_ID = "cluster_stability_at_risk";
     private static final String IMPACT_CLUSTER_FUNCTIONALITY_UNAVAILABLE_ID = "cluster_functionality_unavailable";
+    private static final String IMPACT_DATA_NODE_WITHOUT_DISK_SPACE = "data_node_without_disk_space";
 
     private final ClusterService clusterService;
 
@@ -292,6 +293,26 @@ public class DiskHealthIndicatorService implements HealthIndicatorService {
                         )
                     );
                 }
+                // data nodes don't have space, but no indices had the write block in the cluster
+                if (hasUnhealthyDataNodes()) {
+                    impacts.add(
+                        new HealthIndicatorImpact(
+                            NAME,
+                            IMPACT_INGEST_AT_RISK_ID,
+                            2,
+                            String.format(
+                                Locale.ROOT,
+                                "%d %s %s out of disk or running low on disk space. %s %s cannot be used to store data anymore.",
+                                dataNodes.size(),
+                                regularNoun("node", dataNodes.size()),
+                                are(dataNodes.size()),
+                                these(dataNodes.size()),
+                                regularNoun("node", dataNodes.size())
+                            ),
+                            List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)
+                        )
+                    );
+                }
             }
             if (affectedRoles.contains(DiscoveryNodeRole.MASTER_ROLE)) {
                 impacts.add(
@@ -344,28 +365,44 @@ public class DiskHealthIndicatorService implements HealthIndicatorService {
                     );
                     affectedResources.add(indexResources);
                 }
-                diagnosisList.add(
-                    new Diagnosis(
-                        new Diagnosis.Definition(
-                            NAME,
-                            "add_disk_capacity_data_nodes",
-                            String.format(
-                                Locale.ROOT,
-                                "%d %s %s on nodes that have run or are likely to run out of disk space, "
-                                    + "this can temporarily disable writing on %s %s.",
-                                affectedIndices.size(),
-                                indices(affectedIndices.size()),
-                                regularVerb("reside", affectedIndices.size()),
-                                these(affectedIndices.size()),
-                                indices(affectedIndices.size())
+                if (affectedIndices.size() > 0) {
+                    diagnosisList.add(
+                        new Diagnosis(
+                            new Diagnosis.Definition(
+                                NAME,
+                                "add_disk_capacity_data_nodes",
+                                String.format(
+                                    Locale.ROOT,
+                                    "%d %s %s on nodes that have run or are likely to run out of disk space, "
+                                        + "this can temporarily disable writing on %s %s.",
+                                    affectedIndices.size(),
+                                    indices(affectedIndices.size()),
+                                    regularVerb("reside", affectedIndices.size()),
+                                    these(affectedIndices.size()),
+                                    indices(affectedIndices.size())
+                                ),
+                                "Enable autoscaling (if applicable), add disk capacity or free up disk space to resolve "
+                                    + "this. If you have already taken action please wait for the rebalancing to complete.",
+                                "https://ela.st/fix-data-disk"
                             ),
-                            "Enable autoscaling (if applicable), add disk capacity or free up disk space to resolve "
-                                + "this. If you have already taken action please wait for the rebalancing to complete.",
-                            "https://ela.st/fix-data-disk"
-                        ),
-                        affectedResources
-                    )
-                );
+                            affectedResources
+                        )
+                    );
+                } else {
+                    diagnosisList.add(
+                        new Diagnosis(
+                            new Diagnosis.Definition(
+                                NAME,
+                                "add_disk_capacity_data_nodes",
+                                "Disk is almost full.",
+                                "Enable autoscaling (if applicable), add disk capacity or free up disk space to resolve "
+                                    + "this. If you have already taken action please wait for the rebalancing to complete.",
+                                "https://ela.st/fix-data-disk"
+                            ),
+                            affectedResources
+                        )
+                    );
+                }
             }
             if (masterNodes.containsKey(HealthStatus.RED)) {
                 diagnosisList.add(createNonDataNodeDiagnosis(HealthStatus.RED, masterNodes.get(HealthStatus.RED), true));

+ 54 - 5
server/src/test/java/org/elasticsearch/health/node/DiskHealthIndicatorServiceTests.java

@@ -57,6 +57,7 @@ import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_CREATION_
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
 import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_VERSION_CREATED;
+import static org.elasticsearch.health.ImpactArea.DEPLOYMENT_MANAGEMENT;
 import static org.elasticsearch.health.node.DiskHealthIndicatorService.DiskHealthAnalyzer.INDICES_WITH_READONLY_BLOCK;
 import static org.elasticsearch.health.node.DiskHealthIndicatorService.DiskHealthAnalyzer.NODES_OVER_FLOOD_STAGE_WATERMARK;
 import static org.elasticsearch.health.node.DiskHealthIndicatorService.DiskHealthAnalyzer.NODES_OVER_HIGH_WATERMARK;
@@ -459,6 +460,54 @@ public class DiskHealthIndicatorServiceTests extends ESTestCase {
         assertThat(details.get(INDICES_WITH_READONLY_BLOCK), equalTo(blockedIndices.size()));
     }
 
+    public void testRedNodesWithoutAnyBlockedIndices() throws IOException {
+        HealthStatus expectedStatus = HealthStatus.RED;
+        Set<DiscoveryNode> discoveryNodes = createNodesWithAllRoles();
+        int numberOfRedNodes = randomIntBetween(1, discoveryNodes.size());
+        HealthInfo healthInfo = createHealthInfo(new HealthInfoConfig(HealthStatus.RED, numberOfRedNodes, discoveryNodes));
+        Set<String> nonRedNodeIds = healthInfo.diskInfoByNode()
+            .entrySet()
+            .stream()
+            .filter(entry -> entry.getValue().healthStatus().equals(expectedStatus) == false)
+            .map(Map.Entry::getKey)
+            .collect(Collectors.toSet());
+        Map<String, Set<String>> indexNameToNodeIdsMap = new HashMap<>();
+        int numberOfIndices = randomIntBetween(1, 1000);
+        for (int i = 0; i < numberOfIndices; i++) {
+            String indexName = randomAlphaOfLength(20);
+            // all blocked indices must be hosted by non-red indices
+            indexNameToNodeIdsMap.put(indexName, nonRedNodeIds);
+        }
+        ClusterService clusterService = createClusterService(Set.of(), discoveryNodes, indexNameToNodeIdsMap);
+        DiskHealthIndicatorService diskHealthIndicatorService = new DiskHealthIndicatorService(clusterService);
+        HealthIndicatorResult result = diskHealthIndicatorService.calculate(true, healthInfo);
+        assertThat(result.status(), equalTo(expectedStatus));
+        assertThat(result.impacts().size(), equalTo(3));
+        assertThat(
+            result.symptom(),
+            equalTo(
+                (numberOfRedNodes == 1 ? "1 node is" : numberOfRedNodes + " nodes ")
+                    + "with roles: [data, data_cold, data_content, data_frozen, data_hot, data_warm, ingest, master, ml, "
+                    + "remote_cluster_client, transform, voting_only] are out of disk or running low on disk space."
+            )
+        );
+        assertThat(
+            result.impacts().stream().flatMap(impact -> impact.impactAreas().stream()).collect(Collectors.toSet()),
+            is(Set.of(DEPLOYMENT_MANAGEMENT))
+        );
+        assertThat(result.diagnosisList().size(), is(1));
+        Diagnosis diagnosis = result.diagnosisList().get(0);
+        assertThat(diagnosis.definition().cause(), is("Disk is almost full."));
+        assertThat(diagnosis.affectedResources().size(), is(1));
+        assertThat(diagnosis.affectedResources().get(0).getNodes().size(), is(numberOfRedNodes));
+        Map<String, Object> details = xContentToMap(result.details());
+        assertThat(details.get(NODES_WITH_ENOUGH_DISK_SPACE), equalTo(discoveryNodes.size() - numberOfRedNodes));
+        assertThat(details.get(NODES_WITH_UNKNOWN_DISK_STATUS), equalTo(0));
+        assertThat(details.get(NODES_OVER_HIGH_WATERMARK), equalTo(0));
+        assertThat(details.get(NODES_OVER_FLOOD_STAGE_WATERMARK), equalTo(numberOfRedNodes));
+        assertThat(details.get(INDICES_WITH_READONLY_BLOCK), equalTo(0));
+    }
+
     public void testMissingHealthInfo() {
         Set<DiscoveryNode> discoveryNodes = createNodesWithAllRoles();
         Set<DiscoveryNode> discoveryNodesInClusterState = new HashSet<>(discoveryNodes);
@@ -528,10 +577,10 @@ public class DiskHealthIndicatorServiceTests extends ESTestCase {
         assertThat(impacts.size(), equalTo(2));
         assertThat(impacts.get(0).impactDescription(), equalTo("Cluster stability might be impaired."));
         assertThat(impacts.get(0).severity(), equalTo(1));
-        assertThat(impacts.get(0).impactAreas(), equalTo(List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)));
+        assertThat(impacts.get(0).impactAreas(), equalTo(List.of(DEPLOYMENT_MANAGEMENT)));
         assertThat(impacts.get(1).impactDescription(), equalTo("The [" + otherRole.roleName() + "] functionality might be impaired."));
         assertThat(impacts.get(1).severity(), equalTo(3));
-        assertThat(impacts.get(1).impactAreas(), equalTo(List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)));
+        assertThat(impacts.get(1).impactAreas(), equalTo(List.of(DEPLOYMENT_MANAGEMENT)));
 
         List<Diagnosis> diagnosisList = result.diagnosisList();
         assertThat(diagnosisList.size(), equalTo(1));
@@ -584,7 +633,7 @@ public class DiskHealthIndicatorServiceTests extends ESTestCase {
         assertThat(impacts.size(), equalTo(1));
         assertThat(impacts.get(0).impactDescription(), equalTo("The [" + displayRoles + "] functionality might be impaired."));
         assertThat(impacts.get(0).severity(), equalTo(3));
-        assertThat(impacts.get(0).impactAreas(), equalTo(List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)));
+        assertThat(impacts.get(0).impactAreas(), equalTo(List.of(DEPLOYMENT_MANAGEMENT)));
         List<Diagnosis> diagnosisList = result.diagnosisList();
         assertThat(diagnosisList.size(), equalTo(1));
         Diagnosis diagnosis = diagnosisList.get(0);
@@ -648,13 +697,13 @@ public class DiskHealthIndicatorServiceTests extends ESTestCase {
         assertThat(impacts.get(0).impactAreas(), equalTo(List.of(ImpactArea.INGEST)));
         assertThat(impacts.get(1).impactDescription(), equalTo("Cluster stability might be impaired."));
         assertThat(impacts.get(1).severity(), equalTo(1));
-        assertThat(impacts.get(1).impactAreas(), equalTo(List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)));
+        assertThat(impacts.get(1).impactAreas(), equalTo(List.of(DEPLOYMENT_MANAGEMENT)));
         assertThat(
             impacts.get(2).impactDescription(),
             equalTo("The " + otherRoles.stream().map(DiscoveryNodeRole::roleName).sorted().toList() + " functionality might be impaired.")
         );
         assertThat(impacts.get(2).severity(), equalTo(3));
-        assertThat(impacts.get(2).impactAreas(), equalTo(List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)));
+        assertThat(impacts.get(2).impactAreas(), equalTo(List.of(DEPLOYMENT_MANAGEMENT)));
 
         List<Diagnosis> diagnosisList = result.diagnosisList();
         assertThat(diagnosisList.size(), equalTo(3));