浏览代码

Log details of non-green indicators in HealthPeriodicLogger (#108266)

* Log details of non-green indicators in HealthPeriodicLogger

This commit adds the details of each indicator that is not green to the fields
logged by `HealthPeriodicLogger`.

An example of a regular (green) log message:

```
[2024-05-03T13:42:34,346][INFO ][o.e.h.HealthPeriodicLogger] [runTask-0] elasticsearch.health.data_stream_lifecycle.status="green" elasticsearch.health.disk.status="green" elasticsearch.health.ilm.status="green" elasticsearch.health.master_is_stable.status="green" elasticsearch.health.overall.status="green" elasticsearch.health.repository_integrity.status="green" elasticsearch.health.shards_availability.status="green" elasticsearch.health.shards_capacity.status="green" elasticsearch.health.slm.status="green" message="health=green"
```

And a message with details while the cluster is non-green:

```
[2024-05-03T13:43:34,339][INFO ][o.e.h.HealthPeriodicLogger] [runTask-0] elasticsearch.health.data_stream_lifecycle.status="green" elasticsearch.health.disk.status="green" elasticsearch.health.ilm.status="green" elasticsearch.health.master_is_stable.status="green" elasticsearch.health.overall.status="yellow" elasticsearch.health.repository_integrity.status="green" elasticsearch.health.shards_availability.details="{"initializing_primaries":0,"creating_replicas":0,"started_replicas":0,"unassigned_primaries":0,"restarting_replicas":0,"creating_primaries":0,"initializing_replicas":0,"unassigned_replicas":1,"started_primaries":2,"restarting_primaries":0}" elasticsearch.health.shards_availability.status="yellow" elasticsearch.health.shards_capacity.status="green" elasticsearch.health.slm.status="green" message="health=yellow [shards_availability]"
```

* Update docs/changelog/108266.yaml
Lee Hinman 1 年之前
父节点
当前提交
5361027989

+ 5 - 0
docs/changelog/108266.yaml

@@ -0,0 +1,5 @@
+pr: 108266
+summary: Log details of non-green indicators in `HealthPeriodicLogger`
+area: Health
+type: enhancement
+issues: []

+ 8 - 1
server/src/main/java/org/elasticsearch/health/HealthPeriodicLogger.java

@@ -17,6 +17,7 @@ import org.elasticsearch.cluster.ClusterChangedEvent;
 import org.elasticsearch.cluster.ClusterStateListener;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.component.AbstractLifecycleComponent;
 import org.elasticsearch.common.component.Lifecycle;
 import org.elasticsearch.common.component.LifecycleListener;
@@ -311,7 +312,7 @@ public class HealthPeriodicLogger extends AbstractLifecycleComponent implements
                 RunOnce release = new RunOnce(currentlyRunning::release);
                 try {
                     ActionListener<List<HealthIndicatorResult>> listenerWithRelease = ActionListener.runAfter(resultsListener, release);
-                    this.healthService.getHealth(this.client, null, false, 0, listenerWithRelease);
+                    this.healthService.getHealth(this.client, null, true, 0, listenerWithRelease);
                 } catch (Exception e) {
                     // In case of an exception before the listener was wired, we can release the flag here, and we feel safe
                     // that it will not release it again because this can only be run once.
@@ -359,6 +360,12 @@ public class HealthPeriodicLogger extends AbstractLifecycleComponent implements
                 String.format(Locale.ROOT, "%s.%s.status", HEALTH_FIELD_PREFIX, indicatorResult.name()),
                 indicatorResult.status().xContentValue()
             );
+            if (GREEN.equals(indicatorResult.status()) == false && indicatorResult.details() != null) {
+                result.put(
+                    String.format(Locale.ROOT, "%s.%s.details", HEALTH_FIELD_PREFIX, indicatorResult.name()),
+                    Strings.toString(indicatorResult.details())
+                );
+            }
         });
 
         // message field. Show the non-green indicators if they exist.

+ 1 - 1
server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java

@@ -2168,7 +2168,7 @@ public class ShardsAvailabilityHealthIndicatorServiceTests extends ESTestCase {
             .build();
     }
 
-    private static Map<String, Object> addDefaults(Map<String, Object> override) {
+    public static Map<String, Object> addDefaults(Map<String, Object> override) {
         return Map.of(
             "unassigned_primaries",
             override.getOrDefault("unassigned_primaries", 0),

+ 76 - 6
server/src/test/java/org/elasticsearch/health/HealthPeriodicLoggerTests.java

@@ -18,7 +18,9 @@ import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodeUtils;
+import org.elasticsearch.cluster.routing.allocation.shards.ShardsAvailabilityHealthIndicatorServiceTests;
 import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.component.Lifecycle;
 import org.elasticsearch.common.logging.ESLogMessage;
 import org.elasticsearch.common.scheduler.SchedulerEngine;
@@ -122,13 +124,36 @@ public class HealthPeriodicLoggerTests extends ESTestCase {
 
         Map<String, Object> loggerResults = HealthPeriodicLogger.convertToLoggedFields(results);
 
-        // verify that the number of fields is the number of indicators + 2 (for overall and for message)
-        assertThat(loggerResults.size(), equalTo(results.size() + 2));
+        // verify that the number of fields is the number of indicators + 4
+        // (for overall and for message, plus details for the two yellow indicators)
+        assertThat(loggerResults.size(), equalTo(results.size() + 4));
 
         // test indicator status
         assertThat(loggerResults.get(makeHealthStatusString("master_is_stable")), equalTo("green"));
         assertThat(loggerResults.get(makeHealthStatusString("disk")), equalTo("yellow"));
+        assertThat(
+            loggerResults.get(makeHealthDetailsString("disk")),
+            equalTo(
+                getTestIndicatorResults().stream()
+                    .filter(i -> i.name().equals("disk"))
+                    .findFirst()
+                    .map(HealthIndicatorResult::details)
+                    .map(Strings::toString)
+                    .orElseThrow()
+            )
+        );
         assertThat(loggerResults.get(makeHealthStatusString("shards_availability")), equalTo("yellow"));
+        assertThat(
+            loggerResults.get(makeHealthDetailsString("shards_availability")),
+            equalTo(
+                getTestIndicatorResults().stream()
+                    .filter(i -> i.name().equals("shards_availability"))
+                    .findFirst()
+                    .map(HealthIndicatorResult::details)
+                    .map(Strings::toString)
+                    .orElseThrow()
+            )
+        );
 
         // test calculated overall status
         assertThat(loggerResults.get(makeHealthStatusString("overall")), equalTo(overallStatus.xContentValue()));
@@ -751,8 +776,35 @@ public class HealthPeriodicLoggerTests extends ESTestCase {
 
     private List<HealthIndicatorResult> getTestIndicatorResults() {
         var networkLatency = new HealthIndicatorResult("master_is_stable", GREEN, null, null, null, null);
-        var slowTasks = new HealthIndicatorResult("disk", YELLOW, null, null, null, null);
-        var shardsAvailable = new HealthIndicatorResult("shards_availability", YELLOW, null, null, null, null);
+        var slowTasks = new HealthIndicatorResult(
+            "disk",
+            YELLOW,
+            null,
+            new SimpleHealthIndicatorDetails(
+                Map.of(
+                    "indices_with_readonly_block",
+                    0,
+                    "nodes_with_enough_disk_space",
+                    1,
+                    "nodes_with_unknown_disk_status",
+                    0,
+                    "nodes_over_high_watermark",
+                    0,
+                    "nodes_over_flood_stage_watermark",
+                    1
+                )
+            ),
+            null,
+            null
+        );
+        var shardsAvailable = new HealthIndicatorResult(
+            "shards_availability",
+            YELLOW,
+            null,
+            new SimpleHealthIndicatorDetails(ShardsAvailabilityHealthIndicatorServiceTests.addDefaults(Map.of())),
+            null,
+            null
+        );
 
         return List.of(networkLatency, slowTasks, shardsAvailable);
     }
@@ -760,7 +812,14 @@ public class HealthPeriodicLoggerTests extends ESTestCase {
     private List<HealthIndicatorResult> getTestIndicatorResultsAllGreen() {
         var networkLatency = new HealthIndicatorResult("master_is_stable", GREEN, null, null, null, null);
         var slowTasks = new HealthIndicatorResult("disk", GREEN, null, null, null, null);
-        var shardsAvailable = new HealthIndicatorResult("shards_availability", GREEN, null, null, null, null);
+        var shardsAvailable = new HealthIndicatorResult(
+            "shards_availability",
+            GREEN,
+            null,
+            new SimpleHealthIndicatorDetails(ShardsAvailabilityHealthIndicatorServiceTests.addDefaults(Map.of())),
+            null,
+            null
+        );
 
         return List.of(networkLatency, slowTasks, shardsAvailable);
     }
@@ -768,7 +827,14 @@ public class HealthPeriodicLoggerTests extends ESTestCase {
     private List<HealthIndicatorResult> getTestIndicatorResultsWithRed() {
         var networkLatency = new HealthIndicatorResult("master_is_stable", GREEN, null, null, null, null);
         var slowTasks = new HealthIndicatorResult("disk", GREEN, null, null, null, null);
-        var shardsAvailable = new HealthIndicatorResult("shards_availability", RED, null, null, null, null);
+        var shardsAvailable = new HealthIndicatorResult(
+            "shards_availability",
+            RED,
+            null,
+            new SimpleHealthIndicatorDetails(ShardsAvailabilityHealthIndicatorServiceTests.addDefaults(Map.of("unassigned_primaries", 1))),
+            null,
+            null
+        );
 
         return List.of(networkLatency, slowTasks, shardsAvailable);
     }
@@ -777,6 +843,10 @@ public class HealthPeriodicLoggerTests extends ESTestCase {
         return String.format(Locale.ROOT, "%s.%s.status", HealthPeriodicLogger.HEALTH_FIELD_PREFIX, key);
     }
 
+    private String makeHealthDetailsString(String key) {
+        return String.format(Locale.ROOT, "%s.%s.details", HealthPeriodicLogger.HEALTH_FIELD_PREFIX, key);
+    }
+
     private HealthPeriodicLogger createAndInitHealthPeriodicLogger(
         ClusterService clusterService,
         HealthService testHealthService,