|
@@ -11,7 +11,6 @@ package org.elasticsearch.monitor.fs;
|
|
|
import org.apache.logging.log4j.LogManager;
|
|
|
import org.apache.logging.log4j.Logger;
|
|
|
import org.apache.logging.log4j.message.ParameterizedMessage;
|
|
|
-import org.elasticsearch.common.Nullable;
|
|
|
import org.elasticsearch.common.UUIDs;
|
|
|
import org.elasticsearch.common.component.AbstractLifecycleComponent;
|
|
|
import org.elasticsearch.common.settings.ClusterSettings;
|
|
@@ -30,32 +29,26 @@ import java.nio.charset.StandardCharsets;
|
|
|
import java.nio.file.Files;
|
|
|
import java.nio.file.Path;
|
|
|
import java.nio.file.StandardOpenOption;
|
|
|
-import java.util.Set;
|
|
|
import java.util.function.LongSupplier;
|
|
|
-import java.util.stream.Collectors;
|
|
|
|
|
|
import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
|
|
|
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
|
|
|
|
|
|
/**
|
|
|
- * Runs periodically and attempts to create a temp file to see if the filesystem is writable. If not then it marks the
|
|
|
- * path as unhealthy.
|
|
|
+ * Runs periodically and attempts to create a temp file to see if the filesystem is writable. If not then it marks the path as unhealthy.
|
|
|
*/
|
|
|
public class FsHealthService extends AbstractLifecycleComponent implements NodeHealthService {
|
|
|
|
|
|
private static final Logger logger = LogManager.getLogger(FsHealthService.class);
|
|
|
private final ThreadPool threadPool;
|
|
|
+ private volatile StatusInfo statusInfo = new StatusInfo(HEALTHY, "not started");
|
|
|
private volatile boolean enabled;
|
|
|
- private volatile boolean brokenLock;
|
|
|
private final TimeValue refreshInterval;
|
|
|
private volatile TimeValue slowPathLoggingThreshold;
|
|
|
private final NodeEnvironment nodeEnv;
|
|
|
private final LongSupplier currentTimeMillisSupplier;
|
|
|
private volatile Scheduler.Cancellable scheduledFuture;
|
|
|
|
|
|
- @Nullable
|
|
|
- private volatile Set<Path> unhealthyPaths;
|
|
|
-
|
|
|
public static final Setting<Boolean> ENABLED_SETTING =
|
|
|
Setting.boolSetting("monitor.fs.health.enabled", true, Setting.Property.NodeScope, Setting.Property.Dynamic);
|
|
|
public static final Setting<TimeValue> REFRESH_INTERVAL_SETTING =
|
|
@@ -79,8 +72,7 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
|
|
|
|
|
|
@Override
|
|
|
protected void doStart() {
|
|
|
- scheduledFuture = threadPool.scheduleWithFixedDelay(new FsHealthMonitor(), refreshInterval,
|
|
|
- ThreadPool.Names.GENERIC);
|
|
|
+ scheduledFuture = threadPool.scheduleWithFixedDelay(new FsHealthMonitor(), refreshInterval, ThreadPool.Names.GENERIC);
|
|
|
}
|
|
|
|
|
|
@Override
|
|
@@ -102,29 +94,20 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
|
|
|
|
|
|
@Override
|
|
|
public StatusInfo getHealth() {
|
|
|
- StatusInfo statusInfo;
|
|
|
- Set<Path> unhealthyPaths = this.unhealthyPaths;
|
|
|
if (enabled == false) {
|
|
|
- statusInfo = new StatusInfo(HEALTHY, "health check disabled");
|
|
|
- } else if (brokenLock) {
|
|
|
- statusInfo = new StatusInfo(UNHEALTHY, "health check failed due to broken node lock");
|
|
|
- } else if (unhealthyPaths == null) {
|
|
|
- statusInfo = new StatusInfo(HEALTHY, "health check passed");
|
|
|
- } else {
|
|
|
- String info = "health check failed on [" + unhealthyPaths.stream()
|
|
|
- .map(k -> k.toString()).collect(Collectors.joining(",")) + "]";
|
|
|
- statusInfo = new StatusInfo(UNHEALTHY, info);
|
|
|
+ return new StatusInfo(HEALTHY, "health check disabled");
|
|
|
}
|
|
|
+
|
|
|
return statusInfo;
|
|
|
}
|
|
|
|
|
|
- class FsHealthMonitor implements Runnable {
|
|
|
+ class FsHealthMonitor implements Runnable {
|
|
|
|
|
|
static final String TEMP_FILE_NAME = ".es_temp_file";
|
|
|
- private byte[] byteToWrite;
|
|
|
+ private final byte[] bytesToWrite;
|
|
|
|
|
|
FsHealthMonitor(){
|
|
|
- this.byteToWrite = UUIDs.randomBase64UUID().getBytes(StandardCharsets.UTF_8);
|
|
|
+ this.bytesToWrite = UUIDs.randomBase64UUID().getBytes(StandardCharsets.UTF_8);
|
|
|
}
|
|
|
|
|
|
@Override
|
|
@@ -140,23 +123,22 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
|
|
|
}
|
|
|
|
|
|
private void monitorFSHealth() {
|
|
|
- Path path;
|
|
|
+ final Path path;
|
|
|
try {
|
|
|
path = nodeEnv.nodeDataPath();
|
|
|
} catch (IllegalStateException e) {
|
|
|
+ statusInfo = new StatusInfo(UNHEALTHY, "health check failed due to broken node lock");
|
|
|
logger.error("health check failed", e);
|
|
|
- brokenLock = true;
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
- brokenLock = false;
|
|
|
- long executionStartTime = currentTimeMillisSupplier.getAsLong();
|
|
|
+ final long executionStartTime = currentTimeMillisSupplier.getAsLong();
|
|
|
try {
|
|
|
if (Files.exists(path)) {
|
|
|
- Path tempDataPath = path.resolve(TEMP_FILE_NAME);
|
|
|
+ final Path tempDataPath = path.resolve(TEMP_FILE_NAME);
|
|
|
Files.deleteIfExists(tempDataPath);
|
|
|
try (OutputStream os = Files.newOutputStream(tempDataPath, StandardOpenOption.CREATE_NEW)) {
|
|
|
- os.write(byteToWrite);
|
|
|
+ os.write(bytesToWrite);
|
|
|
IOUtils.fsync(tempDataPath, false);
|
|
|
}
|
|
|
Files.delete(tempDataPath);
|
|
@@ -167,11 +149,12 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
|
|
|
}
|
|
|
}
|
|
|
} catch (Exception ex) {
|
|
|
+ statusInfo = new StatusInfo(UNHEALTHY, "health check failed on [" + path + "]");
|
|
|
logger.error(new ParameterizedMessage("health check of [{}] failed", path), ex);
|
|
|
- unhealthyPaths = Set.of(path);
|
|
|
return;
|
|
|
}
|
|
|
- unhealthyPaths = null;
|
|
|
+
|
|
|
+ statusInfo = new StatusInfo(HEALTHY, "health check passed");
|
|
|
}
|
|
|
}
|
|
|
}
|