|
@@ -7,12 +7,18 @@
|
|
|
|
|
|
package org.elasticsearch.xpack.ssl;
|
|
|
|
|
|
+import org.apache.logging.log4j.Level;
|
|
|
+import org.apache.logging.log4j.LogManager;
|
|
|
+import org.apache.logging.log4j.Logger;
|
|
|
+import org.apache.logging.log4j.util.Strings;
|
|
|
import org.elasticsearch.common.settings.MockSecureSettings;
|
|
|
import org.elasticsearch.common.settings.Settings;
|
|
|
+import org.elasticsearch.core.TimeValue;
|
|
|
import org.elasticsearch.env.Environment;
|
|
|
import org.elasticsearch.env.TestEnvironment;
|
|
|
import org.elasticsearch.test.InternalTestCluster.RestartCallback;
|
|
|
import org.elasticsearch.test.SecurityIntegTestCase;
|
|
|
+import org.elasticsearch.test.junit.annotations.TestLogging;
|
|
|
import org.junit.BeforeClass;
|
|
|
|
|
|
import java.io.IOException;
|
|
@@ -23,22 +29,39 @@ import java.nio.file.Path;
|
|
|
import java.nio.file.StandardCopyOption;
|
|
|
import java.util.concurrent.CountDownLatch;
|
|
|
|
|
|
+/**
|
|
|
+ * Start a cluster. Restart a node with a bad Transport keystore so it can't rejoin.
|
|
|
+ * Update the Transport keystore, so ES can reload it at runtime. Verify reload was OK
|
|
|
+ * by verifying if the cluster goes back to normal.
|
|
|
+ */
|
|
|
public class SSLReloadDuringStartupIntegTests extends SecurityIntegTestCase {
|
|
|
+ private final Logger LOGGER = LogManager.getLogger(SSLReloadDuringStartupIntegTests.class);
|
|
|
+
|
|
|
+ private final String goodKeyStoreFilePath = "/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode.jks";
|
|
|
+ private final String badKeyStoreFilePath = "/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode_updated.jks";
|
|
|
|
|
|
@BeforeClass
|
|
|
public static void skipInFips() {
|
|
|
assumeFalse("Can't use JKS keystores in FIPS JVM", inFipsJvm());
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Called for each node. Copy the original testnode.jks file into each node's config directory.
|
|
|
+ *
|
|
|
+ * @param nodeOrdinal Number of the node in the cluster.
|
|
|
+ * @param otherSettings Pass through settings for this test.
|
|
|
+ * @return Node settings with overrides for Transport SSL, so the test can update the Transport keystore file twice.
|
|
|
+ */
|
|
|
@Override
|
|
|
public Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
|
|
|
Settings settings = super.nodeSettings(nodeOrdinal, otherSettings);
|
|
|
Environment tmpEnv = TestEnvironment.newEnvironment(settings);
|
|
|
- // each node gets its own keystore under its home/config dir
|
|
|
- Path origKeystorePath = getDataPath("/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode.jks");
|
|
|
- Path nodeSpecificPath = tmpEnv.configFile().resolve("testnode.jks");
|
|
|
+
|
|
|
+ // For each node, copy the original testnode.jks into each node's config directory.
|
|
|
+ Path nodeKeystorePath = tmpEnv.configFile().resolve("testnode.jks");
|
|
|
try {
|
|
|
- Files.copy(origKeystorePath, nodeSpecificPath, StandardCopyOption.REPLACE_EXISTING);
|
|
|
+ Path goodKeystorePath = getDataPath(goodKeyStoreFilePath);
|
|
|
+ Files.copy(goodKeystorePath, nodeKeystorePath, StandardCopyOption.REPLACE_EXISTING);
|
|
|
} catch (IOException e) {
|
|
|
throw new UncheckedIOException(e);
|
|
|
}
|
|
@@ -47,7 +70,7 @@ public class SSLReloadDuringStartupIntegTests extends SecurityIntegTestCase {
|
|
|
.put(settings.filter((s) -> s.startsWith("xpack.security.transport.ssl.") == false), false);
|
|
|
MockSecureSettings secureSettings = new MockSecureSettings();
|
|
|
builder.setSecureSettings(secureSettings);
|
|
|
- builder.put("xpack.security.transport.ssl.keystore.path", nodeSpecificPath)
|
|
|
+ builder.put("xpack.security.transport.ssl.keystore.path", nodeKeystorePath)
|
|
|
.put("resource.reload.interval.high", "1s")
|
|
|
.put("xpack.security.transport.ssl.enabled", true);
|
|
|
secureSettings.setString("xpack.security.transport.ssl.keystore.secure_password", "testnode");
|
|
@@ -59,66 +82,125 @@ public class SSLReloadDuringStartupIntegTests extends SecurityIntegTestCase {
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
- @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/77490")
|
|
|
+ /**
|
|
|
+ * This class start a cluster. For this test, restart a random node.
|
|
|
+ * While stopped, replace the Transport keystore with a bad one, so the node cannot rejoin the cluster.
|
|
|
+ * While restarting, replace the keystore with a good one, and verify if ES reloaded it by checking it if rejoined the cluster.
|
|
|
+ *
|
|
|
+ * @throws Exception Compare ES startup logs to diagnostic and timing logs for the test, to narrow down why ES startup failed.
|
|
|
+ */
|
|
|
+ @TestLogging(value = "org.elasticsearch.xpack.ssl.SSLReloadDuringStartupIntegTests:TRACE", reason = "See Keystore update vs ES restart")
|
|
|
public void testReloadDuringStartup() throws Exception {
|
|
|
- final String node = randomFrom(internalCluster().getNodeNames());
|
|
|
- final Environment env = internalCluster().getInstance(Environment.class, node);
|
|
|
- // this latch is to synchronize the start of the thread that updates the TLS config
|
|
|
- // and the continuation of the node restart
|
|
|
- final CountDownLatch latch = new CountDownLatch(2);
|
|
|
- // this latch is used by the test to signal when the updated TLS configuration has been
|
|
|
- // written so that the test should proceed with checks to ensure the node rejoins the cluster
|
|
|
- final CountDownLatch writtenLatch = new CountDownLatch(1);
|
|
|
-
|
|
|
- // restart the node
|
|
|
- internalCluster().restartNode(node, new RestartCallback() {
|
|
|
+ final String[] nodeNames = internalCluster().getNodeNames();
|
|
|
+ final String nodeName = randomFrom(nodeNames);
|
|
|
+ final Environment env = internalCluster().getInstance(Environment.class, nodeName);
|
|
|
+ final CountDownLatch beforeKeystoreFix = new CountDownLatch(2); // SYNC: Cert update & ES restart
|
|
|
+ final CountDownLatch afterKeystoreFix = new CountDownLatch(1); // SYNC: Verify cluster after cert update
|
|
|
+ final Path nodeKeystorePath = env.configFile().resolve("testnode.jks"); // all nodes have good keystore
|
|
|
+ final Path badKeystorePath = getDataPath(badKeyStoreFilePath); // stop a node, and apply this bad keystore
|
|
|
+ final Path goodKeystorePath = getDataPath(goodKeyStoreFilePath); // start the node, and apply this good keystore
|
|
|
+ assertTrue(Files.exists(nodeKeystorePath));
|
|
|
+ LOGGER.trace("Stopping node [{}] in {}-node cluster {}...", nodeName, nodeNames.length, nodeNames);
|
|
|
+ final long stopNanos = System.nanoTime();
|
|
|
+ internalCluster().restartNode(nodeName, new RestartCallback() {
|
|
|
@Override
|
|
|
public Settings onNodeStopped(String nodeName) throws Exception {
|
|
|
- Path origKeystorePath = getDataPath("/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode.jks");
|
|
|
- Path keystorePath = env.configFile().resolve("testnode.jks");
|
|
|
- Path updatedKeystorePath = getDataPath("/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode_updated.jks");
|
|
|
- assertTrue(Files.exists(keystorePath));
|
|
|
- // replace the keystore with one that will fail during handshaking
|
|
|
- copyAndAtomicMoveIfPossible(updatedKeystorePath, keystorePath);
|
|
|
- new Thread(() -> {
|
|
|
- latch.countDown();
|
|
|
+ LOGGER.debug("Node [{}] stopped in {}ms.", nodeName, TimeValue.timeValueNanos(System.nanoTime() - stopNanos).millisFrac());
|
|
|
+ atomicCopyIfPossible(badKeystorePath, nodeKeystorePath);
|
|
|
+ final Thread fixKeystoreThread = new Thread(() -> {
|
|
|
+ waitUntilNodeStartupIsReadyToBegin(beforeKeystoreFix); // SYNC: Cert update & ES restart
|
|
|
try {
|
|
|
- latch.await();
|
|
|
- Thread.sleep(randomLongBetween(1L, 2000L));
|
|
|
- } catch (InterruptedException e) {
|
|
|
- Thread.currentThread().interrupt();
|
|
|
- }
|
|
|
-
|
|
|
- // replace the bad keystore with a new copy of the good one
|
|
|
- try {
|
|
|
- copyAndAtomicMoveIfPossible(origKeystorePath, keystorePath);
|
|
|
- writtenLatch.countDown();
|
|
|
+ atomicCopyIfPossible(goodKeystorePath, nodeKeystorePath);
|
|
|
+ LOGGER.trace("Waiting for ES restart...");
|
|
|
+ afterKeystoreFix.countDown(); // SYNC: Cert update & ES restart
|
|
|
} catch (IOException e) {
|
|
|
throw new UncheckedIOException(e);
|
|
|
}
|
|
|
- }).start();
|
|
|
+ });
|
|
|
+ fixKeystoreThread.setName("Fix Keystore");
|
|
|
+ fixKeystoreThread.start();
|
|
|
+ waitUntilFixKeystoreIsReadyToBegin(beforeKeystoreFix); // SYNC: Cert update & ES restart
|
|
|
+ return super.onNodeStopped(nodeName); // ASSUME: RestartCallback will do ES start next
|
|
|
+ }
|
|
|
+ });
|
|
|
+ LOGGER.trace("Waiting for keystore fix...");
|
|
|
+ timed(LOGGER, Level.DEBUG, "Awaited {}ms. Verifying the cluster...", () -> {
|
|
|
+ try {
|
|
|
+ afterKeystoreFix.await(); // SYNC: Verify cluster after cert update
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ throw new RuntimeException(e);
|
|
|
+ }
|
|
|
+ });
|
|
|
+ timed(LOGGER, Level.TRACE, "Ensure cluster size consistency took {}ms.", this::ensureClusterSizeConsistency);
|
|
|
+ timed(LOGGER, Level.TRACE, "Ensure fully connected cluster took {}ms.", this::ensureFullyConnectedCluster);
|
|
|
+ }
|
|
|
|
|
|
- latch.countDown();
|
|
|
- try {
|
|
|
- latch.await();
|
|
|
- } catch (InterruptedException e) {
|
|
|
- Thread.currentThread().interrupt();
|
|
|
- }
|
|
|
- return super.onNodeStopped(nodeName);
|
|
|
+ private void waitUntilNodeStartupIsReadyToBegin(final CountDownLatch beforeKeystoreFix) {
|
|
|
+ LOGGER.trace("Waiting for ES start to begin...");
|
|
|
+ beforeKeystoreFix.countDown(); // SYNC: Cert update & ES restart
|
|
|
+ final long sleepMillis = randomLongBetween(1L, 2000L); // intended sleepMillis
|
|
|
+ timed(LOGGER, Level.DEBUG, "Awaited {}ms. Sleeping " + sleepMillis + "ms before fixing...", () -> {
|
|
|
+ try {
|
|
|
+ beforeKeystoreFix.await(); // SYNC: Cert update & ES restart
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ Thread.currentThread().interrupt();
|
|
|
+ }
|
|
|
+ });
|
|
|
+ timed(LOGGER, Level.DEBUG, "Slept {}ms, intended " + sleepMillis + "ms. Fixing can start now...", () -> {
|
|
|
+ try {
|
|
|
+ Thread.sleep(sleepMillis); // Simulate cert update delay relative to ES start
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ Thread.currentThread().interrupt();
|
|
|
}
|
|
|
});
|
|
|
- writtenLatch.await();
|
|
|
- ensureClusterSizeConsistency();
|
|
|
- ensureFullyConnectedCluster();
|
|
|
}
|
|
|
|
|
|
- private void copyAndAtomicMoveIfPossible(Path source, Path target) throws IOException {
|
|
|
+ private void waitUntilFixKeystoreIsReadyToBegin(final CountDownLatch beforeKeystoreFix) {
|
|
|
+ LOGGER.trace("Waiting for keystore fix to begin...");
|
|
|
+ beforeKeystoreFix.countDown(); // SYNC: Cert update & ES restart
|
|
|
+ timed(LOGGER, Level.DEBUG, "Awaited {}ms. Node can start now...", () -> {
|
|
|
+ try {
|
|
|
+ beforeKeystoreFix.await(); // SYNC: Cert update & ES restart
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ Thread.currentThread().interrupt();
|
|
|
+ }
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
+ static void timed(final Logger LOGGER, final Level level, final String message, final Runnable runnable) {
|
|
|
+ assert LOGGER != null;
|
|
|
+ assert level != null;
|
|
|
+ assert Strings.isEmpty(message) == false;
|
|
|
+ assert message.contains("{}ms") : "Message must contain {}ms";
|
|
|
+ assert message.replace("{}", "").contains("{}") == false : "Message can only contain one binding.";
|
|
|
+ assert runnable != null;
|
|
|
+ final long startNanos = System.nanoTime();
|
|
|
+ try {
|
|
|
+ runnable.run();
|
|
|
+ } finally {
|
|
|
+ LOGGER.log(level, message, TimeValue.timeValueNanos(System.nanoTime() - startNanos).millisFrac());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Copy a source file to a destination file, overwriting if necessary.
|
|
|
+ * Use an intermediate temporary file, to attempt an atomic move.
|
|
|
+ * If atomic move fails, fall back to non-atomic move.
|
|
|
+ * @param source File to be copied.
|
|
|
+ * @param target File to be created or overwritten.
|
|
|
+ * @throws IOException Cannot create temp file, or copy source file to temp file, or non-atomic move temp file to target file.
|
|
|
+ */
|
|
|
+ private void atomicCopyIfPossible(Path source, Path target) throws IOException {
|
|
|
+ LOGGER.trace("Copying [{}] to [{}]", source, target);
|
|
|
Path tmp = createTempFile();
|
|
|
+ LOGGER.trace("Created temporary file [{}]", tmp);
|
|
|
Files.copy(source, tmp, StandardCopyOption.REPLACE_EXISTING);
|
|
|
try {
|
|
|
Files.move(tmp, target, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
|
|
|
+ LOGGER.debug("Atomic move succeeded from [{}] to [{}]", tmp, target);
|
|
|
} catch (AtomicMoveNotSupportedException e) {
|
|
|
Files.move(tmp, target, StandardCopyOption.REPLACE_EXISTING);
|
|
|
+ LOGGER.debug("Non-atomic move succeeded from [{}] to [{}]", tmp, target);
|
|
|
}
|
|
|
}
|
|
|
}
|