Browse Source

Unmute test for TLS cert reloading during ES restart (#91147)

Justin Cranford 2 years ago
parent
commit
b97954f307

+ 129 - 47
x-pack/plugin/security/src/internalClusterTest/java/org/elasticsearch/xpack/ssl/SSLReloadDuringStartupIntegTests.java

@@ -7,12 +7,18 @@
 
 package org.elasticsearch.xpack.ssl;
 
+import org.apache.logging.log4j.Level;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.logging.log4j.util.Strings;
 import org.elasticsearch.common.settings.MockSecureSettings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.env.TestEnvironment;
 import org.elasticsearch.test.InternalTestCluster.RestartCallback;
 import org.elasticsearch.test.SecurityIntegTestCase;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.junit.BeforeClass;
 
 import java.io.IOException;
@@ -23,22 +29,39 @@ import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.util.concurrent.CountDownLatch;
 
+/**
+ * Start a cluster. Restart a node with a bad Transport keystore so it can't rejoin.
+ * Update the Transport keystore, so ES can reload it at runtime. Verify reload was OK
+ * by verifying if the cluster goes back to normal.
+ */
 public class SSLReloadDuringStartupIntegTests extends SecurityIntegTestCase {
+    private final Logger LOGGER = LogManager.getLogger(SSLReloadDuringStartupIntegTests.class);
+
+    private final String goodKeyStoreFilePath = "/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode.jks";
+    private final String badKeyStoreFilePath = "/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode_updated.jks";
 
     @BeforeClass
     public static void skipInFips() {
         assumeFalse("Can't use JKS keystores in FIPS JVM", inFipsJvm());
     }
 
+    /**
+     * Called for each node. Copy the original testnode.jks file into each node's config directory.
+     *
+     * @param nodeOrdinal   Number of the node in the cluster.
+     * @param otherSettings Pass through settings for this test.
+     * @return Node settings with overrides for Transport SSL, so the test can update the Transport keystore file twice.
+     */
     @Override
     public Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
         Settings settings = super.nodeSettings(nodeOrdinal, otherSettings);
         Environment tmpEnv = TestEnvironment.newEnvironment(settings);
-        // each node gets its own keystore under its home/config dir
-        Path origKeystorePath = getDataPath("/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode.jks");
-        Path nodeSpecificPath = tmpEnv.configFile().resolve("testnode.jks");
+
+        // For each node, copy the original testnode.jks into each node's config directory.
+        Path nodeKeystorePath = tmpEnv.configFile().resolve("testnode.jks");
         try {
-            Files.copy(origKeystorePath, nodeSpecificPath, StandardCopyOption.REPLACE_EXISTING);
+            Path goodKeystorePath = getDataPath(goodKeyStoreFilePath);
+            Files.copy(goodKeystorePath, nodeKeystorePath, StandardCopyOption.REPLACE_EXISTING);
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         }
@@ -47,7 +70,7 @@ public class SSLReloadDuringStartupIntegTests extends SecurityIntegTestCase {
             .put(settings.filter((s) -> s.startsWith("xpack.security.transport.ssl.") == false), false);
         MockSecureSettings secureSettings = new MockSecureSettings();
         builder.setSecureSettings(secureSettings);
-        builder.put("xpack.security.transport.ssl.keystore.path", nodeSpecificPath)
+        builder.put("xpack.security.transport.ssl.keystore.path", nodeKeystorePath)
             .put("resource.reload.interval.high", "1s")
             .put("xpack.security.transport.ssl.enabled", true);
         secureSettings.setString("xpack.security.transport.ssl.keystore.secure_password", "testnode");
@@ -59,66 +82,125 @@ public class SSLReloadDuringStartupIntegTests extends SecurityIntegTestCase {
         return true;
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/77490")
+    /**
+     * This class start a cluster. For this test, restart a random node.
+     * While stopped, replace the Transport keystore with a bad one, so the node cannot rejoin the cluster.
+     * While restarting, replace the keystore with a good one, and verify if ES reloaded it by checking it if rejoined the cluster.
+     *
+     * @throws Exception Compare ES startup logs to diagnostic and timing logs for the test, to narrow down why ES startup failed.
+     */
+    @TestLogging(value = "org.elasticsearch.xpack.ssl.SSLReloadDuringStartupIntegTests:TRACE", reason = "See Keystore update vs ES restart")
     public void testReloadDuringStartup() throws Exception {
-        final String node = randomFrom(internalCluster().getNodeNames());
-        final Environment env = internalCluster().getInstance(Environment.class, node);
-        // this latch is to synchronize the start of the thread that updates the TLS config
-        // and the continuation of the node restart
-        final CountDownLatch latch = new CountDownLatch(2);
-        // this latch is used by the test to signal when the updated TLS configuration has been
-        // written so that the test should proceed with checks to ensure the node rejoins the cluster
-        final CountDownLatch writtenLatch = new CountDownLatch(1);
-
-        // restart the node
-        internalCluster().restartNode(node, new RestartCallback() {
+        final String[] nodeNames = internalCluster().getNodeNames();
+        final String nodeName = randomFrom(nodeNames);
+        final Environment env = internalCluster().getInstance(Environment.class, nodeName);
+        final CountDownLatch beforeKeystoreFix = new CountDownLatch(2); // SYNC: Cert update & ES restart
+        final CountDownLatch afterKeystoreFix = new CountDownLatch(1); // SYNC: Verify cluster after cert update
+        final Path nodeKeystorePath = env.configFile().resolve("testnode.jks"); // all nodes have good keystore
+        final Path badKeystorePath = getDataPath(badKeyStoreFilePath); // stop a node, and apply this bad keystore
+        final Path goodKeystorePath = getDataPath(goodKeyStoreFilePath); // start the node, and apply this good keystore
+        assertTrue(Files.exists(nodeKeystorePath));
+        LOGGER.trace("Stopping node [{}] in {}-node cluster {}...", nodeName, nodeNames.length, nodeNames);
+        final long stopNanos = System.nanoTime();
+        internalCluster().restartNode(nodeName, new RestartCallback() {
             @Override
             public Settings onNodeStopped(String nodeName) throws Exception {
-                Path origKeystorePath = getDataPath("/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode.jks");
-                Path keystorePath = env.configFile().resolve("testnode.jks");
-                Path updatedKeystorePath = getDataPath("/org/elasticsearch/xpack/security/transport/ssl/certs/simple/testnode_updated.jks");
-                assertTrue(Files.exists(keystorePath));
-                // replace the keystore with one that will fail during handshaking
-                copyAndAtomicMoveIfPossible(updatedKeystorePath, keystorePath);
-                new Thread(() -> {
-                    latch.countDown();
+                LOGGER.debug("Node [{}] stopped in {}ms.", nodeName, TimeValue.timeValueNanos(System.nanoTime() - stopNanos).millisFrac());
+                atomicCopyIfPossible(badKeystorePath, nodeKeystorePath);
+                final Thread fixKeystoreThread = new Thread(() -> {
+                    waitUntilNodeStartupIsReadyToBegin(beforeKeystoreFix); // SYNC: Cert update & ES restart
                     try {
-                        latch.await();
-                        Thread.sleep(randomLongBetween(1L, 2000L));
-                    } catch (InterruptedException e) {
-                        Thread.currentThread().interrupt();
-                    }
-
-                    // replace the bad keystore with a new copy of the good one
-                    try {
-                        copyAndAtomicMoveIfPossible(origKeystorePath, keystorePath);
-                        writtenLatch.countDown();
+                        atomicCopyIfPossible(goodKeystorePath, nodeKeystorePath);
+                        LOGGER.trace("Waiting for ES restart...");
+                        afterKeystoreFix.countDown(); // SYNC: Cert update & ES restart
                     } catch (IOException e) {
                         throw new UncheckedIOException(e);
                     }
-                }).start();
+                });
+                fixKeystoreThread.setName("Fix Keystore");
+                fixKeystoreThread.start();
+                waitUntilFixKeystoreIsReadyToBegin(beforeKeystoreFix); // SYNC: Cert update & ES restart
+                return super.onNodeStopped(nodeName); // ASSUME: RestartCallback will do ES start next
+            }
+        });
+        LOGGER.trace("Waiting for keystore fix...");
+        timed(LOGGER, Level.DEBUG, "Awaited {}ms. Verifying the cluster...", () -> {
+            try {
+                afterKeystoreFix.await(); // SYNC: Verify cluster after cert update
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+        });
+        timed(LOGGER, Level.TRACE, "Ensure cluster size consistency took {}ms.", this::ensureClusterSizeConsistency);
+        timed(LOGGER, Level.TRACE, "Ensure fully connected cluster took {}ms.", this::ensureFullyConnectedCluster);
+    }
 
-                latch.countDown();
-                try {
-                    latch.await();
-                } catch (InterruptedException e) {
-                    Thread.currentThread().interrupt();
-                }
-                return super.onNodeStopped(nodeName);
+    private void waitUntilNodeStartupIsReadyToBegin(final CountDownLatch beforeKeystoreFix) {
+        LOGGER.trace("Waiting for ES start to begin...");
+        beforeKeystoreFix.countDown(); // SYNC: Cert update & ES restart
+        final long sleepMillis = randomLongBetween(1L, 2000L); // intended sleepMillis
+        timed(LOGGER, Level.DEBUG, "Awaited {}ms. Sleeping " + sleepMillis + "ms before fixing...", () -> {
+            try {
+                beforeKeystoreFix.await(); // SYNC: Cert update & ES restart
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        });
+        timed(LOGGER, Level.DEBUG, "Slept {}ms, intended " + sleepMillis + "ms. Fixing can start now...", () -> {
+            try {
+                Thread.sleep(sleepMillis); // Simulate cert update delay relative to ES start
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
             }
         });
-        writtenLatch.await();
-        ensureClusterSizeConsistency();
-        ensureFullyConnectedCluster();
     }
 
-    private void copyAndAtomicMoveIfPossible(Path source, Path target) throws IOException {
+    private void waitUntilFixKeystoreIsReadyToBegin(final CountDownLatch beforeKeystoreFix) {
+        LOGGER.trace("Waiting for keystore fix to begin...");
+        beforeKeystoreFix.countDown(); // SYNC: Cert update & ES restart
+        timed(LOGGER, Level.DEBUG, "Awaited {}ms. Node can start now...", () -> {
+            try {
+                beforeKeystoreFix.await(); // SYNC: Cert update & ES restart
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        });
+    }
+
+    static void timed(final Logger LOGGER, final Level level, final String message, final Runnable runnable) {
+        assert LOGGER != null;
+        assert level != null;
+        assert Strings.isEmpty(message) == false;
+        assert message.contains("{}ms") : "Message must contain {}ms";
+        assert message.replace("{}", "").contains("{}") == false : "Message can only contain one binding.";
+        assert runnable != null;
+        final long startNanos = System.nanoTime();
+        try {
+            runnable.run();
+        } finally {
+            LOGGER.log(level, message, TimeValue.timeValueNanos(System.nanoTime() - startNanos).millisFrac());
+        }
+    }
+
+    /**
+     * Copy a source file to a destination file, overwriting if necessary.
+     * Use an intermediate temporary file, to attempt an atomic move.
+     * If atomic move fails, fall back to non-atomic move.
+     * @param source File to be copied.
+     * @param target File to be created or overwritten.
+     * @throws IOException Cannot create temp file, or copy source file to temp file, or non-atomic move temp file to target file.
+     */
+    private void atomicCopyIfPossible(Path source, Path target) throws IOException {
+        LOGGER.trace("Copying [{}] to [{}]", source, target);
         Path tmp = createTempFile();
+        LOGGER.trace("Created temporary file [{}]", tmp);
         Files.copy(source, tmp, StandardCopyOption.REPLACE_EXISTING);
         try {
             Files.move(tmp, target, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
+            LOGGER.debug("Atomic move succeeded from [{}] to [{}]", tmp, target);
         } catch (AtomicMoveNotSupportedException e) {
             Files.move(tmp, target, StandardCopyOption.REPLACE_EXISTING);
+            LOGGER.debug("Non-atomic move succeeded from [{}] to [{}]", tmp, target);
         }
     }
 }