Browse Source

Die with dignity

Today when a thread encounters a fatal unrecoverable error that
threatens the stability of the JVM, Elasticsearch marches on. This
includes out of memory errors, stack overflow errors and other errors
that leave the JVM in a questionable state. Instead, the Elasticsearch
JVM should die when these errors are encountered. This commit causes
this to be the case.

Relates #19272
Jason Tedor 9 years ago
parent
commit
e86aa29f67

+ 1 - 1
core/build.gradle

@@ -56,7 +56,7 @@ dependencies {
   compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}"
   compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}"
   compile "org.apache.lucene:lucene-suggest:${versions.lucene}"
   compile "org.apache.lucene:lucene-suggest:${versions.lucene}"
 
 
-  compile 'org.elasticsearch:securesm:1.0'
+  compile 'org.elasticsearch:securesm:1.1'
 
 
   // utilities
   // utilities
   compile 'net.sf.jopt-simple:jopt-simple:5.0.2'
   compile 'net.sf.jopt-simple:jopt-simple:5.0.2'

+ 6 - 0
core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java

@@ -246,6 +246,12 @@ final class Bootstrap {
             // fail if somebody replaced the lucene jars
             // fail if somebody replaced the lucene jars
             checkLucene();
             checkLucene();
 
 
+            // install the default uncaught exception handler; must be done before security is
+            // initialized as we do not want to grant the runtime permission
+            // setDefaultUncaughtExceptionHandler
+            Thread.setDefaultUncaughtExceptionHandler(
+                new ElasticsearchUncaughtExceptionHandler(() -> Node.NODE_NAME_SETTING.get(settings)));
+
             INSTANCE.setup(true, settings, environment);
             INSTANCE.setup(true, settings, environment);
 
 
             INSTANCE.start();
             INSTANCE.start();

+ 94 - 0
core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java

@@ -0,0 +1,94 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.bootstrap;
+
+import org.apache.lucene.index.MergePolicy;
+import org.elasticsearch.common.SuppressForbidden;
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;
+
+import java.io.IOError;
+import java.util.Objects;
+import java.util.function.Supplier;
+
+class ElasticsearchUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler {
+
+    private final Supplier<String> loggingPrefixSupplier;
+
+    ElasticsearchUncaughtExceptionHandler(final Supplier<String> loggingPrefixSupplier) {
+        this.loggingPrefixSupplier = Objects.requireNonNull(loggingPrefixSupplier);
+    }
+
+    @Override
+    public void uncaughtException(Thread t, Throwable e) {
+        if (isFatalUncaught(e)) {
+            try {
+                onFatalUncaught(t.getName(), e);
+            } finally {
+                // we use specific error codes in case the above notification failed, at least we
+                // will have some indication of the error bringing us down
+                if (e instanceof InternalError) {
+                    halt(128);
+                } else if (e instanceof OutOfMemoryError) {
+                    halt(127);
+                } else if (e instanceof StackOverflowError) {
+                    halt(126);
+                } else if (e instanceof UnknownError) {
+                    halt(125);
+                } else if (e instanceof IOError) {
+                    halt(124);
+                } else {
+                    halt(1);
+                }
+            }
+        } else {
+            onNonFatalUncaught(t.getName(), e);
+        }
+    }
+
+    // visible for testing
+    static boolean isFatalUncaught(Throwable e) {
+        return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause()));
+    }
+
+    private static boolean isFatalCause(Throwable cause) {
+        return cause instanceof Error;
+    }
+
+    // visible for testing
+    void onFatalUncaught(final String threadName, final Throwable t) {
+        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
+        logger.error("fatal error in thread [{}], exiting", t, threadName);
+    }
+
+    // visible for testing
+    void onNonFatalUncaught(final String threadName, final Throwable t) {
+        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
+        logger.warn("uncaught exception in thread [{}]", t, threadName);
+    }
+
+    // visible for testing
+    @SuppressForbidden(reason = "halt")
+    void halt(int status) {
+        // we halt to prevent shutdown hooks from running
+        Runtime.getRuntime().halt(status);
+    }
+
+}

+ 1 - 1
core/src/main/java/org/elasticsearch/bootstrap/Security.java

@@ -120,7 +120,7 @@ final class Security {
         Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults));
         Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults));
 
 
         // enable security manager
         // enable security manager
-        System.setSecurityManager(new SecureSM());
+        System.setSecurityManager(new SecureSM(new String[] { "org.elasticsearch.bootstrap." }));
 
 
         // do some basic tests
         // do some basic tests
         selfTest();
         selfTest();

+ 1 - 1
core/src/main/resources/org/elasticsearch/bootstrap/security.policy

@@ -24,7 +24,7 @@
 //// SecurityManager impl:
 //// SecurityManager impl:
 //// Must have all permissions to properly perform access checks
 //// Must have all permissions to properly perform access checks
 
 
-grant codeBase "${codebase.securesm-1.0.jar}" {
+grant codeBase "${codebase.securesm-1.1.jar}" {
   permission java.security.AllPermission;
   permission java.security.AllPermission;
 };
 };
 
 

+ 152 - 0
core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java

@@ -0,0 +1,152 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.bootstrap;
+
+import org.apache.lucene.index.MergePolicy;
+import org.elasticsearch.test.ESTestCase;
+import org.junit.Before;
+
+import java.io.IOError;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+
+public class ElasticsearchUncaughtExceptionHandlerTests extends ESTestCase {
+
+    private Map<Class<? extends Error>, Integer> expectedStatus;
+
+    @Before
+    public void setUp() throws Exception {
+        super.setUp();
+        Map<Class<? extends Error>, Integer> expectedStatus = new HashMap<>();
+        expectedStatus.put(InternalError.class, 128);
+        expectedStatus.put(OutOfMemoryError.class, 127);
+        expectedStatus.put(StackOverflowError.class, 126);
+        expectedStatus.put(UnknownError.class, 125);
+        expectedStatus.put(IOError.class, 124);
+        this.expectedStatus = Collections.unmodifiableMap(expectedStatus);
+    }
+
+    public void testUncaughtError() throws InterruptedException {
+        final Error error = randomFrom(
+            new InternalError(),
+            new OutOfMemoryError(),
+            new StackOverflowError(),
+            new UnknownError(),
+            new IOError(new IOException("fatal")),
+            new Error() {});
+        final Thread thread = new Thread(() -> { throw error; });
+        final String name = randomAsciiOfLength(10);
+        thread.setName(name);
+        final AtomicBoolean halt = new AtomicBoolean();
+        final AtomicInteger observedStatus = new AtomicInteger();
+        final AtomicReference<String> threadNameReference = new AtomicReference<>();
+        final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
+        thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtError") {
+
+            @Override
+            void halt(int status) {
+                halt.set(true);
+                observedStatus.set(status);
+            }
+
+            @Override
+            void onFatalUncaught(String threadName, Throwable t) {
+                threadNameReference.set(threadName);
+                throwableReference.set(t);
+            }
+
+            @Override
+            void onNonFatalUncaught(String threadName, Throwable t) {
+                fail();
+            }
+
+        });
+        thread.start();
+        thread.join();
+        assertTrue(halt.get());
+        final int status;
+        if (expectedStatus.containsKey(error.getClass())) {
+            status = expectedStatus.get(error.getClass());
+        } else {
+            status = 1;
+        }
+        assertThat(observedStatus.get(), equalTo(status));
+        assertThat(threadNameReference.get(), equalTo(name));
+        assertThat(throwableReference.get(), equalTo(error));
+    }
+
+    public void testUncaughtException() throws InterruptedException {
+        final RuntimeException e = new RuntimeException("boom");
+        final Thread thread = new Thread(() -> { throw e; });
+        final String name = randomAsciiOfLength(10);
+        thread.setName(name);
+        final AtomicReference<String> threadNameReference = new AtomicReference<>();
+        final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
+        thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtException") {
+            @Override
+            void halt(int status) {
+                fail();
+            }
+
+            @Override
+            void onFatalUncaught(String threadName, Throwable t) {
+                fail();
+            }
+
+            @Override
+            void onNonFatalUncaught(String threadName, Throwable t) {
+                threadNameReference.set(threadName);
+                throwableReference.set(t);
+            }
+        });
+        thread.start();
+        thread.join();
+        assertThat(threadNameReference.get(), equalTo(name));
+        assertThat(throwableReference.get(), equalTo(e));
+    }
+
+    public void testIsFatalCause() {
+        assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null));
+        assertFatal(new OutOfMemoryError());
+        assertFatal(new StackOverflowError());
+        assertFatal(new InternalError());
+        assertFatal(new UnknownError());
+        assertFatal(new IOError(new IOException()));
+        assertNonFatal(new RuntimeException());
+        assertNonFatal(new UncheckedIOException(new IOException()));
+    }
+
+    private void assertFatal(Throwable cause) {
+        assertTrue(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
+    }
+
+    private void assertNonFatal(Throwable cause) {
+        assertFalse(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
+    }
+
+}

+ 0 - 1
distribution/licenses/securesm-1.0.jar.sha1

@@ -1 +0,0 @@
-c0c6cf986ba0057390bfcc80c366a0e3157f944b

+ 1 - 0
distribution/licenses/securesm-1.1.jar.sha1

@@ -0,0 +1 @@
+1e423447d020041534be94c0f31a49fbdc1f2950

+ 8 - 0
docs/reference/migration/migrate_5_0/packaging.asciidoc

@@ -55,3 +55,11 @@ from Elasticsearch.
 Additionally, it was previously possible to set any setting in
 Additionally, it was previously possible to set any setting in
 Elasticsearch via JVM system properties. This has been removed from
 Elasticsearch via JVM system properties. This has been removed from
 Elasticsearch.
 Elasticsearch.
+
+==== Dying on fatal errors
+
+Previous versions of Elasticsearch would not halt the JVM if out of memory errors or other fatal
+errors were encountered during the life of the Elasticsearch instance. Because such errors leave
+the JVM in a questionable state, the best course of action is to halt the JVM when this occurs.
+Starting in Elasticsearch 5.x, this is now the case. Operators should consider configuring their
+Elasticsearch services so that they respawn automatically in the case of such a fatal crash.

+ 2 - 0
docs/reference/setup.asciidoc

@@ -47,3 +47,5 @@ include::setup/bootstrap-checks.asciidoc[]
 include::setup/sysconfig.asciidoc[]
 include::setup/sysconfig.asciidoc[]
 
 
 include::setup/upgrade.asciidoc[]
 include::setup/upgrade.asciidoc[]
+
+include::setup/stopping.asciidoc[]

+ 58 - 0
docs/reference/setup/stopping.asciidoc

@@ -0,0 +1,58 @@
+[[stopping-elasticsearch]]
+=== Stopping Elasticsearch
+
+An orderly shutdown of Elasticsearch ensures that Elasticsearch has a chance to clean up and close
+outstanding resources. For example, a node that is shut down in an orderly fashion will remove itself
+from the cluster, sync translogs to disk, and perform other related cleanup activities. You can help
+ensure an orderly shutdown by properly stopping Elasticsearch.
+
+If you're running Elasticsearch as a service, you can stop Elasticsearch via the service management
+functionality provided by your installation.
+
+If you're running Elasticsearch directly, you can stop Elasticsearch by sending control-C if you're
+running Elasticsearch in the console, or by sending `SIGTERM` to the Elasticsearch process on a
+POSIX system. You can obtain the PID to send the signal to via various tools (e.g., `ps` or `jps`):
+
+[source,sh]
+--------------------------------------------------
+$ jps | grep Elasticsearch
+14542 Elasticsearch
+--------------------------------------------------
+
+From the Elasticsearch startup logs:
+
+[source,sh]
+--------------------------------------------------
+[2016-07-07 12:26:18,908][INFO ][node                     ] [Reaper] version[5.0.0-alpha4], pid[15399], build[3f5b994/2016-06-27T16:23:46.861Z], OS[Mac OS X/10.11.5/x86_64], JVM[Oracle Corporation/Java HotSpot(TM) 64-Bit Server VM/1.8.0_92/25.92-b14]
+--------------------------------------------------
+
+Or by specifying a location to write a PID file to on startup (`-p <path>`):
+
+[source,sh]
+--------------------------------------------------
+$ ./bin/elasticsearch -p /tmp/elasticsearch-pid -d
+$ cat /tmp/elasticsearch-pid && echo
+15516
+$ kill -SIGTERM 15516
+--------------------------------------------------
+
+[[fatal-errors]]
+[float]
+=== Stopping on Fatal Errors
+
+During the life of the Elasticsearch virtual machine, certain fatal errors could arise that put the
+virtual machine in a questionable state. Such fatal errors include out of memory errors, internal
+errors in the virtual machine, and serious I/O errors.
+
+When Elasticsearch detects that the virtual machine has encountered such a fatal error, Elasticsearch
+will attempt to log the error and then will halt the virtual machine. When Elasticsearch initiates
+such a shutdown, it does not go through an orderly shutdown as described above. The Elasticsearch
+process will also return with a special status code indicating the nature of the error.
+
+[horizontal]
+JVM internal error:: 128
+Out of memory error:: 127
+Stack overflow error:: 126
+Unknown virtual machine error:: 125
+Serious I/O error:: 124
+Unknown fatal error:: 1

+ 1 - 1
test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java

@@ -150,7 +150,7 @@ public class BootstrapForTesting {
                         return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission);
                         return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission);
                     }
                     }
                 });
                 });
-                System.setSecurityManager(new SecureSM(true));
+                System.setSecurityManager(SecureSM.createTestSecureSM());
                 Security.selfTest();
                 Security.selfTest();
 
 
                 // guarantee plugin classes are initialized first, in case they have one-time hacks.
                 // guarantee plugin classes are initialized first, in case they have one-time hacks.