Explorar o código

Die with dignity

Today when a thread encounters a fatal unrecoverable error that
threatens the stability of the JVM, Elasticsearch marches on. This
includes out of memory errors, stack overflow errors and other errors
that leave the JVM in a questionable state. Instead, the Elasticsearch
JVM should die when these errors are encountered. This commit causes
this to be the case.

Relates #19272
Jason Tedor hai 9 anos
pai
achega
e86aa29f67

+ 1 - 1
core/build.gradle

@@ -56,7 +56,7 @@ dependencies {
   compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}"
   compile "org.apache.lucene:lucene-suggest:${versions.lucene}"
 
-  compile 'org.elasticsearch:securesm:1.0'
+  compile 'org.elasticsearch:securesm:1.1'
 
   // utilities
   compile 'net.sf.jopt-simple:jopt-simple:5.0.2'

+ 6 - 0
core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java

@@ -246,6 +246,12 @@ final class Bootstrap {
             // fail if somebody replaced the lucene jars
             checkLucene();
 
+            // install the default uncaught exception handler; must be done before security is
+            // initialized as we do not want to grant the runtime permission
+            // setDefaultUncaughtExceptionHandler
+            Thread.setDefaultUncaughtExceptionHandler(
+                new ElasticsearchUncaughtExceptionHandler(() -> Node.NODE_NAME_SETTING.get(settings)));
+
             INSTANCE.setup(true, settings, environment);
 
             INSTANCE.start();

+ 94 - 0
core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java

@@ -0,0 +1,94 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.bootstrap;
+
+import org.apache.lucene.index.MergePolicy;
+import org.elasticsearch.common.SuppressForbidden;
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;
+
+import java.io.IOError;
+import java.util.Objects;
+import java.util.function.Supplier;
+
+class ElasticsearchUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler {
+
+    private final Supplier<String> loggingPrefixSupplier;
+
+    ElasticsearchUncaughtExceptionHandler(final Supplier<String> loggingPrefixSupplier) {
+        this.loggingPrefixSupplier = Objects.requireNonNull(loggingPrefixSupplier);
+    }
+
+    @Override
+    public void uncaughtException(Thread t, Throwable e) {
+        if (isFatalUncaught(e)) {
+            try {
+                onFatalUncaught(t.getName(), e);
+            } finally {
+                // we use specific error codes in case the above notification failed, at least we
+                // will have some indication of the error bringing us down
+                if (e instanceof InternalError) {
+                    halt(128);
+                } else if (e instanceof OutOfMemoryError) {
+                    halt(127);
+                } else if (e instanceof StackOverflowError) {
+                    halt(126);
+                } else if (e instanceof UnknownError) {
+                    halt(125);
+                } else if (e instanceof IOError) {
+                    halt(124);
+                } else {
+                    halt(1);
+                }
+            }
+        } else {
+            onNonFatalUncaught(t.getName(), e);
+        }
+    }
+
+    // visible for testing
+    static boolean isFatalUncaught(Throwable e) {
+        return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause()));
+    }
+
+    private static boolean isFatalCause(Throwable cause) {
+        return cause instanceof Error;
+    }
+
+    // visible for testing
+    void onFatalUncaught(final String threadName, final Throwable t) {
+        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
+        logger.error("fatal error in thread [{}], exiting", t, threadName);
+    }
+
+    // visible for testing
+    void onNonFatalUncaught(final String threadName, final Throwable t) {
+        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
+        logger.warn("uncaught exception in thread [{}]", t, threadName);
+    }
+
+    // visible for testing
+    @SuppressForbidden(reason = "halt")
+    void halt(int status) {
+        // we halt to prevent shutdown hooks from running
+        Runtime.getRuntime().halt(status);
+    }
+
+}

+ 1 - 1
core/src/main/java/org/elasticsearch/bootstrap/Security.java

@@ -120,7 +120,7 @@ final class Security {
         Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults));
 
         // enable security manager
-        System.setSecurityManager(new SecureSM());
+        System.setSecurityManager(new SecureSM(new String[] { "org.elasticsearch.bootstrap." }));
 
         // do some basic tests
         selfTest();

+ 1 - 1
core/src/main/resources/org/elasticsearch/bootstrap/security.policy

@@ -24,7 +24,7 @@
 //// SecurityManager impl:
 //// Must have all permissions to properly perform access checks
 
-grant codeBase "${codebase.securesm-1.0.jar}" {
+grant codeBase "${codebase.securesm-1.1.jar}" {
   permission java.security.AllPermission;
 };
 

+ 152 - 0
core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java

@@ -0,0 +1,152 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.bootstrap;
+
+import org.apache.lucene.index.MergePolicy;
+import org.elasticsearch.test.ESTestCase;
+import org.junit.Before;
+
+import java.io.IOError;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+
+public class ElasticsearchUncaughtExceptionHandlerTests extends ESTestCase {
+
+    private Map<Class<? extends Error>, Integer> expectedStatus;
+
+    @Before
+    public void setUp() throws Exception {
+        super.setUp();
+        Map<Class<? extends Error>, Integer> expectedStatus = new HashMap<>();
+        expectedStatus.put(InternalError.class, 128);
+        expectedStatus.put(OutOfMemoryError.class, 127);
+        expectedStatus.put(StackOverflowError.class, 126);
+        expectedStatus.put(UnknownError.class, 125);
+        expectedStatus.put(IOError.class, 124);
+        this.expectedStatus = Collections.unmodifiableMap(expectedStatus);
+    }
+
+    public void testUncaughtError() throws InterruptedException {
+        final Error error = randomFrom(
+            new InternalError(),
+            new OutOfMemoryError(),
+            new StackOverflowError(),
+            new UnknownError(),
+            new IOError(new IOException("fatal")),
+            new Error() {});
+        final Thread thread = new Thread(() -> { throw error; });
+        final String name = randomAsciiOfLength(10);
+        thread.setName(name);
+        final AtomicBoolean halt = new AtomicBoolean();
+        final AtomicInteger observedStatus = new AtomicInteger();
+        final AtomicReference<String> threadNameReference = new AtomicReference<>();
+        final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
+        thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtError") {
+
+            @Override
+            void halt(int status) {
+                halt.set(true);
+                observedStatus.set(status);
+            }
+
+            @Override
+            void onFatalUncaught(String threadName, Throwable t) {
+                threadNameReference.set(threadName);
+                throwableReference.set(t);
+            }
+
+            @Override
+            void onNonFatalUncaught(String threadName, Throwable t) {
+                fail();
+            }
+
+        });
+        thread.start();
+        thread.join();
+        assertTrue(halt.get());
+        final int status;
+        if (expectedStatus.containsKey(error.getClass())) {
+            status = expectedStatus.get(error.getClass());
+        } else {
+            status = 1;
+        }
+        assertThat(observedStatus.get(), equalTo(status));
+        assertThat(threadNameReference.get(), equalTo(name));
+        assertThat(throwableReference.get(), equalTo(error));
+    }
+
+    public void testUncaughtException() throws InterruptedException {
+        final RuntimeException e = new RuntimeException("boom");
+        final Thread thread = new Thread(() -> { throw e; });
+        final String name = randomAsciiOfLength(10);
+        thread.setName(name);
+        final AtomicReference<String> threadNameReference = new AtomicReference<>();
+        final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
+        thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtException") {
+            @Override
+            void halt(int status) {
+                fail();
+            }
+
+            @Override
+            void onFatalUncaught(String threadName, Throwable t) {
+                fail();
+            }
+
+            @Override
+            void onNonFatalUncaught(String threadName, Throwable t) {
+                threadNameReference.set(threadName);
+                throwableReference.set(t);
+            }
+        });
+        thread.start();
+        thread.join();
+        assertThat(threadNameReference.get(), equalTo(name));
+        assertThat(throwableReference.get(), equalTo(e));
+    }
+
+    public void testIsFatalCause() {
+        assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null));
+        assertFatal(new OutOfMemoryError());
+        assertFatal(new StackOverflowError());
+        assertFatal(new InternalError());
+        assertFatal(new UnknownError());
+        assertFatal(new IOError(new IOException()));
+        assertNonFatal(new RuntimeException());
+        assertNonFatal(new UncheckedIOException(new IOException()));
+    }
+
+    private void assertFatal(Throwable cause) {
+        assertTrue(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
+    }
+
+    private void assertNonFatal(Throwable cause) {
+        assertFalse(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
+    }
+
+}

+ 0 - 1
distribution/licenses/securesm-1.0.jar.sha1

@@ -1 +0,0 @@
-c0c6cf986ba0057390bfcc80c366a0e3157f944b

+ 1 - 0
distribution/licenses/securesm-1.1.jar.sha1

@@ -0,0 +1 @@
+1e423447d020041534be94c0f31a49fbdc1f2950

+ 8 - 0
docs/reference/migration/migrate_5_0/packaging.asciidoc

@@ -55,3 +55,11 @@ from Elasticsearch.
 Additionally, it was previously possible to set any setting in
 Elasticsearch via JVM system properties. This has been removed from
 Elasticsearch.
+
+==== Dying on fatal errors
+
+Previous versions of Elasticsearch would not halt the JVM if out of memory errors or other fatal
+errors were encountered during the life of the Elasticsearch instance. Because such errors leave
+the JVM in a questionable state, the best course of action is to halt the JVM when this occurs.
+Starting in Elasticsearch 5.x, this is now the case. Operators should consider configuring their
+Elasticsearch services so that they respawn automatically in the case of such a fatal crash.

+ 2 - 0
docs/reference/setup.asciidoc

@@ -47,3 +47,5 @@ include::setup/bootstrap-checks.asciidoc[]
 include::setup/sysconfig.asciidoc[]
 
 include::setup/upgrade.asciidoc[]
+
+include::setup/stopping.asciidoc[]

+ 58 - 0
docs/reference/setup/stopping.asciidoc

@@ -0,0 +1,58 @@
+[[stopping-elasticsearch]]
+=== Stopping Elasticsearch
+
+An orderly shutdown of Elasticsearch ensures that Elasticsearch has a chance to clean up and close
+outstanding resources. For example, a node that is shut down in an orderly fashion will remove itself
+from the cluster, sync translogs to disk, and perform other related cleanup activities. You can help
+ensure an orderly shutdown by properly stopping Elasticsearch.
+
+If you're running Elasticsearch as a service, you can stop Elasticsearch via the service management
+functionality provided by your installation.
+
+If you're running Elasticsearch directly, you can stop Elasticsearch by sending control-C if you're
+running Elasticsearch in the console, or by sending `SIGTERM` to the Elasticsearch process on a
+POSIX system. You can obtain the PID to send the signal to via various tools (e.g., `ps` or `jps`):
+
+[source,sh]
+--------------------------------------------------
+$ jps | grep Elasticsearch
+14542 Elasticsearch
+--------------------------------------------------
+
+From the Elasticsearch startup logs:
+
+[source,sh]
+--------------------------------------------------
+[2016-07-07 12:26:18,908][INFO ][node                     ] [Reaper] version[5.0.0-alpha4], pid[15399], build[3f5b994/2016-06-27T16:23:46.861Z], OS[Mac OS X/10.11.5/x86_64], JVM[Oracle Corporation/Java HotSpot(TM) 64-Bit Server VM/1.8.0_92/25.92-b14]
+--------------------------------------------------
+
+Or by specifying a location to write a PID file to on startup (`-p <path>`):
+
+[source,sh]
+--------------------------------------------------
+$ ./bin/elasticsearch -p /tmp/elasticsearch-pid -d
+$ cat /tmp/elasticsearch-pid && echo
+15516
+$ kill -SIGTERM 15516
+--------------------------------------------------
+
+[[fatal-errors]]
+[float]
+=== Stopping on Fatal Errors
+
+During the life of the Elasticsearch virtual machine, certain fatal errors could arise that put the
+virtual machine in a questionable state. Such fatal errors include out of memory errors, internal
+errors in the virtual machine, and serious I/O errors.
+
+When Elasticsearch detects that the virtual machine has encountered such a fatal error, Elasticsearch
+will attempt to log the error and then will halt the virtual machine. When Elasticsearch initiates
+such a shutdown, it does not go through an orderly shutdown as described above. The Elasticsearch
+process will also return with a special status code indicating the nature of the error.
+
+[horizontal]
+JVM internal error:: 128
+Out of memory error:: 127
+Stack overflow error:: 126
+Unknown virtual machine error:: 125
+Serious I/O error:: 124
+Unknown fatal error:: 1

+ 1 - 1
test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java

@@ -150,7 +150,7 @@ public class BootstrapForTesting {
                         return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission);
                     }
                 });
-                System.setSecurityManager(new SecureSM(true));
+                System.setSecurityManager(SecureSM.createTestSecureSM());
                 Security.selfTest();
 
                 // guarantee plugin classes are initialized first, in case they have one-time hacks.