1
0
Эх сурвалжийг харах

Fix Stacktraces Taking Memory in ILM Error Step Serialization (#84266)

Don't serialize stack-traces to ILM error steps. This will take multiple kb
per index in the CS. In bad cases where an error might affect thousands of
indices this can mean hundreds of MB of heap memory used on every node in
the cluster just to keep the serialized stack-traces around in the CS
that provide limited value anyway.
Armin Braun 3 жил өмнө
parent
commit
e3a847562c

+ 5 - 0
docs/changelog/84266.yaml

@@ -0,0 +1,5 @@
+pr: 84266
+summary: Fix Stacktraces Taking Memory in ILM Error Step Serialization
+area: ILM+SLM
+type: bug
+issues: []

+ 1 - 1
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTask.java

@@ -271,7 +271,7 @@ public class ExecuteStepsUpdateTask extends IndexLifecycleClusterStateUpdateTask
         logger.warn(new ParameterizedMessage("policy [{}] for index [{}] failed on step [{}].", policy, index, startStep.getKey()), e);
     }
 
-    private ClusterState moveToErrorStep(final ClusterState state, Step.StepKey currentStepKey, Exception cause) throws IOException {
+    private ClusterState moveToErrorStep(final ClusterState state, Step.StepKey currentStepKey, Exception cause) {
         this.failure = cause;
         logger.warn(
             "policy [{}] for index [{}] failed on cluster state step [{}]. Moving to ERROR step",

+ 8 - 23
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java

@@ -17,14 +17,10 @@ import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
 import org.elasticsearch.cluster.metadata.Metadata;
 import org.elasticsearch.common.Strings;
-import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.license.XPackLicenseState;
-import org.elasticsearch.xcontent.ToXContent;
 import org.elasticsearch.xcontent.ToXContentObject;
-import org.elasticsearch.xcontent.XContentBuilder;
-import org.elasticsearch.xcontent.json.JsonXContent;
 import org.elasticsearch.xpack.core.ilm.ErrorStep;
 import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
 import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep;
@@ -38,8 +34,6 @@ import org.elasticsearch.xpack.core.ilm.RolloverAction;
 import org.elasticsearch.xpack.core.ilm.Step;
 import org.elasticsearch.xpack.core.ilm.TerminalPolicyStep;
 
-import java.io.IOException;
-import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
@@ -48,8 +42,8 @@ import java.util.Set;
 import java.util.function.BiFunction;
 import java.util.function.LongSupplier;
 
-import static org.elasticsearch.ElasticsearchException.REST_EXCEPTION_SKIP_STACK_TRACE;
 import static org.elasticsearch.cluster.metadata.LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY;
+import static org.elasticsearch.xcontent.ToXContent.EMPTY_PARAMS;
 
 /**
  * The {@link IndexLifecycleTransition} class handles cluster state transitions
@@ -61,9 +55,6 @@ import static org.elasticsearch.cluster.metadata.LifecycleExecutionState.ILM_CUS
  */
 public final class IndexLifecycleTransition {
     private static final Logger logger = LogManager.getLogger(IndexLifecycleTransition.class);
-    private static final ToXContent.Params STACKTRACE_PARAMS = new ToXContent.MapParams(
-        Collections.singletonMap(REST_EXCEPTION_SKIP_STACK_TRACE, "false")
-    );
 
     /**
      * Validates that the given transition from {@code currentStepKey} to {@code newStepKey} can be accomplished
@@ -158,14 +149,10 @@ public final class IndexLifecycleTransition {
         Exception cause,
         LongSupplier nowSupplier,
         BiFunction<IndexMetadata, Step.StepKey, Step> stepLookupFunction
-    ) throws IOException {
+    ) {
         IndexMetadata idxMeta = clusterState.getMetadata().index(index);
         IndexLifecycleMetadata ilmMeta = clusterState.metadata().custom(IndexLifecycleMetadata.TYPE);
         LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas().get(idxMeta.getLifecyclePolicyName());
-        XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder();
-        causeXContentBuilder.startObject();
-        ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause);
-        causeXContentBuilder.endObject();
         LifecycleExecutionState currentState = idxMeta.getLifecycleExecutionState();
         Step.StepKey currentStep;
         // if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information
@@ -189,7 +176,10 @@ public final class IndexLifecycleTransition {
 
         LifecycleExecutionState.Builder failedState = LifecycleExecutionState.builder(nextStepState);
         failedState.setFailedStep(currentStep.getName());
-        failedState.setStepInfo(BytesReference.bytes(causeXContentBuilder).utf8ToString());
+        failedState.setStepInfo(Strings.toString(((builder, params) -> {
+            ElasticsearchException.generateThrowableXContent(builder, EMPTY_PARAMS, cause);
+            return builder;
+        })));
         Step failedStep = stepLookupFunction.apply(idxMeta, currentStep);
 
         if (failedStep != null) {
@@ -444,20 +434,15 @@ public final class IndexLifecycleTransition {
      * @param stepInfo     the new step info to update
      * @return Updated cluster state with <code>stepInfo</code> if changed, otherwise the same cluster state
      * if no changes to step info exist
-     * @throws IOException if parsing step info fails
      */
-    static ClusterState addStepInfoToClusterState(Index index, ClusterState clusterState, ToXContentObject stepInfo) throws IOException {
+    static ClusterState addStepInfoToClusterState(Index index, ClusterState clusterState, ToXContentObject stepInfo) {
         IndexMetadata indexMetadata = clusterState.getMetadata().index(index);
         if (indexMetadata == null) {
             // This index doesn't exist anymore, we can't do anything
             return clusterState;
         }
         LifecycleExecutionState lifecycleState = indexMetadata.getLifecycleExecutionState();
-        final String stepInfoString;
-        try (XContentBuilder infoXContentBuilder = JsonXContent.contentBuilder()) {
-            stepInfo.toXContent(infoXContentBuilder, ToXContent.EMPTY_PARAMS);
-            stepInfoString = BytesReference.bytes(infoXContentBuilder).utf8ToString();
-        }
+        final String stepInfoString = Strings.toString(stepInfo);
         if (stepInfoString.equals(lifecycleState.stepInfo())) {
             return clusterState;
         }

+ 1 - 2
x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTask.java

@@ -20,7 +20,6 @@ import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.xpack.core.ilm.Step;
 
-import java.io.IOException;
 import java.util.function.BiFunction;
 import java.util.function.Consumer;
 import java.util.function.LongSupplier;
@@ -56,7 +55,7 @@ public class MoveToErrorStepUpdateTask extends ClusterStateUpdateTask {
     }
 
     @Override
-    public ClusterState execute(ClusterState currentState) throws IOException {
+    public ClusterState execute(ClusterState currentState) {
         IndexMetadata idxMeta = currentState.getMetadata().index(index);
         if (idxMeta == null) {
             // Index must have been since deleted, ignore it

+ 2 - 2
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/ExecuteStepsUpdateTaskTests.java

@@ -303,7 +303,7 @@ public class ExecuteStepsUpdateTaskTests extends ESTestCase {
         assertThat(lifecycleState.phaseTime(), nullValue());
         assertThat(lifecycleState.actionTime(), nullValue());
         assertThat(lifecycleState.stepInfo(), containsString("""
-            {"type":"runtime_exception","reason":"error","stack_trace":\""""));
+            {"type":"runtime_exception","reason":"error\""""));
     }
 
     public void testClusterWaitStepThrowsException() throws Exception {
@@ -322,7 +322,7 @@ public class ExecuteStepsUpdateTaskTests extends ESTestCase {
         assertThat(lifecycleState.phaseTime(), nullValue());
         assertThat(lifecycleState.actionTime(), nullValue());
         assertThat(lifecycleState.stepInfo(), containsString("""
-            {"type":"runtime_exception","reason":"error","stack_trace":\""""));
+            {"type":"runtime_exception","reason":"error\""""));
     }
 
     private void setStateToKey(StepKey stepKey) throws IOException {

+ 1 - 1
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransitionTests.java

@@ -346,7 +346,7 @@ public class IndexLifecycleTransitionTests extends ESTestCase {
             (idxMeta, stepKey) -> new MockStep(stepKey, nextStepKey)
         );
         assertClusterStateOnErrorStep(clusterState, index, currentStep, newClusterState, now, """
-            {"type":"illegal_argument_exception","reason":"non elasticsearch-exception","stack_trace":\"""");
+            {"type":"illegal_argument_exception","reason":"non elasticsearch-exception\"""");
     }
 
     public void testAddStepInfoToClusterState() throws IOException {

+ 1 - 10
x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/MoveToErrorStepUpdateTaskTests.java

@@ -13,13 +13,9 @@ import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.LifecycleExecutionState;
 import org.elasticsearch.cluster.metadata.Metadata;
-import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.test.ESTestCase;
-import org.elasticsearch.xcontent.ToXContent;
-import org.elasticsearch.xcontent.XContentBuilder;
-import org.elasticsearch.xcontent.json.JsonXContent;
 import org.elasticsearch.xpack.core.ilm.ErrorStep;
 import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata;
 import org.elasticsearch.xpack.core.ilm.LifecyclePolicy;
@@ -98,13 +94,8 @@ public class MoveToErrorStepUpdateTaskTests extends ESTestCase {
         assertThat(lifecycleState.actionTime(), nullValue());
         assertThat(lifecycleState.stepTime(), equalTo(now));
 
-        XContentBuilder causeXContentBuilder = JsonXContent.contentBuilder();
-        causeXContentBuilder.startObject();
-        ElasticsearchException.generateThrowableXContent(causeXContentBuilder, ToXContent.EMPTY_PARAMS, cause);
-        causeXContentBuilder.endObject();
-        String expectedCauseValue = BytesReference.bytes(causeXContentBuilder).utf8ToString();
         assertThat(lifecycleState.stepInfo(), containsString("""
-            {"type":"exception","reason":"THIS IS AN EXPECTED CAUSE","stack_trace":\""""));
+            {"type":"exception","reason":"THIS IS AN EXPECTED CAUSE\""""));
     }
 
     public void testExecuteNoopDifferentStep() throws IOException {