Browse Source

[ML] Move open job failure explanation out of root cause (#31925)

When an ML job cannot be allocated to a node the exception
contained an explanation of why the job couldn't be
allocated to each node in the cluster.  For large clusters
this was not particularly easy to read and made the error
displayed in the UI look very scary.

This commit changes the structure of the error to an outer
ElasticsearchException with a high level message and an
inner IllegalStateException containing the detailed
explanation.  Because the definition of root cause is the
innermost ElasticsearchException the detailed explanation
will not be the root cause (which is what Kibana displays).

Fixes #29950
David Roberts 7 years ago
parent
commit
d2461643cd

+ 11 - 6
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportOpenJobAction.java

@@ -678,10 +678,7 @@ public class TransportOpenJobAction extends TransportMasterNodeAction<OpenJobAct
             PersistentTasksCustomMetaData.Assignment assignment = selectLeastLoadedMlNode(params.getJobId(), clusterState,
                     maxConcurrentJobAllocations, fallbackMaxNumberOfOpenJobs, maxMachineMemoryPercent, logger);
             if (assignment.getExecutorNode() == null) {
-                String msg = "Could not open job because no suitable nodes were found, allocation explanation ["
-                        + assignment.getExplanation() + "]";
-                logger.warn("[{}] {}", params.getJobId(), msg);
-                throw new ElasticsearchStatusException(msg, RestStatus.TOO_MANY_REQUESTS);
+                throw makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
             }
         }
 
@@ -785,9 +782,9 @@ public class TransportOpenJobAction extends TransportMasterNodeAction<OpenJobAct
                 // and this is why this class must only be used when opening a job
                 if (assignment != null && assignment.equals(PersistentTasksCustomMetaData.INITIAL_ASSIGNMENT) == false &&
                         assignment.isAssigned() == false) {
+                    OpenJobAction.JobParams params = (OpenJobAction.JobParams) persistentTask.getParams();
                     // Assignment has failed on the master node despite passing our "fast fail" validation
-                    exception = new ElasticsearchStatusException("Could not open job because no suitable nodes were found, " +
-                            "allocation explanation [" + assignment.getExplanation() + "]", RestStatus.TOO_MANY_REQUESTS);
+                    exception = makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
                     // The persistent task should be cancelled so that the observed outcome is the
                     // same as if the "fast fail" validation on the coordinating node had failed
                     shouldCancel = true;
@@ -813,4 +810,12 @@ public class TransportOpenJobAction extends TransportMasterNodeAction<OpenJobAct
             }
         }
     }
+
+    static ElasticsearchException makeNoSuitableNodesException(Logger logger, String jobId, String explanation) {
+        String msg = "Could not open job because no suitable nodes were found, allocation explanation [" + explanation + "]";
+        logger.warn("[{}] {}", jobId, msg);
+        Exception detail = new IllegalStateException(msg);
+        return new ElasticsearchStatusException("Could not open job because no ML nodes with sufficient capacity were found",
+            RestStatus.TOO_MANY_REQUESTS, detail);
+    }
 }

+ 8 - 4
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/BasicDistributedJobsIT.java

@@ -366,10 +366,14 @@ public class BasicDistributedJobsIT extends BaseMlIntegTestCase {
 
         Exception e = expectThrows(ElasticsearchStatusException.class,
                 () -> client().execute(OpenJobAction.INSTANCE, openJobRequest).actionGet());
-        assertTrue(e.getMessage(),
-                e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
-        assertTrue(e.getMessage(), e.getMessage().endsWith("because not all primary shards are active for the following indices "
-                + "[.ml-state,.ml-anomalies-shared]]"));
+        assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
+        IllegalStateException detail = (IllegalStateException) e.getCause();
+        assertNotNull(detail);
+        String detailedMessage = detail.getMessage();
+        assertTrue(detailedMessage,
+            detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
+        assertTrue(detailedMessage, detailedMessage.endsWith("because not all primary shards are active for the following indices " +
+            "[.ml-state,.ml-anomalies-shared]]"));
 
         logger.info("Start data node");
         String nonMlNode = internalCluster().startNode(Settings.builder()

+ 13 - 9
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/TooManyJobsIT.java

@@ -92,18 +92,22 @@ public class TooManyJobsIT extends BaseMlIntegTestCase {
                 });
                 logger.info("Opened {}th job", i);
             } catch (ElasticsearchStatusException e) {
-                assertTrue(e.getMessage(),
-                        e.getMessage().startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
+                assertEquals("Could not open job because no ML nodes with sufficient capacity were found", e.getMessage());
+                IllegalStateException detail = (IllegalStateException) e.getCause();
+                assertNotNull(detail);
+                String detailedMessage = detail.getMessage();
+                assertTrue(detailedMessage,
+                    detailedMessage.startsWith("Could not open job because no suitable nodes were found, allocation explanation"));
                 if (expectMemoryLimitBeforeCountLimit) {
                     int expectedJobsAlreadyOpenOnNode = (i - 1) / numNodes;
-                    assertTrue(e.getMessage(),
-                            e.getMessage().endsWith("because this node has insufficient available memory. Available memory for ML [" +
-                                    maxMlMemoryPerNode + "], memory required by existing jobs [" +
-                                    (expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) +
-                                    "], estimated memory required for this job [" + memoryFootprintPerJob + "]]"));
+                    assertTrue(detailedMessage,
+                        detailedMessage.endsWith("because this node has insufficient available memory. Available memory for ML [" +
+                            maxMlMemoryPerNode + "], memory required by existing jobs [" +
+                            (expectedJobsAlreadyOpenOnNode * memoryFootprintPerJob) + "], estimated memory required for this job [" +
+                            memoryFootprintPerJob + "]]"));
                 } else {
-                    assertTrue(e.getMessage(), e.getMessage().endsWith("because this node is full. Number of opened jobs [" +
-                            maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
+                    assertTrue(detailedMessage, detailedMessage.endsWith("because this node is full. Number of opened jobs [" +
+                        maxNumberOfJobsPerNode + "], xpack.ml.max_open_jobs [" + maxNumberOfJobsPerNode + "]]"));
                 }
                 logger.info("good news everybody --> reached maximum number of allowed opened jobs, after trying to open the {}th job", i);