|
|
@@ -63,6 +63,7 @@ import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSizeSta
|
|
|
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
|
|
|
import org.elasticsearch.xpack.core.ml.notifications.NotificationsIndex;
|
|
|
import org.elasticsearch.xpack.ml.MachineLearning;
|
|
|
+import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
|
|
|
import org.elasticsearch.xpack.ml.job.process.autodetect.BlackHoleAutodetectProcess;
|
|
|
import org.elasticsearch.xpack.ml.support.BaseMlIntegTestCase;
|
|
|
import org.junit.After;
|
|
|
@@ -542,7 +543,108 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
|
|
|
assertThat(dataCounts.getProcessedRecordCount(), equalTo(numDocs));
|
|
|
assertThat(dataCounts.getOutOfOrderTimeStampCount(), equalTo(0L));
|
|
|
});
|
|
|
+ }
|
|
|
+
|
|
|
+ public void testClusterWithTwoMlNodes_StopsDatafeed_GivenJobFailsOnReassign() throws Exception {
|
|
|
+ internalCluster().ensureAtMostNumDataNodes(0);
|
|
|
+ logger.info("Starting dedicated master node...");
|
|
|
+ internalCluster().startMasterOnlyNode();
|
|
|
+ logger.info("Starting ml and data node...");
|
|
|
+ internalCluster().startNode(onlyRoles(Set.of(DiscoveryNodeRole.DATA_ROLE, DiscoveryNodeRole.ML_ROLE)));
|
|
|
+ logger.info("Starting another ml and data node...");
|
|
|
+ internalCluster().startNode(onlyRoles(Set.of(DiscoveryNodeRole.DATA_ROLE, DiscoveryNodeRole.ML_ROLE)));
|
|
|
+ ensureStableCluster();
|
|
|
+
|
|
|
+ // index some datafeed data
|
|
|
+ client().admin().indices().prepareCreate("data").setMapping("time", "type=date").get();
|
|
|
+ long numDocs = 80000;
|
|
|
+ long now = System.currentTimeMillis();
|
|
|
+ long weekAgo = now - 604800000;
|
|
|
+ long twoWeeksAgo = weekAgo - 604800000;
|
|
|
+ indexDocs(logger, "data", numDocs, twoWeeksAgo, weekAgo);
|
|
|
+
|
|
|
+ String jobId = "test-node-goes-down-while-running-job";
|
|
|
+ String datafeedId = jobId + "-datafeed";
|
|
|
+
|
|
|
+ Job.Builder job = createScheduledJob(jobId);
|
|
|
+ PutJobAction.Request putJobRequest = new PutJobAction.Request(job);
|
|
|
+ client().execute(PutJobAction.INSTANCE, putJobRequest).actionGet();
|
|
|
+
|
|
|
+ DatafeedConfig config = createDatafeed(datafeedId, job.getId(), Collections.singletonList("data"), TimeValue.timeValueHours(1));
|
|
|
+ PutDatafeedAction.Request putDatafeedRequest = new PutDatafeedAction.Request(config);
|
|
|
+ client().execute(PutDatafeedAction.INSTANCE, putDatafeedRequest).actionGet();
|
|
|
+
|
|
|
+ client().execute(OpenJobAction.INSTANCE, new OpenJobAction.Request(job.getId()));
|
|
|
+
|
|
|
+ assertBusy(() -> {
|
|
|
+ GetJobsStatsAction.Response statsResponse = client().execute(
|
|
|
+ GetJobsStatsAction.INSTANCE,
|
|
|
+ new GetJobsStatsAction.Request(job.getId())
|
|
|
+ ).actionGet();
|
|
|
+ assertEquals(JobState.OPENED, statsResponse.getResponse().results().get(0).getState());
|
|
|
+ }, 30, TimeUnit.SECONDS);
|
|
|
+
|
|
|
+ DiscoveryNode nodeRunningJob = client().execute(GetJobsStatsAction.INSTANCE, new GetJobsStatsAction.Request(job.getId()))
|
|
|
+ .actionGet()
|
|
|
+ .getResponse()
|
|
|
+ .results()
|
|
|
+ .get(0)
|
|
|
+ .getNode();
|
|
|
+
|
|
|
+ setMlIndicesDelayedNodeLeftTimeoutToZero();
|
|
|
+
|
|
|
+ StartDatafeedAction.Request startDatafeedRequest = new StartDatafeedAction.Request(config.getId(), 0L);
|
|
|
+ client().execute(StartDatafeedAction.INSTANCE, startDatafeedRequest).get();
|
|
|
+
|
|
|
+ waitForJobToHaveProcessedAtLeast(jobId, 1000);
|
|
|
+
|
|
|
+ // The datafeed should be started
|
|
|
+ assertBusy(() -> {
|
|
|
+ GetDatafeedsStatsAction.Response statsResponse = client().execute(
|
|
|
+ GetDatafeedsStatsAction.INSTANCE,
|
|
|
+ new GetDatafeedsStatsAction.Request(config.getId())
|
|
|
+ ).actionGet();
|
|
|
+ assertEquals(DatafeedState.STARTED, statsResponse.getResponse().results().get(0).getDatafeedState());
|
|
|
+ }, 30, TimeUnit.SECONDS);
|
|
|
+
|
|
|
+ // Create a problem that will make the job fail when it restarts on a different node
|
|
|
+ String snapshotId = "123";
|
|
|
+ ModelSnapshot modelSnapshot = new ModelSnapshot.Builder(jobId).setSnapshotId(snapshotId).setTimestamp(new Date()).build();
|
|
|
+ JobResultsPersister jobResultsPersister = internalCluster().getInstance(
|
|
|
+ JobResultsPersister.class,
|
|
|
+ internalCluster().getMasterName()
|
|
|
+ );
|
|
|
+ jobResultsPersister.persistModelSnapshot(modelSnapshot, WriteRequest.RefreshPolicy.IMMEDIATE, () -> true);
|
|
|
+ UpdateJobAction.Request updateJobRequest = UpdateJobAction.Request.internal(
|
|
|
+ jobId,
|
|
|
+ new JobUpdate.Builder(jobId).setModelSnapshotId(snapshotId).build()
|
|
|
+ );
|
|
|
+ client().execute(UpdateJobAction.INSTANCE, updateJobRequest).actionGet();
|
|
|
+ refresh(AnomalyDetectorsIndex.resultsWriteAlias(jobId));
|
|
|
+
|
|
|
+ // Make the job move to a different node
|
|
|
+ internalCluster().stopNode(nodeRunningJob.getName());
|
|
|
+
|
|
|
+ // Wait for the job to fail during reassignment
|
|
|
+ assertBusy(() -> {
|
|
|
+ GetJobsStatsAction.Response statsResponse = client().execute(
|
|
|
+ GetJobsStatsAction.INSTANCE,
|
|
|
+ new GetJobsStatsAction.Request(job.getId())
|
|
|
+ ).actionGet();
|
|
|
+ assertEquals(JobState.FAILED, statsResponse.getResponse().results().get(0).getState());
|
|
|
+ }, 30, TimeUnit.SECONDS);
|
|
|
+
|
|
|
+ // The datafeed should then be stopped
|
|
|
+ assertBusy(() -> {
|
|
|
+ GetDatafeedsStatsAction.Response statsResponse = client().execute(
|
|
|
+ GetDatafeedsStatsAction.INSTANCE,
|
|
|
+ new GetDatafeedsStatsAction.Request(config.getId())
|
|
|
+ ).actionGet();
|
|
|
+ assertEquals(DatafeedState.STOPPED, statsResponse.getResponse().results().get(0).getDatafeedState());
|
|
|
+ }, 30, TimeUnit.SECONDS);
|
|
|
|
|
|
+ // Force close the failed job to clean up
|
|
|
+ client().execute(CloseJobAction.INSTANCE, new CloseJobAction.Request(jobId).setForce(true)).actionGet();
|
|
|
}
|
|
|
|
|
|
private void setupJobWithoutDatafeed(String jobId, ByteSizeValue modelMemoryLimit) throws Exception {
|