Просмотр исходного кода

[ML] add new snapshot upgrader API for upgrading older snapshots (#64665)

This new API provides a way for users to upgrade their own anomaly job
model snapshots.

To upgrade a snapshot, the following steps are performed:
- Open a native process given the job id and the desired snapshot id
- Load the snapshot into the process
- Write the snapshot again from the native task (now updated via the
  native process)

relates #64154
Benjamin Trent 5 лет назад
Родитель
Сommit
33de89d94c
52 измененных файлов с 3343 добавлено и 175 удалено
  1. 24 0
      client/rest-high-level/src/main/java/org/elasticsearch/client/MLRequestConverters.java
  2. 46 0
      client/rest-high-level/src/main/java/org/elasticsearch/client/MachineLearningClient.java
  3. 121 0
      client/rest-high-level/src/main/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotRequest.java
  4. 99 0
      client/rest-high-level/src/main/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotResponse.java
  5. 25 0
      client/rest-high-level/src/test/java/org/elasticsearch/client/MLRequestConvertersTests.java
  6. 28 1
      client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java
  7. 80 0
      client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java
  8. 45 0
      client/rest-high-level/src/test/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotRequestTests.java
  9. 44 0
      client/rest-high-level/src/test/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotResponseTests.java
  10. 44 0
      docs/java-rest/high-level/ml/upgrade-job-model-snapshot.asciidoc
  11. 2 0
      docs/java-rest/high-level/supported-apis.asciidoc
  12. 85 0
      docs/reference/ml/anomaly-detection/apis/upgrade-job-model-snapshot.asciidoc
  13. 3 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java
  14. 13 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/MlTasks.java
  15. 204 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/UpgradeJobModelSnapshotAction.java
  16. 37 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/snapshot/upgrade/SnapshotUpgradeState.java
  17. 112 0
      x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/snapshot/upgrade/SnapshotUpgradeTaskState.java
  18. 37 0
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/UpgradeJobModelSnapshotRequestTests.java
  19. 34 0
      x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/UpgradeJobModelSnapshotResponseTests.java
  20. 43 15
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java
  21. 6 16
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportOpenJobAction.java
  22. 11 1
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportRevertModelSnapshotAction.java
  23. 261 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportUpgradeJobModelSnapshotAction.java
  24. 5 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AbstractNativeAnalyticsProcess.java
  25. 4 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/NativeAnalyticsProcess.java
  26. 57 38
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/NodeLoadDetector.java
  27. 65 62
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobResultsProvider.java
  28. 9 1
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectProcessFactory.java
  29. 102 25
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectProcessManager.java
  30. 4 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/BlackHoleAutodetectProcess.java
  31. 307 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/JobModelSnapshotUpgrader.java
  32. 5 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/NativeAutodetectProcess.java
  33. 6 5
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/NativeAutodetectProcessFactory.java
  34. 251 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/output/JobSnapshotUpgraderResultProcessor.java
  35. 16 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AutodetectControlMsgWriter.java
  36. 4 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/normalizer/MultiplyingNormalizerProcess.java
  37. 4 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/normalizer/NativeNormalizerProcess.java
  38. 85 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradePredicate.java
  39. 39 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradeTask.java
  40. 282 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradeTaskExecutor.java
  41. 100 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradeTaskParams.java
  42. 21 5
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/task/OpenJobPersistentTasksExecutor.java
  43. 9 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/process/NativeProcess.java
  44. 60 0
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/modelsnapshots/RestUpgradeJobModelSnapshotAction.java
  45. 10 2
      x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/task/AbstractJobPersistentTasksExecutor.java
  46. 3 3
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectProcessManagerTests.java
  47. 89 0
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradePredicateTests.java
  48. 4 0
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/process/AbstractNativeProcessTests.java
  49. 2 1
      x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/task/AbstractJobPersistentTasksExecutorTests.java
  50. 41 0
      x-pack/plugin/src/test/resources/rest-api-spec/api/ml.upgrade_job_snapshot.json
  51. 80 0
      x-pack/plugin/src/test/resources/rest-api-spec/test/ml/upgrade_job_snapshot.yml
  52. 275 0
      x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MlJobSnapshotUpgradeIT.java

+ 24 - 0
client/rest-high-level/src/main/java/org/elasticsearch/client/MLRequestConverters.java

@@ -86,6 +86,7 @@ import org.elasticsearch.client.ml.UpdateDatafeedRequest;
 import org.elasticsearch.client.ml.UpdateFilterRequest;
 import org.elasticsearch.client.ml.UpdateJobRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotRequest;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotRequest;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.xcontent.XContentType;
@@ -429,6 +430,29 @@ final class MLRequestConverters {
         return request;
     }
 
+    /**
+     * Converts an {@link UpgradeJobModelSnapshotRequest} into a low-level {@code POST} request whose path is built
+     * from {@code _ml}, {@code anomaly_detectors}, the job ID, {@code model_snapshots}, the snapshot ID and
+     * {@code _upgrade}. The {@code timeout} and {@code wait_for_completion} query parameters are added only
+     * when the corresponding value is set on the request.
+     */
+    static Request upgradeJobSnapshot(UpgradeJobModelSnapshotRequest upgradeJobModelSnapshotRequest) {
+        // Optional settings travel as query parameters and are left out entirely when unset.
+        RequestConverters.Params queryParams = new RequestConverters.Params();
+        if (upgradeJobModelSnapshotRequest.getTimeout() != null) {
+            String timeoutName = UpgradeJobModelSnapshotRequest.TIMEOUT.getPreferredName();
+            queryParams.putParam(timeoutName, upgradeJobModelSnapshotRequest.getTimeout().getStringRep());
+        }
+        Boolean waitForCompletion = upgradeJobModelSnapshotRequest.getWaitForCompletion();
+        if (waitForCompletion != null) {
+            String waitForCompletionName = UpgradeJobModelSnapshotRequest.WAIT_FOR_COMPLETION.getPreferredName();
+            queryParams.putParam(waitForCompletionName, waitForCompletion.toString());
+        }
+
+        // Job and snapshot IDs go through addPathPart; the fixed segments are added as-is.
+        String endpoint = new EndpointBuilder()
+            .addPathPartAsIs("_ml")
+            .addPathPartAsIs("anomaly_detectors")
+            .addPathPart(upgradeJobModelSnapshotRequest.getJobId())
+            .addPathPartAsIs("model_snapshots")
+            .addPathPart(upgradeJobModelSnapshotRequest.getSnapshotId())
+            .addPathPartAsIs("_upgrade")
+            .build();
+
+        Request upgradeRequest = new Request(HttpPost.METHOD_NAME, endpoint);
+        upgradeRequest.addParameters(queryParams.asMap());
+        return upgradeRequest;
+    }
+
     static Request revertModelSnapshot(RevertModelSnapshotRequest revertModelSnapshotsRequest) throws IOException {
         String endpoint = new EndpointBuilder()
             .addPathPartAsIs("_ml")

+ 46 - 0
client/rest-high-level/src/main/java/org/elasticsearch/client/MachineLearningClient.java

@@ -121,6 +121,8 @@ import org.elasticsearch.client.ml.UpdateFilterRequest;
 import org.elasticsearch.client.ml.UpdateJobRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotResponse;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotRequest;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotResponse;
 import org.elasticsearch.client.ml.job.stats.JobStats;
 
 import java.io.IOException;
@@ -1178,6 +1180,50 @@ public final class MachineLearningClient {
             Collections.emptySet());
     }
 
+    /**
+     * Upgrades a snapshot for a Machine Learning Job to the current major version.
+     * <p>
+     * The call is delegated to the REST high-level client, which converts the request with
+     * {@code MLRequestConverters::upgradeJobSnapshot} and parses the response entity with
+     * {@code UpgradeJobModelSnapshotResponse::fromXContent}.
+     * <p>
+     * For additional info
+     * see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/ml-upgrade-job-model-snapshot.html">
+     * ML Upgrade job snapshots documentation</a>
+     *
+     * @param request The request
+     * @param options Additional request options (e.g. headers), use {@link RequestOptions#DEFAULT} if nothing needs to be customized
+     * @return the {@link UpgradeJobModelSnapshotResponse} parsed from the response entity
+     * @throws IOException when there is a serialization issue sending the request or receiving the response
+     */
+    public UpgradeJobModelSnapshotResponse upgradeJobSnapshot(UpgradeJobModelSnapshotRequest request,
+                                                              RequestOptions options) throws IOException {
+        return restHighLevelClient.performRequestAndParseEntity(request,
+            MLRequestConverters::upgradeJobSnapshot,
+            options,
+            UpgradeJobModelSnapshotResponse::fromXContent,
+            Collections.emptySet());
+    }
+
+    /**
+     * Asynchronously upgrades a snapshot for a Machine Learning Job to the current major version
+     * and notifies the listener upon request completion.
+     * <p>
+     * The call is delegated to the REST high-level client, which converts the request with
+     * {@code MLRequestConverters::upgradeJobSnapshot} and parses the response entity with
+     * {@code UpgradeJobModelSnapshotResponse::fromXContent} before handing it to the listener.
+     * <p>
+     * For additional info
+     * see <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/ml-upgrade-job-model-snapshot.html">
+     * ML Upgrade job snapshots documentation</a>
+     *
+     * @param request  The request
+     * @param options  Additional request options (e.g. headers), use {@link RequestOptions#DEFAULT} if nothing needs to be customized
+     * @param listener Listener to be notified upon request completion
+     * @return cancellable that may be used to cancel the request
+     */
+    public Cancellable upgradeJobSnapshotAsync(UpgradeJobModelSnapshotRequest request,
+                                               RequestOptions options,
+                                               ActionListener<UpgradeJobModelSnapshotResponse> listener) {
+        return restHighLevelClient.performRequestAsyncAndParseEntity(request,
+            MLRequestConverters::upgradeJobSnapshot,
+            options,
+            UpgradeJobModelSnapshotResponse::fromXContent,
+            listener,
+            Collections.emptySet());
+    }
+
     /**
      * Gets overall buckets for a set of Machine Learning Jobs.
      * <p>

+ 121 - 0
client/rest-high-level/src/main/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotRequest.java

@@ -0,0 +1,121 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.client.ml;
+
+import org.elasticsearch.client.Validatable;
+import org.elasticsearch.client.ml.job.config.Job;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.ToXContentObject;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+
+import java.io.IOException;
+import java.util.Objects;
+
+/**
+ * Client-side request for upgrading a single anomaly detection job model snapshot.
+ * <p>
+ * The job ID and snapshot ID are mandatory (a {@code null} value is rejected by the constructor);
+ * {@code timeout} and {@code wait_for_completion} are optional and are omitted from the serialized
+ * form when {@code null}.
+ */
+public class UpgradeJobModelSnapshotRequest implements Validatable, ToXContentObject {
+
+    public static final ParseField SNAPSHOT_ID = new ParseField("snapshot_id");
+    public static final ParseField TIMEOUT = new ParseField("timeout");
+    public static final ParseField WAIT_FOR_COMPLETION = new ParseField("wait_for_completion");
+
+    // Constructor arguments are bound positionally, in the order they are declared in the static initializer below.
+    private static final ConstructingObjectParser<UpgradeJobModelSnapshotRequest, Void> PARSER = new ConstructingObjectParser<>(
+        "upgrade_job_snapshot_request",
+        true,
+        args -> {
+            String parsedJobId = (String) args[0];
+            String parsedSnapshotId = (String) args[1];
+            String parsedTimeout = (String) args[2];
+            Boolean parsedWaitForCompletion = (Boolean) args[3];
+            return new UpgradeJobModelSnapshotRequest(parsedJobId, parsedSnapshotId, parsedTimeout, parsedWaitForCompletion);
+        });
+
+    static {
+        PARSER.declareString(ConstructingObjectParser.constructorArg(), Job.ID);
+        PARSER.declareString(ConstructingObjectParser.constructorArg(), SNAPSHOT_ID);
+        PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), TIMEOUT);
+        PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), WAIT_FOR_COMPLETION);
+    }
+
+    private final String jobId;
+    private final String snapshotId;
+    private final TimeValue timeout;
+    private final Boolean waitForCompletion;
+
+    /**
+     * Parser-facing constructor that accepts the timeout in its string form.
+     *
+     * @param jobId             the job ID, must not be {@code null}
+     * @param snapshotId        the snapshot ID, must not be {@code null}
+     * @param timeout           the timeout as a string, or {@code null} when not specified
+     * @param waitForCompletion the {@code wait_for_completion} flag, or {@code null} when not specified
+     */
+    UpgradeJobModelSnapshotRequest(String jobId, String snapshotId, String timeout, Boolean waitForCompletion) {
+        this(jobId, snapshotId, toTimeValue(timeout), waitForCompletion);
+    }
+
+    /**
+     * @param jobId             the job ID, must not be {@code null}
+     * @param snapshotId        the snapshot ID, must not be {@code null}
+     * @param timeValue         the timeout, or {@code null} when not specified
+     * @param waitForCompletion the {@code wait_for_completion} flag, or {@code null} when not specified
+     */
+    public UpgradeJobModelSnapshotRequest(String jobId, String snapshotId, TimeValue timeValue, Boolean waitForCompletion) {
+        this.jobId = Objects.requireNonNull(jobId, Job.ID.getPreferredName());
+        this.snapshotId = Objects.requireNonNull(snapshotId, SNAPSHOT_ID.getPreferredName());
+        this.timeout = timeValue;
+        this.waitForCompletion = waitForCompletion;
+    }
+
+    // Converts the string form of the timeout to a TimeValue, passing null through unchanged.
+    private static TimeValue toTimeValue(String rawTimeout) {
+        if (rawTimeout == null) {
+            return null;
+        }
+        return TimeValue.parseTimeValue(rawTimeout, TIMEOUT.getPreferredName());
+    }
+
+    /**
+     * Parses a request from the given parser using the class-level {@code PARSER}.
+     */
+    public static UpgradeJobModelSnapshotRequest fromXContent(XContentParser parser) {
+        return PARSER.apply(parser, null);
+    }
+
+    public String getJobId() {
+        return jobId;
+    }
+
+    public String getSnapshotId() {
+        return snapshotId;
+    }
+
+    /**
+     * @return the timeout, or {@code null} when none was specified
+     */
+    public TimeValue getTimeout() {
+        return timeout;
+    }
+
+    /**
+     * @return the {@code wait_for_completion} flag, or {@code null} when none was specified
+     */
+    public Boolean getWaitForCompletion() {
+        return waitForCompletion;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        UpgradeJobModelSnapshotRequest other = (UpgradeJobModelSnapshotRequest) o;
+        return Objects.equals(jobId, other.jobId)
+            && Objects.equals(snapshotId, other.snapshotId)
+            && Objects.equals(timeout, other.timeout)
+            && Objects.equals(waitForCompletion, other.waitForCompletion);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(jobId, snapshotId, timeout, waitForCompletion);
+    }
+
+    /**
+     * Writes the job ID and snapshot ID, followed by the timeout (in its string representation)
+     * and the {@code wait_for_completion} flag when they are not {@code null}.
+     */
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject()
+            .field(Job.ID.getPreferredName(), jobId)
+            .field(SNAPSHOT_ID.getPreferredName(), snapshotId);
+        if (timeout != null) {
+            builder.field(TIMEOUT.getPreferredName(), timeout.getStringRep());
+        }
+        if (waitForCompletion != null) {
+            builder.field(WAIT_FOR_COMPLETION.getPreferredName(), waitForCompletion);
+        }
+        return builder.endObject();
+    }
+
+}

+ 99 - 0
client/rest-high-level/src/main/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotResponse.java

@@ -0,0 +1,99 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.client.ml;
+
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.ToXContentObject;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+
+import java.io.IOException;
+import java.util.Objects;
+
+/**
+ * Client-side response of the upgrade job model snapshot API.
+ * <p>
+ * Carries a {@code completed} flag and an optional {@code node} value. Both fields are optional
+ * when parsing; a missing {@code completed} field is treated as {@code false}.
+ */
+public class UpgradeJobModelSnapshotResponse implements ToXContentObject {
+
+    private static final ParseField COMPLETED = new ParseField("completed");
+    private static final ParseField NODE = new ParseField("node");
+
+    // Constructor arguments are bound positionally: [0] completed, [1] node.
+    public static final ConstructingObjectParser<UpgradeJobModelSnapshotResponse, Void> PARSER =
+        new ConstructingObjectParser<>("upgrade_job_snapshot_response", true,
+            (a) -> new UpgradeJobModelSnapshotResponse((Boolean) a[0], (String) a[1]));
+
+    static {
+        PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), COMPLETED);
+        PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), NODE);
+    }
+
+    private final boolean completed;
+    private final String node;
+
+    /**
+     * @param completed the value of the {@code completed} field; {@code null} is treated as {@code false}
+     * @param node      the value of the {@code node} field, may be {@code null}
+     */
+    public UpgradeJobModelSnapshotResponse(Boolean completed, String node) {
+        // Boolean.TRUE.equals is null-safe, so an absent field maps to false without unboxing.
+        this.completed = Boolean.TRUE.equals(completed);
+        this.node = node;
+    }
+
+    /**
+     * Parses a response from the given parser using {@link #PARSER}.
+     */
+    public static UpgradeJobModelSnapshotResponse fromXContent(XContentParser parser) throws IOException {
+        return PARSER.parse(parser, null);
+    }
+
+    /**
+     * @return {@code true} only if the response carried {@code completed: true}
+     */
+    public boolean isCompleted() {
+        return completed;
+    }
+
+    /**
+     * The node reported in the response.
+     * NOTE(review): presumably the node the snapshot upgrade task was assigned to; confirm against the server-side action.
+     *
+     * @return the value of the {@code node} field, or {@code null} if the response did not contain one
+     */
+    public String getNode() {
+        return node;
+    }
+
+    @Override
+    public boolean equals(Object other) {
+        if (this == other) {
+            return true;
+        }
+
+        if (other == null || getClass() != other.getClass()) {
+            return false;
+        }
+
+        UpgradeJobModelSnapshotResponse that = (UpgradeJobModelSnapshotResponse) other;
+        return completed == that.completed
+            && Objects.equals(node, that.node);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(completed, node);
+    }
+
+    /**
+     * Writes the {@code completed} flag, followed by {@code node} when it is not {@code null}.
+     */
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject();
+        builder.field(COMPLETED.getPreferredName(), completed);
+        if (node != null) {
+            builder.field(NODE.getPreferredName(), node);
+        }
+        builder.endObject();
+        return builder;
+    }
+}

+ 25 - 0
client/rest-high-level/src/test/java/org/elasticsearch/client/MLRequestConvertersTests.java

@@ -84,6 +84,7 @@ import org.elasticsearch.client.ml.UpdateDataFrameAnalyticsRequest;
 import org.elasticsearch.client.ml.UpdateFilterRequest;
 import org.elasticsearch.client.ml.UpdateJobRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotRequest;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotRequest;
 import org.elasticsearch.client.ml.calendars.Calendar;
 import org.elasticsearch.client.ml.calendars.CalendarTests;
 import org.elasticsearch.client.ml.calendars.ScheduledEvent;
@@ -507,6 +508,30 @@ public class MLRequestConvertersTests extends ESTestCase {
         }
     }
 
+    public void testUpgradeJobModelSnapshot() { // checks method, endpoint and optional query parameters of the converted request
+        String jobId = randomAlphaOfLength(10);
+        String snapshotId = randomAlphaOfLength(10);
+        TimeValue timeout = TimeValue.parseTimeValue(randomTimeValue(), "test");
+        boolean waitForCompletion = randomBoolean();
+        boolean includeTimeout = randomBoolean(); // when false, the request is built with a null timeout
+        boolean includeWaitForCompletion = randomBoolean(); // when false, the request is built with a null wait_for_completion
+        UpgradeJobModelSnapshotRequest upgradeJobModelSnapshotRequest = new UpgradeJobModelSnapshotRequest(jobId,
+            snapshotId,
+            includeTimeout ? timeout : null,
+            includeWaitForCompletion ? waitForCompletion : null);
+
+        Request request = MLRequestConverters.upgradeJobSnapshot(upgradeJobModelSnapshotRequest);
+        assertEquals(HttpPost.METHOD_NAME, request.getMethod());
+        assertEquals("/_ml/anomaly_detectors/" + jobId + "/model_snapshots/" + snapshotId + "/_upgrade", request.getEndpoint());
+        assertThat(request.getParameters().isEmpty(), equalTo(includeTimeout == false && includeWaitForCompletion == false));
+        if (includeTimeout) { // a supplied timeout must appear in its string representation
+            assertThat(request.getParameters().get("timeout"), equalTo(timeout.getStringRep()));
+        }
+        if (includeWaitForCompletion) { // a supplied flag must appear as "true"/"false"
+            assertThat(request.getParameters().get("wait_for_completion"), equalTo(Boolean.toString(waitForCompletion)));
+        }
+    }
+
     public void testRevertModelSnapshot() throws IOException {
         String jobId = randomAlphaOfLength(10);
         String snapshotId = randomAlphaOfLength(10);

+ 28 - 1
client/rest-high-level/src/test/java/org/elasticsearch/client/MachineLearningIT.java

@@ -19,7 +19,9 @@
 package org.elasticsearch.client;
 
 import com.carrotsearch.randomizedtesting.generators.CodepointSetGenerator;
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchStatusException;
+import org.elasticsearch.Version;
 import org.elasticsearch.action.bulk.BulkRequest;
 import org.elasticsearch.action.get.GetRequest;
 import org.elasticsearch.action.get.GetResponse;
@@ -121,6 +123,7 @@ import org.elasticsearch.client.ml.UpdateFilterRequest;
 import org.elasticsearch.client.ml.UpdateJobRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotResponse;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotRequest;
 import org.elasticsearch.client.ml.calendars.Calendar;
 import org.elasticsearch.client.ml.calendars.CalendarTests;
 import org.elasticsearch.client.ml.calendars.ScheduledEvent;
@@ -214,6 +217,7 @@ import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.hasItem;
@@ -2726,6 +2730,10 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
     }
 
     public void createModelSnapshot(String jobId, String snapshotId) throws IOException {
+        createModelSnapshot(jobId, snapshotId, Version.CURRENT); // delegates to the three-argument overload with the current version
+    }
+
+    public void createModelSnapshot(String jobId, String snapshotId, Version minVersion) throws IOException {
         String documentId = jobId + "_model_snapshot_" + snapshotId;
         Job job = MachineLearningIT.buildJob(jobId);
         highLevelClient().machineLearning().putJob(new PutJobRequest(job), RequestOptions.DEFAULT);
@@ -2739,11 +2747,13 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
             "\"total_by_field_count\":3, \"total_over_field_count\":0, \"total_partition_field_count\":2," +
             "\"bucket_allocation_failures_count\":0, \"memory_status\":\"ok\", \"log_time\":1541587919000, " +
             "\"timestamp\":1519930800000}, \"latest_record_time_stamp\":1519931700000," +
-            "\"latest_result_time_stamp\":1519930800000, \"retain\":false}", XContentType.JSON);
+            "\"latest_result_time_stamp\":1519930800000, \"retain\":false, \"min_version\":\"" + minVersion.toString() + "\"}",
+            XContentType.JSON);
 
         highLevelClient().index(indexRequest, RequestOptions.DEFAULT);
     }
 
+
     public void createModelSnapshots(String jobId, List<String> snapshotIds) throws IOException {
         Job job = MachineLearningIT.buildJob(jobId);
         highLevelClient().machineLearning().putJob(new PutJobRequest(job), RequestOptions.DEFAULT);
@@ -2818,6 +2828,23 @@ public class MachineLearningIT extends ESRestHighLevelClientTestCase {
             getModelSnapshotsResponse2.snapshots().get(0).getDescription());
     }
 
+    public void testUpgradeJobSnapshot() throws Exception { // upgrading a snapshot created with Version.CURRENT is expected to be rejected
+        String jobId = "test-upgrade-model-snapshot";
+        String snapshotId = "1541587919";
+
+        createModelSnapshot(jobId, snapshotId, Version.CURRENT);
+        MachineLearningClient machineLearningClient = highLevelClient().machineLearning();
+        UpgradeJobModelSnapshotRequest request = new UpgradeJobModelSnapshotRequest(jobId, snapshotId, null, true); // no timeout, wait_for_completion = true
+        ElasticsearchException ex = expectThrows(ElasticsearchException.class,
+            () -> execute(request, machineLearningClient::upgradeJobSnapshot, machineLearningClient::upgradeJobSnapshotAsync));
+        assertThat(
+            ex.getMessage(),
+            containsString(
+                "Cannot upgrade job [test-upgrade-model-snapshot] snapshot [1541587919] as it is already compatible with current version"
+            )
+        );
+    }
+
     public void testRevertModelSnapshot() throws IOException {
         String jobId = "test-revert-model-snapshot";
 

+ 80 - 0
client/rest-high-level/src/test/java/org/elasticsearch/client/documentation/MlClientDocumentationIT.java

@@ -18,6 +18,7 @@
  */
 package org.elasticsearch.client.documentation;
 
+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.LatchedActionListener;
 import org.elasticsearch.action.bulk.BulkRequest;
@@ -135,6 +136,8 @@ import org.elasticsearch.client.ml.UpdateFilterRequest;
 import org.elasticsearch.client.ml.UpdateJobRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotRequest;
 import org.elasticsearch.client.ml.UpdateModelSnapshotResponse;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotRequest;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotResponse;
 import org.elasticsearch.client.ml.calendars.Calendar;
 import org.elasticsearch.client.ml.calendars.ScheduledEvent;
 import org.elasticsearch.client.ml.calendars.ScheduledEventTests;
@@ -238,6 +241,7 @@ import static org.hamcrest.Matchers.allOf;
 import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.hasSize;
@@ -2332,6 +2336,82 @@ public class MlClientDocumentationIT extends ESRestHighLevelClientTestCase {
         }
     }
 
+    /**
+     * Documentation test for the upgrade job model snapshot API: creates a job, indexes a model snapshot
+     * document for it, then exercises the synchronous and asynchronous client calls. The {@code tag::} /
+     * {@code end::} markers and numbered callouts delimit the snippets, so they are kept verbatim.
+     */
+    public void testUpgradeJobSnapshot() throws IOException, InterruptedException {
+        RestHighLevelClient client = highLevelClient();
+
+        String jobId = "test-upgrade-job-model-snapshot";
+        String snapshotId = "1541587919";
+        Job job = MachineLearningIT.buildJob(jobId);
+        client.machineLearning().putJob(new PutJobRequest(job), RequestOptions.DEFAULT);
+
+        // Let us index a snapshot
+        String documentId = jobId + "_model_snapshot_" + snapshotId;
+        IndexRequest indexRequest = new IndexRequest(".ml-anomalies-shared").id(documentId);
+        indexRequest.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
+        // NOTE(review): the nested job_id values below say "test-revert-model-snapshot" while the outer job_id is
+        // "test-upgrade-job-model-snapshot"; left unchanged here, confirm whether they should match jobId.
+        indexRequest.source("{\"job_id\":\"test-upgrade-job-model-snapshot\", \"timestamp\":1541587919000, " +
+            "\"description\":\"State persisted due to job close at 2018-11-07T10:51:59+0000\", " +
+            "\"snapshot_id\":\"1541587919\", \"snapshot_doc_count\":1, \"model_size_stats\":{" +
+            "\"job_id\":\"test-revert-model-snapshot\", \"result_type\":\"model_size_stats\",\"model_bytes\":51722, " +
+            "\"total_by_field_count\":3, \"total_over_field_count\":0, \"total_partition_field_count\":2," +
+            "\"bucket_allocation_failures_count\":0, \"memory_status\":\"ok\", \"log_time\":1541587919000, " +
+            "\"timestamp\":1519930800000}, \"latest_record_time_stamp\":1519931700000," +
+            "\"latest_result_time_stamp\":1519930800000, \"retain\":false, " +
+            "\"quantiles\":{\"job_id\":\"test-revert-model-snapshot\", \"timestamp\":1541587919000, " +
+            "\"quantile_state\":\"state\"}}", XContentType.JSON);
+        client.index(indexRequest, RequestOptions.DEFAULT);
+
+        {
+            // tag::upgrade-job-model-snapshot-request
+            UpgradeJobModelSnapshotRequest request = new UpgradeJobModelSnapshotRequest(
+                jobId, // <1>
+                snapshotId, // <2>
+                TimeValue.timeValueMinutes(30), // <3>
+                false); // <4>
+            // end::upgrade-job-model-snapshot-request
+
+            try {
+                // tag::upgrade-job-model-snapshot-execute
+                UpgradeJobModelSnapshotResponse response = client.machineLearning().upgradeJobSnapshot(request, RequestOptions.DEFAULT);
+                // end::upgrade-job-model-snapshot-execute
+            } catch (ElasticsearchException ex) {
+                assertThat(ex.getMessage(), containsString("Expected persisted state but no state exists"));
+            }
+            // Stand-in response so the snippet below has a value regardless of how the call above ended.
+            UpgradeJobModelSnapshotResponse response = new UpgradeJobModelSnapshotResponse(true, "");
+
+            // tag::upgrade-job-model-snapshot-response
+            boolean completed = response.isCompleted(); // <1>
+            String node = response.getNode(); // <2>
+            // end::upgrade-job-model-snapshot-response
+        }
+        {
+            UpgradeJobModelSnapshotRequest request = new UpgradeJobModelSnapshotRequest(jobId, snapshotId, null, false);
+
+            // tag::upgrade-job-model-snapshot-execute-listener
+            ActionListener<UpgradeJobModelSnapshotResponse> listener =
+                new ActionListener<UpgradeJobModelSnapshotResponse>() {
+                    @Override
+                    public void onResponse(UpgradeJobModelSnapshotResponse upgradeJobModelSnapshotResponse) {
+                        // <1>
+                    }
+
+                    @Override
+                    public void onFailure(Exception e) {
+                        // <2>
+                    }
+                };
+            // end::upgrade-job-model-snapshot-execute-listener
+
+            // Replace the empty listener by a blocking listener in test
+            final CountDownLatch latch = new CountDownLatch(1);
+            listener = new LatchedActionListener<>(listener, latch);
+
+            // tag::upgrade-job-model-snapshot-execute-async
+            client.machineLearning().upgradeJobSnapshotAsync(request, RequestOptions.DEFAULT, listener); // <1>
+            // end::upgrade-job-model-snapshot-execute-async
+
+            assertTrue(latch.await(30L, TimeUnit.SECONDS));
+        }
+    }
 
     public void testUpdateModelSnapshot() throws IOException, InterruptedException {
         RestHighLevelClient client = highLevelClient();

+ 45 - 0
client/rest-high-level/src/test/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotRequestTests.java

@@ -0,0 +1,45 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.client.ml;
+
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.test.AbstractXContentTestCase;
+
+import java.io.IOException;
+
+/**
+ * XContent tests for {@link UpgradeJobModelSnapshotRequest}, driven by {@link AbstractXContentTestCase}.
+ */
+public class UpgradeJobModelSnapshotRequestTests extends AbstractXContentTestCase<UpgradeJobModelSnapshotRequest> {
+
+    /**
+     * Builds a request from two random 10-character ids; the third and fourth
+     * constructor arguments are each randomly left {@code null}.
+     */
+    @Override
+    protected UpgradeJobModelSnapshotRequest createTestInstance() {
+        return new UpgradeJobModelSnapshotRequest(randomAlphaOfLength(10),
+            randomAlphaOfLength(10),
+            randomBoolean() ? null : randomTimeValue(),
+            randomBoolean() ? null : randomBoolean());
+    }
+
+    /** Parses an instance back through {@code UpgradeJobModelSnapshotRequest.fromXContent}. */
+    @Override
+    protected UpgradeJobModelSnapshotRequest doParseInstance(XContentParser parser) throws IOException {
+        return UpgradeJobModelSnapshotRequest.fromXContent(parser);
+    }
+
+    /** Declares to the base test case that unknown fields are supported by the parser under test. */
+    @Override
+    protected boolean supportsUnknownFields() {
+        return true;
+    }
+}

+ 44 - 0
client/rest-high-level/src/test/java/org/elasticsearch/client/ml/UpgradeJobModelSnapshotResponseTests.java

@@ -0,0 +1,44 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.client.ml;
+
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.test.AbstractXContentTestCase;
+
+import java.io.IOException;
+
+
+/**
+ * XContent tests for {@link UpgradeJobModelSnapshotResponse}, driven by {@link AbstractXContentTestCase}.
+ */
+public class UpgradeJobModelSnapshotResponseTests extends AbstractXContentTestCase<UpgradeJobModelSnapshotResponse> {
+
+    /** Builds a response whose two constructor arguments are each randomly left {@code null}. */
+    @Override
+    protected UpgradeJobModelSnapshotResponse createTestInstance() {
+        return new UpgradeJobModelSnapshotResponse(randomBoolean() ? null : randomBoolean(),
+            randomBoolean() ? null : randomAlphaOfLength(10));
+    }
+
+    /** Parses an instance back through {@code UpgradeJobModelSnapshotResponse.fromXContent}. */
+    @Override
+    protected UpgradeJobModelSnapshotResponse doParseInstance(XContentParser parser) throws IOException {
+        return UpgradeJobModelSnapshotResponse.fromXContent(parser);
+    }
+
+    /** Declares to the base test case that unknown fields are supported by the parser under test. */
+    @Override
+    protected boolean supportsUnknownFields() {
+        return true;
+    }
+}

+ 44 - 0
docs/java-rest/high-level/ml/upgrade-job-model-snapshot.asciidoc

@@ -0,0 +1,44 @@
+--
+:api: upgrade-job-model-snapshot
+:request: UpgradeJobModelSnapshotRequest
+:response: UpgradeJobModelSnapshotResponse
+--
+[role="xpack"]
+
+[id="{upid}-{api}"]
+=== Upgrade job snapshot API
+
+Upgrades a previously stored {ml} model snapshot to the
+current major version.
+It accepts an +{request}+ object and responds
+with an +{response}+ object.
+
+[id="{upid}-{api}-request"]
+==== Upgrade job snapshot request
+
+A +{request}+ requires the following arguments:
+
+["source","java",subs="attributes,callouts,macros"]
+--------------------------------------------------
+include-tagged::{doc-tests-file}[{api}-request]
+--------------------------------------------------
+<1> The job that owns the snapshot
+<2> The snapshot id to upgrade
+<3> The timeout of the request. When `wait_for_completion` is `false` the timeout value is
+    applied to the time it takes for the task to be assigned to a node. When `wait_for_completion`
+    is `true` this timeout applies for the whole upgrade process. The default value is `30m`.
+<4> When true, this causes the request to not return until the upgrade is complete.
+    Otherwise, the response returns as soon as the task
+    is assigned to a node. The default is false.
+
+[id="{upid}-{api}-response"]
+==== Upgrade job snapshot response
+
+A +{response}+ contains information on whether the upgrade was completed and, if it was not, its assigned node.
+
+["source","java",subs="attributes,callouts,macros"]
+--------------------------------------------------
+include-tagged::{doc-tests-file}[{api}-response]
+--------------------------------------------------
+<1> Whether the upgrade was completed
+<2> The assigned node if the task is not completed

+ 2 - 0
docs/java-rest/high-level/supported-apis.asciidoc

@@ -344,6 +344,7 @@ The Java High Level REST Client supports the following {ml} APIs:
 * <<{upid}-update-datafeed>>
 * <<{upid}-update-filter>>
 * <<{upid}-update-model-snapshot>>
+* <<{upid}-upgrade-job-model-snapshot>>
 
 // CLOSE
 include::ml/close-job.asciidoc[]
@@ -421,6 +422,7 @@ include::ml/update-data-frame-analytics.asciidoc[]
 include::ml/update-datafeed.asciidoc[]
 include::ml/update-filter.asciidoc[]
 include::ml/update-model-snapshot.asciidoc[]
+include::ml/upgrade-job-model-snapshot.asciidoc[]
 
 == Migration APIs
 

+ 85 - 0
docs/reference/ml/anomaly-detection/apis/upgrade-job-model-snapshot.asciidoc

@@ -0,0 +1,85 @@
+[role="xpack"]
+[testenv="platinum"]
+[[ml-upgrade-job-model-snapshot]]
+= Upgrade {anomaly-jobs} Snapshot API
+++++
+<titleabbrev>Upgrade job snapshot</titleabbrev>
+++++
+
+Upgrades an {anomaly-job} snapshot to the latest major version.
+
+[[ml-upgrade-job-model-snapshot-request]]
+== {api-request-title}
+
+`POST _ml/anomaly_detectors/{job_id}/model_snapshots/{snapshot_id}/_upgrade`
+
+[[ml-upgrade-job-model-snapshot-prereqs]]
+== {api-prereq-title}
+
+* If the {es} {security-features} are enabled, you must have `manage_ml` or
+`manage` cluster privileges to use this API. See
+<<security-privileges>> and {ml-docs-setup-privileges}.
+* The upgraded snapshot must have a version matching the previous major version.
+* The upgraded snapshot must NOT be the current {anomaly-job} snapshot.
+
+[[ml-upgrade-job-model-snapshot-desc]]
+== {api-description-title}
+
+Over time, older snapshot formats are deprecated and removed. {anomaly-jobs} will only
+support snapshots that are from the current or previous major version.
+
+This API provides a means to upgrade a snapshot to the current major version. This
+aids in preparing the cluster for an upgrade to the next major version.
+
+Only one snapshot per {anomaly-job} can be upgraded at a time and the upgraded snapshot cannot
+be the current snapshot of the {anomaly-job}.
+
+[[ml-upgrade-job-model-snapshot-path-parms]]
+== {api-path-parms-title}
+
+`<job_id>`::
+(Required, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=job-id-anomaly-detection]
+
+`<snapshot_id>`::
+(Required, string)
+include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=snapshot-id]
+
+[[ml-upgrade-job-model-snapshot-query-parms]]
+== {api-query-parms-title}
+
+`timeout`::
+  (Optional, time) Controls the time to wait for the request to complete. The default
+  value is 30 minutes.
+
+`wait_for_completion`::
+(Optional, boolean) When true, the API won't respond until the upgrade is complete. Otherwise,
+it responds as soon as the upgrade task is assigned to a node. Default is false.
+
+[[ml-upgrade-job-model-snapshot-response-body]]
+== {api-response-body-title}
+
+`node`::
+  (string) The ID of the assigned node for the upgrade task if it is still running.
+
+`completed`::
+  (boolean) When true, this means the task is complete. When false, it is still running.
+
+[[ml-upgrade-job-model-snapshot-example]]
+== {api-examples-title}
+
+[source,console]
+--------------------------------------------------
+POST _ml/anomaly_detectors/low_request_rate/model_snapshots/1828371/_upgrade?timeout=45m&wait_for_completion=true
+--------------------------------------------------
+// TEST[skip:Kibana sample data]
+
+When the snapshot upgrade starts, you receive the following results:
+
+[source,console-result]
+----
+{
+  "completed" : false,
+  "node" : "node-1"
+}
+----

+ 3 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/XPackClientPlugin.java

@@ -141,6 +141,7 @@ import org.elasticsearch.xpack.core.ml.action.ValidateJobConfigAction;
 import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsTaskState;
 import org.elasticsearch.xpack.core.ml.job.config.JobTaskState;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
 import org.elasticsearch.xpack.core.monitoring.MonitoringFeatureSetUsage;
 import org.elasticsearch.xpack.core.rollup.RollupFeatureSetUsage;
 import org.elasticsearch.xpack.core.rollup.RollupField;
@@ -534,6 +535,8 @@ public class XPackClientPlugin extends Plugin implements ActionPlugin, NetworkPl
                 new NamedXContentRegistry.Entry(PersistentTaskState.class, new ParseField(JobTaskState.NAME), JobTaskState::fromXContent),
                 new NamedXContentRegistry.Entry(PersistentTaskState.class, new ParseField(DataFrameAnalyticsTaskState.NAME),
                     DataFrameAnalyticsTaskState::fromXContent),
+                new NamedXContentRegistry.Entry(PersistentTaskState.class, new ParseField(SnapshotUpgradeTaskState.NAME),
+                    SnapshotUpgradeTaskState::fromXContent),
                 // watcher
                 new NamedXContentRegistry.Entry(Metadata.Custom.class, new ParseField(WatcherMetadata.TYPE),
                         WatcherMetadata::fromXContent),

+ 13 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/MlTasks.java

@@ -26,10 +26,12 @@ public final class MlTasks {
     public static final String JOB_TASK_NAME = "xpack/ml/job";
     public static final String DATAFEED_TASK_NAME = "xpack/ml/datafeed";
     public static final String DATA_FRAME_ANALYTICS_TASK_NAME = "xpack/ml/data_frame/analytics";
+    public static final String JOB_SNAPSHOT_UPGRADE_TASK_NAME = "xpack/ml/job/snapshot/upgrade";
 
     public static final String JOB_TASK_ID_PREFIX = "job-";
     public static final String DATAFEED_TASK_ID_PREFIX = "datafeed-";
     public static final String DATA_FRAME_ANALYTICS_TASK_ID_PREFIX = "data_frame_analytics-";
+    public static final String JOB_SNAPSHOT_UPGRADE_TASK_ID_PREFIX = "job-snapshot-upgrade-";
 
     public static final PersistentTasksCustomMetadata.Assignment AWAITING_UPGRADE =
         new PersistentTasksCustomMetadata.Assignment(null,
@@ -54,6 +56,10 @@ public final class MlTasks {
         return DATAFEED_TASK_ID_PREFIX + datafeedId;
     }
 
+    /**
+     * Namespaces the task ids for job model snapshot upgrades.
+     * The id is {@link #JOB_SNAPSHOT_UPGRADE_TASK_ID_PREFIX} followed by the job id,
+     * a {@code "-"} separator and the snapshot id.
+     */
+    public static String snapshotUpgradeTaskId(String jobId, String snapshotId) {
+        return JOB_SNAPSHOT_UPGRADE_TASK_ID_PREFIX + jobId + "-" + snapshotId;
+    }
+
     /**
      * Namespaces the task ids for data frame analytics.
      */
@@ -78,6 +84,13 @@ public final class MlTasks {
         return tasks == null ? null : tasks.getTask(dataFrameAnalyticsTaskId(analyticsId));
     }
 
+    /**
+     * Looks up the persistent task registered under the snapshot upgrade task id
+     * for the given job and snapshot.
+     *
+     * @param jobId      the job id used to build the task id
+     * @param snapshotId the snapshot id used to build the task id
+     * @param tasks      the persistent tasks metadata to search, may be {@code null}
+     * @return {@code null} when {@code tasks} is {@code null}, otherwise the result of
+     *         {@code tasks.getTask} for the snapshot upgrade task id
+     */
+    @Nullable
+    public static PersistentTasksCustomMetadata.PersistentTask<?> getSnapshotUpgraderTask(String jobId,
+                                                                                          String snapshotId,
+                                                                                          @Nullable PersistentTasksCustomMetadata tasks) {
+        return tasks == null ? null : tasks.getTask(snapshotUpgradeTaskId(jobId, snapshotId));
+    }
+
     /**
      * Note that the return value of this method does NOT take node relocations into account.
      * Use {@link #getJobStateModifiedForReassignments} to return a value adjusted to the most

+ 204 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/UpgradeJobModelSnapshotAction.java

@@ -0,0 +1,204 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.core.ml.action;
+
+import org.elasticsearch.action.ActionRequestValidationException;
+import org.elasticsearch.action.ActionResponse;
+import org.elasticsearch.action.ActionType;
+import org.elasticsearch.action.support.master.MasterNodeRequest;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.ToXContentObject;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.xpack.core.ml.job.config.Job;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+
+import java.io.IOException;
+import java.util.Objects;
+
+/**
+ * Action type for upgrading an anomaly detection job model snapshot.
+ * It is registered under {@link #NAME} and its responses are read with {@code Response::new}.
+ */
+public class UpgradeJobModelSnapshotAction extends ActionType<UpgradeJobModelSnapshotAction.Response> {
+
+    public static final UpgradeJobModelSnapshotAction INSTANCE = new UpgradeJobModelSnapshotAction();
+    public static final String NAME = "cluster:admin/xpack/ml/job/model_snapshots/upgrade";
+
+    // Singleton: use INSTANCE.
+    private UpgradeJobModelSnapshotAction() {
+        super(NAME, Response::new);
+    }
+
+    /**
+     * Master node request identifying the snapshot to upgrade (job id + snapshot id),
+     * together with a timeout and a flag saying whether to wait for completion.
+     */
+    public static class Request extends MasterNodeRequest<Request> implements ToXContentObject {
+        // Default to 30m as loading an older snapshot can take a while
+        public static final TimeValue DEFAULT_TIMEOUT = TimeValue.timeValueMinutes(30);
+
+        public static final ParseField SNAPSHOT_ID = new ParseField("snapshot_id");
+        public static final ParseField TIMEOUT = new ParseField("timeout");
+        public static final ParseField WAIT_FOR_COMPLETION = new ParseField("wait_for_completion");
+
+        // The positions in a[] follow the declaration order in the static block below.
+        private static final ConstructingObjectParser<Request, Void> PARSER = new ConstructingObjectParser<>(
+            NAME,
+            a -> new UpgradeJobModelSnapshotAction.Request((String) a[0], (String) a[1], (String) a[2], (Boolean) a[3]));
+        static {
+            PARSER.declareString(ConstructingObjectParser.constructorArg(), Job.ID);
+            PARSER.declareString(ConstructingObjectParser.constructorArg(), SNAPSHOT_ID);
+            PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), TIMEOUT);
+            PARSER.declareBoolean(ConstructingObjectParser.optionalConstructorArg(), WAIT_FOR_COMPLETION);
+        }
+
+        /** Parses a request from XContent using {@link #PARSER}. */
+        public static UpgradeJobModelSnapshotAction.Request parseRequest(XContentParser parser) {
+            return PARSER.apply(parser, null);
+        }
+
+        private final String jobId;
+        private final String snapshotId;
+        private final TimeValue timeout;
+        private final boolean waitForCompletion;
+
+        /**
+         * Parser-facing constructor taking the raw optional values.
+         * A non-null {@code timeout} string is parsed with {@code TimeValue.parseTimeValue};
+         * a {@code null} {@code waitForCompletion} is treated as {@code false}.
+         */
+        Request(String jobId, String snapshotId, String timeout, Boolean waitForCompletion) {
+            this(jobId,
+                snapshotId,
+                timeout == null ? null : TimeValue.parseTimeValue(timeout, TIMEOUT.getPreferredName()),
+                waitForCompletion != null && waitForCompletion);
+        }
+
+        /**
+         * @param jobId             the job id, checked with {@code ExceptionsHelper.requireNonNull}
+         * @param snapshotId        the snapshot id, checked with {@code ExceptionsHelper.requireNonNull}
+         * @param timeValue         the timeout; {@link #DEFAULT_TIMEOUT} is used when {@code null}
+         * @param waitForCompletion the wait-for-completion flag stored on the request
+         */
+        public Request(String jobId, String snapshotId, TimeValue timeValue, boolean waitForCompletion) {
+            this.jobId = ExceptionsHelper.requireNonNull(jobId, Job.ID);
+            this.snapshotId = ExceptionsHelper.requireNonNull(snapshotId, SNAPSHOT_ID);
+            this.timeout = timeValue == null ? DEFAULT_TIMEOUT : timeValue;
+            this.waitForCompletion = waitForCompletion;
+        }
+
+        /** Reads the request from the wire; the field order must match {@link #writeTo(StreamOutput)}. */
+        public Request(StreamInput in) throws IOException {
+            super(in);
+            this.jobId = in.readString();
+            this.snapshotId = in.readString();
+            this.timeout = in.readTimeValue();
+            this.waitForCompletion = in.readBoolean();
+        }
+
+        /** Writes the request to the wire in the order the stream constructor reads it. */
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            super.writeTo(out);
+            out.writeString(jobId);
+            out.writeString(snapshotId);
+            out.writeTimeValue(timeout);
+            out.writeBoolean(waitForCompletion);
+        }
+
+        /** Always returns {@code null}: no request-level validation is performed here. */
+        @Override
+        public ActionRequestValidationException validate() {
+            return null;
+        }
+
+        public String getJobId() {
+            return jobId;
+        }
+
+        public String getSnapshotId() {
+            return snapshotId;
+        }
+
+        public TimeValue getTimeout() {
+            return timeout;
+        }
+
+        public boolean isWaitForCompletion() {
+            return waitForCompletion;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            Request request = (Request) o;
+            return Objects.equals(jobId, request.jobId) &&
+                Objects.equals(timeout, request.timeout) &&
+                Objects.equals(snapshotId, request.snapshotId) &&
+                waitForCompletion == request.waitForCompletion;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(jobId, snapshotId, timeout, waitForCompletion);
+        }
+
+        /** Writes all four fields; the timeout is rendered via {@code getStringRep()}. */
+        @Override
+        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+            builder.startObject();
+            builder.field(Job.ID.getPreferredName(), jobId);
+            builder.field(SNAPSHOT_ID.getPreferredName(), snapshotId);
+            builder.field(TIMEOUT.getPreferredName(), timeout.getStringRep());
+            builder.field(WAIT_FOR_COMPLETION.getPreferredName(), waitForCompletion);
+            builder.endObject();
+            return builder;
+        }
+    }
+
+    /**
+     * Response carrying a {@code completed} flag and an optional {@code node} string.
+     */
+    public static class Response extends ActionResponse implements ToXContentObject {
+
+        public static final ParseField NODE = new ParseField("node");
+        public static final ParseField COMPLETED = new ParseField("completed");
+
+        // The positions in a[] follow the declaration order in the static block below.
+        private static final ConstructingObjectParser<Response, Void> PARSER = new ConstructingObjectParser<>(
+            NAME,
+            a -> new UpgradeJobModelSnapshotAction.Response((boolean) a[0], (String) a[1]));
+        static {
+            PARSER.declareBoolean(ConstructingObjectParser.constructorArg(), COMPLETED);
+            PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), NODE);
+        }
+
+        /**
+         * Parses a response from XContent using {@link #PARSER}.
+         * NOTE(review): the method is named {@code parseRequest} although it returns a
+         * {@code Response}; renaming it would change the public interface.
+         */
+        public static UpgradeJobModelSnapshotAction.Response parseRequest(XContentParser parser) {
+            return PARSER.apply(parser, null);
+        }
+
+        private final boolean completed;
+        private final String node;
+
+        /**
+         * @param completed the completed flag
+         * @param node      the node string, may be {@code null}
+         */
+        public Response(boolean completed, String node) {
+            this.completed = completed;
+            this.node = node;
+        }
+
+        /** Reads the response from the wire; the field order must match {@link #writeTo(StreamOutput)}. */
+        public Response(StreamInput in) throws IOException {
+            super(in);
+            this.completed = in.readBoolean();
+            this.node = in.readOptionalString();
+        }
+
+        /** Writes the response to the wire in the order the stream constructor reads it. */
+        @Override
+        public void writeTo(StreamOutput out) throws IOException {
+            out.writeBoolean(completed);
+            out.writeOptionalString(node);
+        }
+
+        /** Writes {@code completed} always and {@code node} only when it is non-null. */
+        @Override
+        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+            builder.startObject();
+            builder.field(COMPLETED.getPreferredName(), completed);
+            if (node != null) {
+                builder.field(NODE.getPreferredName(), node);
+            }
+            builder.endObject();
+            return builder;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            Response response = (Response) o;
+            return completed == response.completed &&
+                Objects.equals(node, response.node);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(completed, node);
+        }
+    }
+}

+ 37 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/snapshot/upgrade/SnapshotUpgradeState.java

@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.core.ml.job.snapshot.upgrade;
+
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.io.stream.Writeable;
+
+import java.io.IOException;
+import java.util.Locale;
+
+/**
+ * The states a model snapshot upgrade can be in.
+ * Constants are streamed as enums and rendered in lower case by {@link #toString()}.
+ */
+public enum SnapshotUpgradeState implements Writeable {
+
+    LOADING_OLD_STATE,
+    SAVING_NEW_STATE,
+    STOPPED,
+    FAILED;
+
+    /** Reads a state from the stream as an enum value. */
+    public static SnapshotUpgradeState fromStream(StreamInput in) throws IOException {
+        return in.readEnum(SnapshotUpgradeState.class);
+    }
+
+    /**
+     * Resolves a state from its name; the name is trimmed and upper-cased
+     * (root locale) before the constant lookup.
+     */
+    public static SnapshotUpgradeState fromString(String name) {
+        final String constantName = name.trim().toUpperCase(Locale.ROOT);
+        return SnapshotUpgradeState.valueOf(constantName);
+    }
+
+    /** Returns the constant name in lower case (root locale). */
+    @Override
+    public String toString() {
+        final String constantName = name();
+        return constantName.toLowerCase(Locale.ROOT);
+    }
+
+    /** Writes this state to the stream as an enum value. */
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        out.writeEnum(this);
+    }
+
+}

+ 112 - 0
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/snapshot/upgrade/SnapshotUpgradeTaskState.java

@@ -0,0 +1,112 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.core.ml.job.snapshot.upgrade;
+
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.persistent.PersistentTaskState;
+import org.elasticsearch.xpack.core.ml.MlTasks;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Objects;
+
+/**
+ * Persistent task state for a model snapshot upgrade task. It holds the current
+ * {@link SnapshotUpgradeState}, an allocation id and an optional reason string.
+ */
+public class SnapshotUpgradeTaskState implements PersistentTaskState {
+
+    public static final String NAME = MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME;
+
+    // Field-name constants; final so they cannot be reassigned after class initialisation.
+    private static final ParseField STATE = new ParseField("state");
+    private static final ParseField ALLOCATION_ID = new ParseField("allocation_id");
+    private static final ParseField REASON = new ParseField("reason");
+
+    private final SnapshotUpgradeState state;
+    private final long allocationId;
+    private final String reason;
+
+    // The positions in a[] follow the declaration order in the static block below.
+    private static final ConstructingObjectParser<SnapshotUpgradeTaskState, Void> PARSER =
+        new ConstructingObjectParser<>(NAME, true,
+            a -> new SnapshotUpgradeTaskState((SnapshotUpgradeState) a[0], (long) a[1], (String) a[2]));
+
+    static {
+        PARSER.declareString(ConstructingObjectParser.constructorArg(), SnapshotUpgradeState::fromString, STATE);
+        PARSER.declareLong(ConstructingObjectParser.constructorArg(), ALLOCATION_ID);
+        PARSER.declareString(ConstructingObjectParser.optionalConstructorArg(), REASON);
+    }
+
+    /**
+     * Parses a task state from XContent.
+     *
+     * @throws UncheckedIOException wrapping any {@link IOException} raised while parsing
+     */
+    public static SnapshotUpgradeTaskState fromXContent(XContentParser parser) {
+        try {
+            return PARSER.parse(parser, null);
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+
+    /**
+     * @param state        the upgrade state, must not be {@code null}
+     * @param allocationId the allocation id stored with the state
+     * @param reason       an optional reason, may be {@code null}
+     */
+    public SnapshotUpgradeTaskState(SnapshotUpgradeState state, long allocationId, @Nullable String reason) {
+        this.state = Objects.requireNonNull(state);
+        this.allocationId = allocationId;
+        this.reason = reason;
+    }
+
+    /** Reads the task state from the wire; the field order must match {@link #writeTo(StreamOutput)}. */
+    public SnapshotUpgradeTaskState(StreamInput in) throws IOException {
+        this.state = SnapshotUpgradeState.fromStream(in);
+        this.allocationId = in.readLong();
+        this.reason = in.readOptionalString();
+    }
+
+    public SnapshotUpgradeState getState() {
+        return state;
+    }
+
+    @Nullable
+    public String getReason() {
+        return reason;
+    }
+
+    @Override
+    public String getWriteableName() {
+        return NAME;
+    }
+
+    /** Writes the task state to the wire in the order the stream constructor reads it. */
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        state.writeTo(out);
+        out.writeLong(allocationId);
+        out.writeOptionalString(reason);
+    }
+
+    /** Writes {@code state} and {@code allocation_id} always, and {@code reason} only when it is non-null. */
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject();
+        builder.field(STATE.getPreferredName(), state.toString());
+        builder.field(ALLOCATION_ID.getPreferredName(), allocationId);
+        if (reason != null) {
+            builder.field(REASON.getPreferredName(), reason);
+        }
+        builder.endObject();
+        return builder;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        SnapshotUpgradeTaskState that = (SnapshotUpgradeTaskState) o;
+        return allocationId == that.allocationId &&
+            state == that.state &&
+            Objects.equals(reason, that.reason);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(state, allocationId, reason);
+    }
+}

+ 37 - 0
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/UpgradeJobModelSnapshotRequestTests.java

@@ -0,0 +1,37 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.core.ml.action;
+
+import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.test.AbstractSerializingTestCase;
+import org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction.Request;
+
+/**
+ * Serialization tests for {@link UpgradeJobModelSnapshotAction.Request}, driven by
+ * {@link AbstractSerializingTestCase}.
+ */
+public class UpgradeJobModelSnapshotRequestTests extends AbstractSerializingTestCase<Request> {
+
+    /**
+     * Builds a request from two random 10-character ids; the third and fourth
+     * constructor arguments are each randomly left {@code null}.
+     */
+    @Override
+    protected Request createTestInstance() {
+        return new Request(randomAlphaOfLength(10),
+            randomAlphaOfLength(10),
+            randomBoolean() ? null : randomTimeValue(),
+            randomBoolean() ? null : randomBoolean());
+    }
+
+    /** Supplies the stream constructor used to read instances back from the wire. */
+    @Override
+    protected Writeable.Reader<Request> instanceReader() {
+        return Request::new;
+    }
+
+    /** Declares to the base test case that unknown fields are not supported by the parser under test. */
+    @Override
+    protected boolean supportsUnknownFields() {
+        return false;
+    }
+
+    /** Parses an instance back through {@code Request.parseRequest}. */
+    @Override
+    protected Request doParseInstance(XContentParser parser) {
+        return Request.parseRequest(parser);
+    }
+}

+ 34 - 0
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/UpgradeJobModelSnapshotResponseTests.java

@@ -0,0 +1,34 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.core.ml.action;
+
+import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.test.AbstractSerializingTestCase;
+import org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction.Response;
+
+/**
+ * Serialization tests for {@link UpgradeJobModelSnapshotAction.Response}, driven by
+ * {@link AbstractSerializingTestCase}.
+ */
+public class UpgradeJobModelSnapshotResponseTests extends AbstractSerializingTestCase<Response> {
+
+    /** Builds a response with a random completed flag and a node that is randomly {@code null}. */
+    @Override
+    protected Response createTestInstance() {
+        return new Response(randomBoolean(), randomBoolean() ? null : randomAlphaOfLength(10));
+    }
+
+    /** Supplies the stream constructor used to read instances back from the wire. */
+    @Override
+    protected Writeable.Reader<Response> instanceReader() {
+        return Response::new;
+    }
+
+    /** Declares to the base test case that unknown fields are not supported by the parser under test. */
+    @Override
+    protected boolean supportsUnknownFields() {
+        return false;
+    }
+
+    /** Parses an instance back through {@code Response.parseRequest}, the parse method the Response class exposes. */
+    @Override
+    protected Response doParseInstance(XContentParser parser) {
+        return Response.parseRequest(parser);
+    }
+}

+ 43 - 15
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java

@@ -141,6 +141,7 @@ import org.elasticsearch.xpack.core.ml.action.UpdateFilterAction;
 import org.elasticsearch.xpack.core.ml.action.UpdateJobAction;
 import org.elasticsearch.xpack.core.ml.action.UpdateModelSnapshotAction;
 import org.elasticsearch.xpack.core.ml.action.UpdateProcessAction;
+import org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction;
 import org.elasticsearch.xpack.core.ml.action.ValidateDetectorAction;
 import org.elasticsearch.xpack.core.ml.action.ValidateJobConfigAction;
 import org.elasticsearch.xpack.core.ml.datafeed.DatafeedState;
@@ -153,6 +154,7 @@ import org.elasticsearch.xpack.core.ml.inference.persistence.InferenceIndexConst
 import org.elasticsearch.xpack.core.ml.job.config.JobTaskState;
 import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
 import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndexFields;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
 import org.elasticsearch.xpack.core.ml.notifications.NotificationsIndex;
 import org.elasticsearch.xpack.core.template.TemplateUtils;
 import org.elasticsearch.xpack.ml.action.TransportCloseJobAction;
@@ -218,6 +220,7 @@ import org.elasticsearch.xpack.ml.action.TransportUpdateFilterAction;
 import org.elasticsearch.xpack.ml.action.TransportUpdateJobAction;
 import org.elasticsearch.xpack.ml.action.TransportUpdateModelSnapshotAction;
 import org.elasticsearch.xpack.ml.action.TransportUpdateProcessAction;
+import org.elasticsearch.xpack.ml.action.TransportUpgradeJobModelSnapshotAction;
 import org.elasticsearch.xpack.ml.action.TransportValidateDetectorAction;
 import org.elasticsearch.xpack.ml.action.TransportValidateJobConfigAction;
 import org.elasticsearch.xpack.ml.annotations.AnnotationPersister;
@@ -259,6 +262,8 @@ import org.elasticsearch.xpack.ml.job.process.normalizer.MultiplyingNormalizerPr
 import org.elasticsearch.xpack.ml.job.process.normalizer.NativeNormalizerProcessFactory;
 import org.elasticsearch.xpack.ml.job.process.normalizer.NormalizerFactory;
 import org.elasticsearch.xpack.ml.job.process.normalizer.NormalizerProcessFactory;
+import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradeTaskExecutor;
+import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradeTaskParams;
 import org.elasticsearch.xpack.ml.job.task.OpenJobPersistentTasksExecutor;
 import org.elasticsearch.xpack.ml.notifications.AnomalyDetectionAuditor;
 import org.elasticsearch.xpack.ml.notifications.DataFrameAnalyticsAuditor;
@@ -322,6 +327,7 @@ import org.elasticsearch.xpack.ml.rest.job.RestOpenJobAction;
 import org.elasticsearch.xpack.ml.rest.job.RestPostDataAction;
 import org.elasticsearch.xpack.ml.rest.job.RestPostJobUpdateAction;
 import org.elasticsearch.xpack.ml.rest.job.RestPutJobAction;
+import org.elasticsearch.xpack.ml.rest.modelsnapshots.RestUpgradeJobModelSnapshotAction;
 import org.elasticsearch.xpack.ml.rest.modelsnapshots.RestDeleteModelSnapshotAction;
 import org.elasticsearch.xpack.ml.rest.modelsnapshots.RestGetModelSnapshotsAction;
 import org.elasticsearch.xpack.ml.rest.modelsnapshots.RestRevertModelSnapshotAction;
@@ -638,11 +644,11 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
 
         NativeStorageProvider nativeStorageProvider = new NativeStorageProvider(environment, MIN_DISK_SPACE_OFF_HEAP.get(settings));
 
-        MlController mlController;
-        AutodetectProcessFactory autodetectProcessFactory;
-        NormalizerProcessFactory normalizerProcessFactory;
-        AnalyticsProcessFactory<AnalyticsResult> analyticsProcessFactory;
-        AnalyticsProcessFactory<MemoryUsageEstimationResult> memoryEstimationProcessFactory;
+        final MlController mlController;
+        final AutodetectProcessFactory autodetectProcessFactory;
+        final NormalizerProcessFactory normalizerProcessFactory;
+        final AnalyticsProcessFactory<AnalyticsResult> analyticsProcessFactory;
+        final AnalyticsProcessFactory<MemoryUsageEstimationResult> memoryEstimationProcessFactory;
         if (MachineLearningField.AUTODETECT_PROCESS.get(settings)) {
             try {
                 NativeController nativeController =
@@ -677,8 +683,8 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
             }
         } else {
             mlController = new DummyController();
-            autodetectProcessFactory = (job, autodetectParams, executorService, onProcessCrash) ->
-                    new BlackHoleAutodetectProcess(job.getId(), onProcessCrash);
+            autodetectProcessFactory = (pipelineId, job, autodetectParams, executorService, onProcessCrash) ->
+                new BlackHoleAutodetectProcess(pipelineId, onProcessCrash);
             // factor of 1.0 makes renormalization a no-op
             normalizerProcessFactory = (jobId, quantilesState, bucketSpan, executorService) -> new MultiplyingNormalizerProcess(1.0);
             analyticsProcessFactory = (jobId, analyticsProcessConfig, hasState, executorService, onProcessCrash) -> null;
@@ -687,9 +693,9 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
         NormalizerFactory normalizerFactory = new NormalizerFactory(normalizerProcessFactory,
                 threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME));
         AutodetectProcessManager autodetectProcessManager = new AutodetectProcessManager(settings, client, threadPool,
-                xContentRegistry, anomalyDetectionAuditor, clusterService, jobManager, jobResultsProvider, jobResultsPersister,
-                jobDataCountsPersister, anomalyDetectionAnnotationPersister, autodetectProcessFactory, normalizerFactory,
-                nativeStorageProvider, indexNameExpressionResolver);
+            xContentRegistry, anomalyDetectionAuditor, clusterService, jobManager, jobResultsProvider, jobResultsPersister,
+            jobDataCountsPersister, anomalyDetectionAnnotationPersister, autodetectProcessFactory,
+            normalizerFactory, nativeStorageProvider, indexNameExpressionResolver);
         this.autodetectProcessManager.set(autodetectProcessManager);
         DatafeedJobBuilder datafeedJobBuilder =
             new DatafeedJobBuilder(
@@ -804,12 +810,27 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
         }
 
         return Arrays.asList(
-                new OpenJobPersistentTasksExecutor(settings, clusterService, autodetectProcessManager.get(),
-                    memoryTracker.get(), client, expressionResolver),
+                new OpenJobPersistentTasksExecutor(settings,
+                    clusterService,
+                    autodetectProcessManager.get(),
+                    memoryTracker.get(),
+                    client,
+                    expressionResolver),
                 new TransportStartDatafeedAction.StartDatafeedPersistentTasksExecutor(datafeedManager.get(), expressionResolver),
-                new TransportStartDataFrameAnalyticsAction.TaskExecutor(settings, client, clusterService, dataFrameAnalyticsManager.get(),
-                    dataFrameAnalyticsAuditor.get(), memoryTracker.get(), expressionResolver,
-                    MlIndexTemplateRegistry.INFERENCE_TEMPLATE)
+                new TransportStartDataFrameAnalyticsAction.TaskExecutor(settings,
+                    client,
+                    clusterService,
+                    dataFrameAnalyticsManager.get(),
+                    dataFrameAnalyticsAuditor.get(),
+                    memoryTracker.get(),
+                    expressionResolver,
+                    MlIndexTemplateRegistry.INFERENCE_TEMPLATE),
+                new SnapshotUpgradeTaskExecutor(settings,
+                    clusterService,
+                    autodetectProcessManager.get(),
+                    memoryTracker.get(),
+                    expressionResolver,
+                    client)
         );
     }
 
@@ -882,6 +903,7 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
             new RestDeleteTrainedModelAction(),
             new RestGetTrainedModelsStatsAction(),
             new RestPutTrainedModelAction(),
+            new RestUpgradeJobModelSnapshotAction(),
             // CAT Handlers
             new RestCatJobsAction(),
             new RestCatTrainedModelsAction(),
@@ -965,6 +987,7 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
                 new ActionHandler<>(DeleteTrainedModelAction.INSTANCE, TransportDeleteTrainedModelAction.class),
                 new ActionHandler<>(GetTrainedModelsStatsAction.INSTANCE, TransportGetTrainedModelsStatsAction.class),
                 new ActionHandler<>(PutTrainedModelAction.INSTANCE, TransportPutTrainedModelAction.class),
+                new ActionHandler<>(UpgradeJobModelSnapshotAction.INSTANCE, TransportUpgradeJobModelSnapshotAction.class),
             usageAction,
                 infoAction);
     }
@@ -1090,12 +1113,17 @@ public class MachineLearning extends Plugin implements SystemIndexPlugin,
             OpenJobAction.JobParams::new));
         namedWriteables.add(new NamedWriteableRegistry.Entry(PersistentTaskParams.class, MlTasks.DATA_FRAME_ANALYTICS_TASK_NAME,
             StartDataFrameAnalyticsAction.TaskParams::new));
+        namedWriteables.add(new NamedWriteableRegistry.Entry(PersistentTaskParams.class, MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME,
+            SnapshotUpgradeTaskParams::new));
 
         // Persistent task states
         namedWriteables.add(new NamedWriteableRegistry.Entry(PersistentTaskState.class, JobTaskState.NAME, JobTaskState::new));
         namedWriteables.add(new NamedWriteableRegistry.Entry(PersistentTaskState.class, DatafeedState.NAME, DatafeedState::fromStream));
         namedWriteables.add(new NamedWriteableRegistry.Entry(PersistentTaskState.class, DataFrameAnalyticsTaskState.NAME,
             DataFrameAnalyticsTaskState::new));
+        namedWriteables.add(new NamedWriteableRegistry.Entry(PersistentTaskState.class,
+            SnapshotUpgradeTaskState.NAME,
+            SnapshotUpgradeTaskState::new));
 
         namedWriteables.addAll(new MlDataFrameAnalysisNamedXContentProvider().getNamedWriteables());
         namedWriteables.addAll(new AnalysisStatsNamedWriteablesProvider().getNamedWriteables());

+ 6 - 16
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportOpenJobAction.java

@@ -27,7 +27,6 @@ import org.elasticsearch.license.LicenseUtils;
 import org.elasticsearch.license.XPackLicenseState;
 import org.elasticsearch.persistent.PersistentTasksCustomMetadata;
 import org.elasticsearch.persistent.PersistentTasksService;
-import org.elasticsearch.persistent.decider.EnableAssignmentDecider;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
@@ -48,12 +47,10 @@ import org.elasticsearch.xpack.ml.job.JobNodeSelector;
 import org.elasticsearch.xpack.ml.job.persistence.JobConfigProvider;
 import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
 
+import java.util.Optional;
 import java.util.function.Predicate;
 
-import static org.elasticsearch.xpack.core.ml.MlTasks.AWAITING_UPGRADE;
-import static org.elasticsearch.xpack.ml.job.task.OpenJobPersistentTasksExecutor.makeAssignmentsNotAllowedException;
-import static org.elasticsearch.xpack.ml.job.task.OpenJobPersistentTasksExecutor.makeCurrentlyBeingUpgradedException;
-import static org.elasticsearch.xpack.ml.job.task.OpenJobPersistentTasksExecutor.makeNoSuitableNodesException;
+import static org.elasticsearch.xpack.ml.job.task.OpenJobPersistentTasksExecutor.checkAssignmentState;
 
 /*
  This class extends from TransportMasterNodeAction for cluster state observing purposes.
@@ -287,17 +284,10 @@ public class TransportOpenJobAction extends TransportMasterNodeAction<OpenJobAct
 
                 // This logic is only appropriate when opening a job, not when reallocating following a failure,
                 // and this is why this class must only be used when opening a job
-                if (assignment != null && assignment.equals(PersistentTasksCustomMetadata.INITIAL_ASSIGNMENT) == false &&
-                        assignment.isAssigned() == false) {
-                    OpenJobAction.JobParams params = (OpenJobAction.JobParams) persistentTask.getParams();
-                    // Assignment has failed on the master node despite passing our "fast fail" validation
-                    if (assignment.equals(AWAITING_UPGRADE)) {
-                        exception = makeCurrentlyBeingUpgradedException(logger, params.getJobId(), assignment.getExplanation());
-                    } else if (assignment.getExplanation().contains("[" + EnableAssignmentDecider.ALLOCATION_NONE_EXPLANATION + "]")) {
-                        exception = makeAssignmentsNotAllowedException(logger, params.getJobId());
-                    } else {
-                        exception = makeNoSuitableNodesException(logger, params.getJobId(), assignment.getExplanation());
-                    }
+                OpenJobAction.JobParams params = (OpenJobAction.JobParams) persistentTask.getParams();
+                Optional<ElasticsearchException> assignmentException = checkAssignmentState(assignment, params.getJobId(), logger);
+                if (assignmentException.isPresent()) {
+                    exception = assignmentException.get();
                     // The persistent task should be cancelled so that the observed outcome is the
                     // same as if the "fast fail" validation on the coordinating node had failed
                     shouldCancel = true;

+ 11 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportRevertModelSnapshotAction.java

@@ -86,7 +86,17 @@ public class TransportRevertModelSnapshotAction extends TransportMasterNodeActio
                 JobState jobState = MlTasks.getJobState(request.getJobId(), tasks);
 
                 if (jobState.equals(JobState.CLOSED) == false) {
-                    throw ExceptionsHelper.conflictStatusException(Messages.getMessage(Messages.REST_JOB_NOT_CLOSED_REVERT));
+                    listener.onFailure(ExceptionsHelper.conflictStatusException(Messages.getMessage(Messages.REST_JOB_NOT_CLOSED_REVERT)));
+                    return;
+                }
+
+                if (MlTasks.getSnapshotUpgraderTask(request.getJobId(), request.getSnapshotId(), tasks) != null) {
+                    listener.onFailure(ExceptionsHelper.conflictStatusException(
+                        "Cannot revert job [{}] to snapshot [{}] as it is being upgraded",
+                        request.getJobId(),
+                        request.getSnapshotId()
+                    ));
+                    return;
                 }
 
                 getModelSnapshot(request, jobResultsProvider, modelSnapshot -> {

+ 261 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportUpgradeJobModelSnapshotAction.java

@@ -0,0 +1,261 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.action;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.logging.log4j.message.ParameterizedMessage;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.ResourceAlreadyExistsException;
+import org.elasticsearch.ResourceNotFoundException;
+import org.elasticsearch.Version;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.action.support.ActionFilters;
+import org.elasticsearch.action.support.master.TransportMasterNodeAction;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.block.ClusterBlockException;
+import org.elasticsearch.cluster.block.ClusterBlockLevel;
+import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.license.LicenseUtils;
+import org.elasticsearch.license.XPackLicenseState;
+import org.elasticsearch.persistent.PersistentTasksCustomMetadata;
+import org.elasticsearch.persistent.PersistentTasksCustomMetadata.PersistentTask;
+import org.elasticsearch.persistent.PersistentTasksService;
+import org.elasticsearch.tasks.Task;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.transport.TransportService;
+import org.elasticsearch.xpack.core.XPackField;
+import org.elasticsearch.xpack.core.ml.MlTasks;
+import org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction;
+import org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction.Request;
+import org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction.Response;
+import org.elasticsearch.xpack.core.ml.job.config.Job;
+import org.elasticsearch.xpack.core.ml.job.config.JobState;
+import org.elasticsearch.xpack.core.ml.job.messages.Messages;
+import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
+import org.elasticsearch.xpack.core.ml.job.results.Result;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+import org.elasticsearch.xpack.ml.MlConfigMigrationEligibilityCheck;
+import org.elasticsearch.xpack.ml.job.persistence.JobConfigProvider;
+import org.elasticsearch.xpack.ml.job.persistence.JobResultsProvider;
+import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradePredicate;
+import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradeTaskParams;
+import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
+
+
+public class TransportUpgradeJobModelSnapshotAction extends TransportMasterNodeAction<Request, Response> {
+
+    // If the snapshot is from any version other than the current major, we consider it for upgrade.
+    // This is to support upgrading to the NEXT major without worry
+    private static final byte UPGRADE_FROM_MAJOR = Version.CURRENT.major;
+
+    private static final Logger logger = LogManager.getLogger(TransportUpgradeJobModelSnapshotAction.class);
+
+    private final XPackLicenseState licenseState;
+    private final PersistentTasksService persistentTasksService;
+    private final JobConfigProvider jobConfigProvider;
+    private final JobResultsProvider jobResultsProvider;
+    private final MlMemoryTracker memoryTracker;
+    private final MlConfigMigrationEligibilityCheck migrationEligibilityCheck;
+
+    /**
+     * Creates the transport action; the {@code @Inject} annotation marks this constructor for dependency injection.
+     * The action is registered under {@link UpgradeJobModelSnapshotAction#NAME} with the {@code SAME} executor name.
+     *
+     * @param settings                    used to build the config migration eligibility check
+     * @param transportService            passed through to the master node action base class
+     * @param threadPool                  passed through to the master node action base class
+     * @param licenseState                consulted to verify that machine learning is licensed
+     * @param clusterService              passed to the base class and to the config migration eligibility check
+     * @param persistentTasksService      used to start, await and remove the snapshot upgrade persistent task
+     * @param actionFilters               passed through to the master node action base class
+     * @param indexNameExpressionResolver passed through to the master node action base class
+     * @param jobConfigProvider           used to load the job config
+     * @param memoryTracker               used to refresh memory requirements before the task is started
+     * @param jobResultsProvider          used to load the model snapshot
+     */
+    @Inject
+    public TransportUpgradeJobModelSnapshotAction(Settings settings,
+                                                  TransportService transportService,
+                                                  ThreadPool threadPool,
+                                                  XPackLicenseState licenseState,
+                                                  ClusterService clusterService,
+                                                  PersistentTasksService persistentTasksService,
+                                                  ActionFilters actionFilters,
+                                                  IndexNameExpressionResolver indexNameExpressionResolver,
+                                                  JobConfigProvider jobConfigProvider,
+                                                  MlMemoryTracker memoryTracker,
+                                                  JobResultsProvider jobResultsProvider) {
+        super(
+            UpgradeJobModelSnapshotAction.NAME,
+            transportService,
+            clusterService,
+            threadPool,
+            actionFilters,
+            Request::new,
+            indexNameExpressionResolver,
+            Response::new,
+            ThreadPool.Names.SAME);
+        // Built locally from the injected settings and cluster service
+        this.migrationEligibilityCheck = new MlConfigMigrationEligibilityCheck(settings, clusterService);
+        // Injected collaborators
+        this.jobConfigProvider = jobConfigProvider;
+        this.jobResultsProvider = jobResultsProvider;
+        this.memoryTracker = memoryTracker;
+        this.persistentTasksService = persistentTasksService;
+        this.licenseState = licenseState;
+    }
+
+    /**
+     * Returns the global cluster block exception, if any, for the {@code METADATA_WRITE} level.
+     *
+     * @param request the upgrade request (not consulted)
+     * @param state   the cluster state whose blocks are checked
+     * @return whatever {@code globalBlockedException} reports for the {@code METADATA_WRITE} level
+     */
+    @Override
+    protected ClusterBlockException checkBlock(Request request, ClusterState state) {
+        // This action starts a persistent task, and persistent tasks are held in the cluster state metadata
+        final ClusterBlockLevel requiredLevel = ClusterBlockLevel.METADATA_WRITE;
+        return state.blocks().globalBlockedException(requiredLevel);
+    }
+
+    /**
+     * Validates the upgrade request and, if it is acceptable, starts a persistent task that upgrades the snapshot.
+     *
+     * Checks made directly against the supplied cluster state, in order: the job must not be eligible for config
+     * migration, machine learning must be licensed, all nodes must be on the same version, and no snapshot upgrade
+     * task may already exist for the same job. The job config and then the model snapshot are fetched
+     * asynchronously: the snapshot must not be the job's current snapshot unless the job is closed, it must exist,
+     * and it must not already be compatible with the current major version. Finally the memory requirement is
+     * refreshed, the persistent task is started and the configured condition is awaited.
+     *
+     * Validation failures are reported through {@code listener}.
+     *
+     * @param task     the task this action runs as (not consulted)
+     * @param request  identifies the job and snapshot to upgrade, and carries the timeout and wait-for-completion options
+     * @param state    the cluster state the initial checks are made against
+     * @param listener notified with the response once the awaited condition is reached, or with the failure
+     */
+    @Override
+    protected void masterOperation(Task task, Request request, ClusterState state,
+                                   ActionListener<Response> listener) {
+        if (migrationEligibilityCheck.jobIsEligibleForMigration(request.getJobId(), state)) {
+            listener.onFailure(ExceptionsHelper.configHasNotBeenMigrated("upgrade job snapshot", request.getJobId()));
+            return;
+        }
+
+        if (licenseState.checkFeature(XPackLicenseState.Feature.MACHINE_LEARNING) == false) {
+            listener.onFailure(LicenseUtils.newComplianceException(XPackField.MACHINE_LEARNING));
+            return;
+        }
+
+        // Refuse to upgrade while the cluster contains nodes on different versions
+        if (state.nodes().getMaxNodeVersion().after(state.nodes().getMinNodeVersion())) {
+            listener.onFailure(ExceptionsHelper.conflictStatusException(
+                "Cannot upgrade job [{}] snapshot [{}] as not all nodes are on version {}. All nodes must be the same version",
+                request.getJobId(),
+                request.getSnapshotId(),
+                state.nodes().getMaxNodeVersion().toString()));
+            return;
+        }
+
+        // Only one snapshot upgrade task is allowed per job, whichever snapshot it is upgrading
+        PersistentTasksCustomMetadata customMetadata = state.getMetadata().custom(PersistentTasksCustomMetadata.TYPE);
+        if (customMetadata != null && (customMetadata.findTasks(
+            MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME,
+            t -> t.getParams() instanceof SnapshotUpgradeTaskParams
+                && ((SnapshotUpgradeTaskParams)t.getParams()).getJobId().equals(request.getJobId())).isEmpty() == false)) {
+            listener.onFailure(ExceptionsHelper.conflictStatusException(
+                "Cannot upgrade job [{}] snapshot [{}] as there is currently a snapshot for this job being upgraded",
+                request.getJobId(),
+                request.getSnapshotId()));
+            return;
+        }
+
+        final SnapshotUpgradeTaskParams params = new SnapshotUpgradeTaskParams(request.getJobId(), request.getSnapshotId());
+        // Wait for the snapshot upgrade task to be started
+        ActionListener<PersistentTask<SnapshotUpgradeTaskParams>> waitForJobToStart = ActionListener.wrap(
+            persistentTask -> waitForJobStarted(persistentTask.getId(), params, request, listener),
+            e -> {
+                if (ExceptionsHelper.unwrapCause(e) instanceof ResourceAlreadyExistsException) {
+                    e = ExceptionsHelper.conflictStatusException(
+                        "Cannot upgrade job [{}] snapshot [{}] because upgrade is already in progress",
+                        e,
+                        request.getJobId(),
+                        request.getSnapshotId());
+                }
+                listener.onFailure(e);
+            });
+
+        // Start the snapshot upgrade task
+        ActionListener<Long> memoryRequirementRefreshListener = ActionListener.wrap(
+            mem -> {
+                logger.info("[{}] [{}] sending start upgrade request", params.getJobId(), params.getSnapshotId());
+                persistentTasksService.sendStartRequest(
+                    MlTasks.snapshotUpgradeTaskId(params.getJobId(), params.getSnapshotId()),
+                    MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME,
+                    params,
+                    waitForJobToStart);
+            },
+            listener::onFailure
+        );
+
+        // Check that model snapshot exists and should actually be upgraded
+        // Then refresh the memory
+        ActionListener<Result<ModelSnapshot>> getSnapshotHandler = ActionListener.wrap(
+            response -> {
+                if (response == null) {
+                    listener.onFailure(
+                        new ResourceNotFoundException(
+                            Messages.getMessage(Messages.REST_NO_SUCH_MODEL_SNAPSHOT, request.getSnapshotId(), request.getJobId())));
+                    return;
+                }
+                // Only snapshots from an older major version need upgrading. Comparing on the major version, rather
+                // than requiring an exact match with Version.CURRENT, avoids needlessly upgrading snapshots written
+                // by an earlier minor or patch release of the current major.
+                if (response.result.getMinVersion().major >= UPGRADE_FROM_MAJOR) {
+                    listener.onFailure(ExceptionsHelper.conflictStatusException(
+                        "Cannot upgrade job [{}] snapshot [{}] as it is already compatible with current major version {}",
+                        request.getJobId(),
+                        request.getSnapshotId(),
+                        UPGRADE_FROM_MAJOR));
+                    return;
+                }
+                memoryTracker.refreshAnomalyDetectorJobMemoryAndAllOthers(params.getJobId(), memoryRequirementRefreshListener);
+            },
+            listener::onFailure
+        );
+
+        ActionListener<Job> getJobHandler = ActionListener.wrap(
+            job -> {
+                // The job's current snapshot may only be upgraded while the job is closed
+                if (request.getSnapshotId().equals(job.getModelSnapshotId())
+                    && (JobState.CLOSED.equals(MlTasks.getJobState(request.getJobId(), customMetadata)) == false)) {
+                    listener.onFailure(ExceptionsHelper.conflictStatusException(
+                        "Cannot upgrade snapshot [{}] for job [{}] as it is the current primary job snapshot and the job's state is [{}]",
+                        request.getSnapshotId(),
+                        request.getJobId(),
+                        MlTasks.getJobState(request.getJobId(), customMetadata)
+                    ));
+                    return;
+                }
+                jobResultsProvider.getModelSnapshot(
+                    request.getJobId(),
+                    request.getSnapshotId(),
+                    getSnapshotHandler::onResponse,
+                    getSnapshotHandler::onFailure);
+            },
+            listener::onFailure
+        );
+
+        // Get the job config to verify it exists
+        jobConfigProvider.getJob(request.getJobId(), ActionListener.wrap(
+            builder -> getJobHandler.onResponse(builder.build()),
+            listener::onFailure
+        ));
+    }
+
+    /**
+     * Waits, up to the request's timeout, for the persistent task to satisfy a {@link SnapshotUpgradePredicate} and
+     * then reports the outcome to {@code listener}.
+     *
+     * @param taskId   id of the persistent task to wait on
+     * @param params   the task parameters, used for the timeout error message
+     * @param request  supplies the timeout and the wait-for-completion flag
+     * @param listener receives the response, or the failure recorded by the predicate, a wait failure or a timeout
+     */
+    private void waitForJobStarted(String taskId,
+                                   SnapshotUpgradeTaskParams params,
+                                   Request request,
+                                   ActionListener<UpgradeJobModelSnapshotAction.Response> listener) {
+        final SnapshotUpgradePredicate upgradePredicate = new SnapshotUpgradePredicate(request.isWaitForCompletion(), logger);
+        final PersistentTasksService.WaitForPersistentTaskListener<SnapshotUpgradeTaskParams> waitListener =
+            new PersistentTasksService.WaitForPersistentTaskListener<SnapshotUpgradeTaskParams>() {
+                @Override
+                public void onResponse(PersistentTask<SnapshotUpgradeTaskParams> persistentTask) {
+                    final Exception failure = upgradePredicate.getException();
+                    if (failure == null) {
+                        listener.onResponse(new Response(upgradePredicate.isCompleted(), upgradePredicate.getNode()));
+                        return;
+                    }
+                    if (upgradePredicate.isShouldCancel()) {
+                        // We want to return to the caller without leaving an unassigned persistent task, to match
+                        // what would have happened if the error had been detected in the "fast fail" validation
+                        cancelJobStart(persistentTask, failure, listener);
+                        return;
+                    }
+                    listener.onFailure(failure);
+                }
+
+                @Override
+                public void onFailure(Exception e) {
+                    listener.onFailure(e);
+                }
+
+                @Override
+                public void onTimeout(TimeValue timeout) {
+                    listener.onFailure(new ElasticsearchException(
+                        "snapshot upgrader request [{}] [{}] timed out after [{}]",
+                        params.getJobId(),
+                        params.getSnapshotId(),
+                        timeout));
+                }
+            };
+        persistentTasksService.waitForPersistentTaskCondition(taskId, upgradePredicate, request.getTimeout(), waitListener);
+    }
+
+    /**
+     * Requests removal of the persistent task and then fails {@code listener} with the original exception.
+     * The listener is failed with {@code exception} whether or not the removal succeeds; a removal failure is
+     * only logged.
+     *
+     * @param persistentTask the task to remove
+     * @param exception      the failure that is reported to the caller
+     * @param listener       the listener to fail once the removal request has completed
+     */
+    private void cancelJobStart(PersistentTask<SnapshotUpgradeTaskParams> persistentTask,
+                                Exception exception,
+                                ActionListener<Response> listener) {
+        final String taskId = persistentTask.getId();
+        persistentTasksService.sendRemoveRequest(
+            taskId,
+            ActionListener.wrap(
+                removedTask -> listener.onFailure(exception),
+                removalFailure -> {
+                    final SnapshotUpgradeTaskParams taskParams = persistentTask.getParams();
+                    final ParameterizedMessage message = new ParameterizedMessage(
+                        "[{}] [{}] Failed to cancel persistent task that could not be assigned due to {}",
+                        taskParams.getJobId(),
+                        taskParams.getSnapshotId(),
+                        exception.getMessage());
+                    logger.error(message, removalFailure);
+                    // The caller still gets the original failure, not the removal failure
+                    listener.onFailure(exception);
+                }));
+    }
+
+}

+ 5 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/AbstractNativeAnalyticsProcess.java

@@ -43,6 +43,11 @@ abstract class AbstractNativeAnalyticsProcess<Result> extends AbstractNativeProc
         // Nothing to persist
     }
 
+    @Override
+    public void persistState(long timestamp, String id, String description) {
+        // No-op: nothing to persist, so the timestamp, id and description are ignored
+    }
+
     @Override
     public void writeEndOfDataMessage() throws IOException {
         new AnalyticsControlMessageWriter(recordWriter(), numberOfFields()).writeEndOfData();

+ 4 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/dataframe/process/NativeAnalyticsProcess.java

@@ -53,6 +53,10 @@ public class NativeAnalyticsProcess extends AbstractNativeAnalyticsProcess<Analy
         // Nothing to persist
     }
 
+    @Override
+    public void persistState(long snapshotTimestamp, String snapshotId, String snapshotDescription) {
+        // Nothing to persist
+    }
+
     @Override
     public void writeEndOfDataMessage() throws IOException {
         new AnalyticsControlMessageWriter(recordWriter(), numberOfFields()).writeEndOfData();

+ 57 - 38
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/NodeLoadDetector.java

@@ -19,6 +19,7 @@ import org.elasticsearch.xpack.core.ml.action.StartDataFrameAnalyticsAction;
 import org.elasticsearch.xpack.core.ml.dataframe.DataFrameAnalyticsState;
 import org.elasticsearch.xpack.core.ml.job.config.JobState;
 import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradeTaskParams;
 import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
 import org.elasticsearch.xpack.ml.utils.NativeMemoryCalculator;
 
@@ -89,49 +90,21 @@ public class NodeLoadDetector {
                 MlTasks.JOB_TASK_NAME, task -> nodeLoad.getNodeId().equals(task.getExecutorNode()));
             for (PersistentTasksCustomMetadata.PersistentTask<?> assignedTask : assignedAnomalyDetectorTasks) {
                 JobState jobState = MlTasks.getJobStateModifiedForReassignments(assignedTask);
-                if (jobState.isAnyOf(JobState.CLOSED, JobState.FAILED) == false) {
-                    // Don't count CLOSED or FAILED jobs, as they don't consume native memory
-                    ++nodeLoad.numAssignedJobs;
-                    if (jobState == JobState.OPENING) {
-                        ++nodeLoad.numAllocatingJobs;
-                    }
-                    OpenJobAction.JobParams params = (OpenJobAction.JobParams) assignedTask.getParams();
-                    Long jobMemoryRequirement = mlMemoryTracker.getAnomalyDetectorJobMemoryRequirement(params.getJobId());
-                    if (jobMemoryRequirement == null) {
-                        nodeLoad.useMemory = false;
-                        logger.debug(() -> new ParameterizedMessage(
-                            "[{}] memory requirement was not available. Calculating load by number of assigned jobs.",
-                            params.getJobId()
-                        ));
-                    } else {
-                        nodeLoad.assignedJobMemory += jobMemoryRequirement;
-                    }
-                }
+                OpenJobAction.JobParams params = (OpenJobAction.JobParams) assignedTask.getParams();
+                nodeLoad.adjustForAnomalyJob(jobState, params == null ? null : params.getJobId(), mlMemoryTracker);
+            }
+            Collection<PersistentTasksCustomMetadata.PersistentTask<?>> assignedShapshotUpgraderTasks = persistentTasks.findTasks(
+                MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME, task -> nodeLoad.getNodeId().equals(task.getExecutorNode()));
+            for (PersistentTasksCustomMetadata.PersistentTask<?> assignedTask : assignedShapshotUpgraderTasks) {
+                SnapshotUpgradeTaskParams params = (SnapshotUpgradeTaskParams) assignedTask.getParams();
+                nodeLoad.adjustForAnomalyJob(JobState.OPENED, params == null ? null : params.getJobId(), mlMemoryTracker);
             }
+
             // find all the data frame analytics job tasks assigned to this node
             Collection<PersistentTasksCustomMetadata.PersistentTask<?>> assignedAnalyticsTasks = persistentTasks.findTasks(
                 MlTasks.DATA_FRAME_ANALYTICS_TASK_NAME, task -> nodeLoad.getNodeId().equals(task.getExecutorNode()));
             for (PersistentTasksCustomMetadata.PersistentTask<?> assignedTask : assignedAnalyticsTasks) {
-                DataFrameAnalyticsState dataFrameAnalyticsState = MlTasks.getDataFrameAnalyticsState(assignedTask);
-
-                // Don't count stopped and failed df-analytics tasks as they don't consume native memory
-                if (dataFrameAnalyticsState.isAnyOf(DataFrameAnalyticsState.STOPPED, DataFrameAnalyticsState.FAILED) == false) {
-                    // The native process is only running in the ANALYZING and STOPPING states, but in the STARTED
-                    // and REINDEXING states we're committed to using the memory soon, so account for it here
-                    ++nodeLoad.numAssignedJobs;
-                    StartDataFrameAnalyticsAction.TaskParams params =
-                        (StartDataFrameAnalyticsAction.TaskParams) assignedTask.getParams();
-                    Long jobMemoryRequirement = mlMemoryTracker.getDataFrameAnalyticsJobMemoryRequirement(params.getId());
-                    if (jobMemoryRequirement == null) {
-                        nodeLoad.useMemory = false;
-                        logger.debug(() -> new ParameterizedMessage(
-                            "[{}] memory requirement was not available. Calculating load by number of assigned jobs.",
-                            params.getId()
-                        ));
-                    } else {
-                        nodeLoad.assignedJobMemory += jobMemoryRequirement;
-                    }
-                }
+                nodeLoad.adjustForAnalyticsJob(assignedTask, mlMemoryTracker);
             }
             // if any jobs are running then the native code will be loaded, but shared between all jobs,
             // so increase the total memory usage of the assigned jobs to account for this
@@ -158,6 +131,52 @@ public class NodeLoadDetector {
             this.useMemory = useMemory;
         }
 
+        private void adjustForAnomalyJob(JobState jobState,
+                                         String jobId,
+                                         MlMemoryTracker mlMemoryTracker) {
+            if ((jobState.isAnyOf(JobState.CLOSED, JobState.FAILED) == false) && jobId != null) {
+                // Don't count CLOSED or FAILED jobs, as they don't consume native memory
+                ++numAssignedJobs;
+                if (jobState == JobState.OPENING) {
+                    ++numAllocatingJobs;
+                }
+                Long jobMemoryRequirement = mlMemoryTracker.getAnomalyDetectorJobMemoryRequirement(jobId);
+                if (jobMemoryRequirement == null) {
+                    useMemory = false;
+                    logger.debug(() -> new ParameterizedMessage(
+                        "[{}] memory requirement was not available. Calculating load by number of assigned jobs.",
+                        jobId
+                    ));
+                } else {
+                    assignedJobMemory += jobMemoryRequirement;
+                }
+            }
+        }
+
+        private void adjustForAnalyticsJob(PersistentTasksCustomMetadata.PersistentTask<?> assignedTask,
+                                           MlMemoryTracker mlMemoryTracker) {
+            DataFrameAnalyticsState dataFrameAnalyticsState = MlTasks.getDataFrameAnalyticsState(assignedTask);
+
+            // Don't count stopped and failed df-analytics tasks as they don't consume native memory
+            if (dataFrameAnalyticsState.isAnyOf(DataFrameAnalyticsState.STOPPED, DataFrameAnalyticsState.FAILED) == false) {
+                // The native process is only running in the ANALYZING and STOPPING states, but in the STARTED
+                // and REINDEXING states we're committed to using the memory soon, so account for it here
+                ++numAssignedJobs;
+                StartDataFrameAnalyticsAction.TaskParams params =
+                    (StartDataFrameAnalyticsAction.TaskParams) assignedTask.getParams();
+                Long jobMemoryRequirement = mlMemoryTracker.getDataFrameAnalyticsJobMemoryRequirement(params.getId());
+                if (jobMemoryRequirement == null) {
+                    useMemory = false;
+                    logger.debug(() -> new ParameterizedMessage(
+                        "[{}] memory requirement was not available. Calculating load by number of assigned jobs.",
+                        params.getId()
+                    ));
+                } else {
+                    assignedJobMemory += jobMemoryRequirement;
+                }
+            }
+        }
+
         /**
          * @return The total number of assigned jobs
          */

+ 65 - 62
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobResultsProvider.java

@@ -560,23 +560,22 @@ public class JobResultsProvider {
                 .unmappedType("double").order(SortOrder.DESC));
     }
 
-    public void getAutodetectParams(Job job, Consumer<AutodetectParams> consumer, Consumer<Exception> errorHandler) {
-
+    public void getAutodetectParams(Job job, String snapshotId, Consumer<AutodetectParams> consumer, Consumer<Exception> errorHandler)  {
         String jobId = job.getId();
 
         ActionListener<AutodetectParams.Builder> getScheduledEventsListener = ActionListener.wrap(
-                paramsBuilder -> {
-                    ScheduledEventsQueryBuilder scheduledEventsQueryBuilder = new ScheduledEventsQueryBuilder();
-                    scheduledEventsQueryBuilder.start(job.earliestValidTimestamp(paramsBuilder.getDataCounts()));
-                    scheduledEventsForJob(jobId, job.getGroups(), scheduledEventsQueryBuilder, ActionListener.wrap(
-                            events -> {
-                                paramsBuilder.setScheduledEvents(events.results());
-                                consumer.accept(paramsBuilder.build());
-                            },
-                            errorHandler
-                    ));
-                },
-                errorHandler
+            paramsBuilder -> {
+                ScheduledEventsQueryBuilder scheduledEventsQueryBuilder = new ScheduledEventsQueryBuilder();
+                scheduledEventsQueryBuilder.start(job.earliestValidTimestamp(paramsBuilder.getDataCounts()));
+                scheduledEventsForJob(jobId, job.getGroups(), scheduledEventsQueryBuilder, ActionListener.wrap(
+                    events -> {
+                        paramsBuilder.setScheduledEvents(events.results());
+                        consumer.accept(paramsBuilder.build());
+                    },
+                    errorHandler
+                ));
+            },
+            errorHandler
         );
 
         AutodetectParams.Builder paramsBuilder = new AutodetectParams.Builder(job.getId());
@@ -584,61 +583,65 @@ public class JobResultsProvider {
         String stateIndex = AnomalyDetectorsIndex.jobStateIndexPattern();
 
         MultiSearchRequestBuilder msearch = client.prepareMultiSearch()
-                .add(createLatestDataCountsSearch(resultsIndex, jobId))
-                .add(createLatestModelSizeStatsSearch(resultsIndex))
-                .add(createLatestTimingStatsSearch(resultsIndex, jobId))
-                // These next two document IDs never need to be the legacy ones due to the rule
-                // that you cannot open a 5.4 job in a subsequent version of the product
-                .add(createDocIdSearch(resultsIndex, ModelSnapshot.documentId(jobId, job.getModelSnapshotId())))
-                .add(createDocIdSearch(stateIndex, Quantiles.documentId(jobId)));
+            .add(createLatestDataCountsSearch(resultsIndex, jobId))
+            .add(createLatestModelSizeStatsSearch(resultsIndex))
+            .add(createLatestTimingStatsSearch(resultsIndex, jobId))
+            // These next two document IDs never need to be the legacy ones due to the rule
+            // that you cannot open a 5.4 job in a subsequent version of the product
+            .add(createDocIdSearch(resultsIndex, ModelSnapshot.documentId(jobId, snapshotId)))
+            .add(createDocIdSearch(stateIndex, Quantiles.documentId(jobId)));
 
         for (String filterId : job.getAnalysisConfig().extractReferencedFilters()) {
             msearch.add(createDocIdSearch(MlMetaIndex.indexName(), MlFilter.documentId(filterId)));
         }
 
         executeAsyncWithOrigin(client.threadPool().getThreadContext(), ML_ORIGIN, msearch.request(),
-                ActionListener.<MultiSearchResponse>wrap(
-                        response -> {
-                            for (int i = 0; i < response.getResponses().length; i++) {
-                                MultiSearchResponse.Item itemResponse = response.getResponses()[i];
-                                if (itemResponse.isFailure()) {
-                                    errorHandler.accept(itemResponse.getFailure());
-                                    return;
-                                }
-                                SearchResponse searchResponse = itemResponse.getResponse();
-                                ShardSearchFailure[] shardFailures = searchResponse.getShardFailures();
-                                int unavailableShards = searchResponse.getTotalShards() - searchResponse.getSuccessfulShards();
-                                if (CollectionUtils.isEmpty(shardFailures) == false) {
-                                    LOGGER.error("[{}] Search request returned shard failures: {}", jobId,
-                                        Arrays.toString(shardFailures));
-                                    errorHandler.accept(new ElasticsearchException(
-                                        ExceptionsHelper.shardFailuresToErrorMsg(jobId, shardFailures)));
-                                    return;
-                                }
-                                if (unavailableShards > 0) {
-                                    errorHandler.accept(new ElasticsearchException("[" + jobId
-                                        + "] Search request encountered [" + unavailableShards + "] unavailable shards"));
-                                    return;
-                                }
-                                SearchHits hits = searchResponse.getHits();
-                                long hitsCount = hits.getHits().length;
-                                if (hitsCount == 0) {
-                                    SearchRequest searchRequest = msearch.request().requests().get(i);
-                                    LOGGER.debug("Found 0 hits for [{}]", new Object[]{searchRequest.indices()});
-                                }
-                                for (SearchHit hit : hits) {
-                                    try {
-                                        parseAutodetectParamSearchHit(jobId, paramsBuilder, hit);
-                                    } catch (Exception e) {
-                                        errorHandler.accept(e);
-                                        return;
-                                    }
-                                }
+            ActionListener.<MultiSearchResponse>wrap(
+                response -> {
+                    for (int i = 0; i < response.getResponses().length; i++) {
+                        MultiSearchResponse.Item itemResponse = response.getResponses()[i];
+                        if (itemResponse.isFailure()) {
+                            errorHandler.accept(itemResponse.getFailure());
+                            return;
+                        }
+                        SearchResponse searchResponse = itemResponse.getResponse();
+                        ShardSearchFailure[] shardFailures = searchResponse.getShardFailures();
+                        int unavailableShards = searchResponse.getTotalShards() - searchResponse.getSuccessfulShards();
+                        if (CollectionUtils.isEmpty(shardFailures) == false) {
+                            LOGGER.error("[{}] Search request returned shard failures: {}", jobId,
+                                Arrays.toString(shardFailures));
+                            errorHandler.accept(new ElasticsearchException(
+                                ExceptionsHelper.shardFailuresToErrorMsg(jobId, shardFailures)));
+                            return;
+                        }
+                        if (unavailableShards > 0) {
+                            errorHandler.accept(new ElasticsearchException("[" + jobId
+                                + "] Search request encountered [" + unavailableShards + "] unavailable shards"));
+                            return;
+                        }
+                        SearchHits hits = searchResponse.getHits();
+                        long hitsCount = hits.getHits().length;
+                        if (hitsCount == 0) {
+                            SearchRequest searchRequest = msearch.request().requests().get(i);
+                            LOGGER.debug("Found 0 hits for [{}]", new Object[]{searchRequest.indices()});
+                        }
+                        for (SearchHit hit : hits) {
+                            try {
+                                parseAutodetectParamSearchHit(jobId, paramsBuilder, hit);
+                            } catch (Exception e) {
+                                errorHandler.accept(e);
+                                return;
                             }
-                            getScheduledEventsListener.onResponse(paramsBuilder);
-                        },
-                        errorHandler
-                ), client::multiSearch);
+                        }
+                    }
+                    getScheduledEventsListener.onResponse(paramsBuilder);
+                },
+                errorHandler
+            ), client::multiSearch);
+    }
+
+    /**
+     * Convenience overload that gathers the autodetect parameters for the model snapshot
+     * currently referenced by the job, i.e. {@code job.getModelSnapshotId()}.
+     *
+     * @param job          the job whose parameters are gathered
+     * @param consumer     handed to the four-argument overload as the result consumer
+     * @param errorHandler handed to the four-argument overload as the failure handler
+     */
+    public void getAutodetectParams(Job job, Consumer<AutodetectParams> consumer, Consumer<Exception> errorHandler) {
+        getAutodetectParams(job, job.getModelSnapshotId(), consumer, errorHandler);
     }
 
     private SearchRequestBuilder createDocIdSearch(String index, String id) {

+ 9 - 1
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectProcessFactory.java

@@ -26,7 +26,15 @@ public interface AutodetectProcessFactory {
      * @param onProcessCrash    Callback to execute if the process stops unexpectedly
      * @return The process
      */
-    AutodetectProcess createAutodetectProcess(Job job,
+    default AutodetectProcess createAutodetectProcess(Job job,
+                                                      AutodetectParams autodetectParams,
+                                                      ExecutorService executorService,
+                                                      Consumer<String> onProcessCrash) {
+        return createAutodetectProcess(job.getId(), job, autodetectParams, executorService, onProcessCrash);
+    }
+
+    /**
+     * Create an {@link AutodetectProcess} for the given job with an explicitly supplied pipeline id.
+     * The overload without {@code pipelineId} delegates here, passing the job's id.
+     *
+     * @param pipelineId        Identifier supplied separately from the job
+     *                          (NOTE(review): presumably used to name the process pipes; confirm against implementations)
+     * @param job               The job the process is created for
+     * @param autodetectParams  The parameters handed to the process
+     * @param executorService   Executor service made available to the implementation
+     * @param onProcessCrash    Callback to execute if the process stops unexpectedly
+     * @return The process
+     */
+    AutodetectProcess createAutodetectProcess(String pipelineId,
+                                              Job job,
                                               AutodetectParams autodetectParams,
                                               ExecutorService executorService,
                                               Consumer<String> onProcessCrash);

+ 102 - 25
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectProcessManager.java

@@ -30,7 +30,6 @@ import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.core.internal.io.IOUtils;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.indices.InvalidAliasNameException;
-import org.elasticsearch.persistent.PersistentTasksCustomMetadata.PersistentTask;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.xpack.core.action.util.PageParams;
@@ -50,6 +49,8 @@ import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.DataCounts;
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSizeStats;
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
 import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.TimingStats;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeState;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
 import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
 import org.elasticsearch.xpack.ml.MachineLearning;
 import org.elasticsearch.xpack.ml.annotations.AnnotationPersister;
@@ -70,6 +71,7 @@ import org.elasticsearch.xpack.ml.job.process.normalizer.NormalizerFactory;
 import org.elasticsearch.xpack.ml.job.process.normalizer.Renormalizer;
 import org.elasticsearch.xpack.ml.job.process.normalizer.ScoresUpdater;
 import org.elasticsearch.xpack.ml.job.process.normalizer.ShortCircuitingRenormalizer;
+import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradeTask;
 import org.elasticsearch.xpack.ml.job.task.JobTask;
 import org.elasticsearch.xpack.ml.notifications.AnomalyDetectionAuditor;
 import org.elasticsearch.xpack.ml.process.NativeStorageProvider;
@@ -90,6 +92,7 @@ import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.ExecutorService;
 import java.util.function.BiConsumer;
 import java.util.function.Consumer;
+import java.util.function.Function;
 
 import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN;
 import static org.elasticsearch.xpack.core.ClientHelper.executeAsyncWithOrigin;
@@ -120,6 +123,7 @@ public class AutodetectProcessManager implements ClusterStateListener {
     private final AnomalyDetectionAuditor auditor;
 
     private volatile boolean upgradeInProgress;
+    private volatile boolean nodeDying;
 
     public AutodetectProcessManager(Settings settings, Client client, ThreadPool threadPool,
                                     NamedXContentRegistry xContentRegistry, AnomalyDetectionAuditor auditor, ClusterService clusterService,
@@ -151,6 +155,8 @@ public class AutodetectProcessManager implements ClusterStateListener {
     }
 
     public synchronized void closeAllJobsOnThisNode(String reason) {
+        // Note, snapshot upgrader processes could still be running, but those are short lived
+        // Leaving them running is OK.
         int numJobs = processByAllocation.size();
         if (numJobs != 0) {
             logger.info("Closing [{}] jobs, because [{}]", numJobs, reason);
@@ -186,14 +192,19 @@ public class AutodetectProcessManager implements ClusterStateListener {
     }
 
+    /**
+     * Marks this node as dying and kills every process tracked in {@code processByAllocation}.
+     * Each kill is issued without awaiting completion, without finishing and silently, and the
+     * corresponding entry is removed from the map as it is processed.
+     */
     public void killAllProcessesOnThisNode() {
+        // Snapshot upgrade tasks are not tracked in `processByAllocation`.
+        // They are short lived, but once they are marked as "started" they cannot be restarted as the snapshot could be corrupted.
+        // Consequently, just let them die with the node. But try not to move forward with saving the upgraded state if the node
+        // is dying.
+        nodeDying = true;
         Iterator<ProcessContext> iterator = processByAllocation.values().iterator();
         while (iterator.hasNext()) {
             ProcessContext processContext = iterator.next();
             processContext.newKillBuilder()
-                    .setAwaitCompletion(false)
-                    .setFinish(false)
-                    .setSilent(true)
-                    .kill();
+                .setAwaitCompletion(false)
+                .setFinish(false)
+                .setSilent(true)
+                .kill();
             iterator.remove();
         }
     }
@@ -376,6 +387,67 @@ public class AutodetectProcessManager implements ClusterStateListener {
         }
     }
 
+    /**
+     * Starts upgrading a model snapshot of an anomaly detection job.
+     *
+     * The job configuration is loaded first, then the autodetect parameters for the requested
+     * snapshot are gathered, and finally the actual upgrade is forked onto the ML utility thread
+     * pool. If gathering the parameters fails, the persistent task state is set to FAILED before
+     * the failure is reported.
+     *
+     * @param task         the upgrade task; supplies the job id, snapshot id and allocation id
+     * @param closeHandler receives the failure when a step fails, or {@code null} when the upgrade
+     *                     is aborted because the node is dying; it is also handed on to the upgrade run
+     */
+    public void upgradeSnapshot(SnapshotUpgradeTask task, Consumer<Exception> closeHandler) {
+        final String jobId = task.getJobId();
+        final String snapshotId = task.getSnapshotId();
+        final Function<String, SnapshotUpgradeTaskState> failureBuilder =
+            (reason) -> new SnapshotUpgradeTaskState(SnapshotUpgradeState.FAILED, task.getAllocationId(), reason);
+        // Load the job configuration; everything else hangs off this callback
+        jobManager.getJob(jobId, ActionListener.wrap(
+            job -> {
+                if (job.getJobVersion() == null) {
+                    closeHandler.accept(ExceptionsHelper.badRequestException("Cannot open job [" + jobId
+                        + "] because jobs created prior to version 5.5 are not supported"));
+                    return;
+                }
+                jobResultsProvider.getAutodetectParams(job, snapshotId, params -> {
+                    if (params.modelSnapshot() == null) {
+                        // Argument order must follow the placeholders: snapshot first, then job
+                        closeHandler.accept(new ElasticsearchStatusException(
+                            "cannot find snapshot [{}] for job [{}] to upgrade",
+                            RestStatus.NOT_FOUND,
+                            snapshotId,
+                            jobId));
+                        return;
+                    }
+                    // We need to fork, otherwise we restore model state from a network thread (several GET api calls):
+                    threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME).execute(new AbstractRunnable() {
+                        @Override
+                        public void onFailure(Exception e) {
+                            closeHandler.accept(e);
+                        }
+
+                        @Override
+                        protected void doRun() {
+                            if (nodeDying) {
+                                logger.info(() -> new ParameterizedMessage(
+                                    "Aborted upgrading snapshot [{}] for job [{}] as node is dying",
+                                    snapshotId,
+                                    jobId));
+                                closeHandler.accept(null);
+                                return;
+                            }
+                            runSnapshotUpgrade(task, job, params, closeHandler);
+                        }
+                    });
+                }, e1 -> {
+                    logger.warn(() -> new ParameterizedMessage(
+                            "[{}] [{}] Failed to gather information required to upgrade snapshot job",
+                            jobId,
+                            snapshotId),
+                        e1);
+                    // Report the original failure (e1) whether or not the state update succeeds
+                    task.updatePersistentTaskState(failureBuilder.apply(e1.getMessage()), ActionListener.wrap(
+                        t -> closeHandler.accept(e1),
+                        e2 -> {
+                            logger.warn(() -> new ParameterizedMessage("[{}] [{}] failed to set task to failed", jobId, snapshotId), e2);
+                            closeHandler.accept(e1);
+                        }
+                    ));
+                });
+            },
+            closeHandler
+        ));
+    }
+
     public void openJob(JobTask jobTask, ClusterState clusterState, BiConsumer<Exception, Boolean> closeHandler) {
         String jobId = jobTask.getJobId();
         logger.info("Opening job [{}]", jobId);
@@ -473,6 +545,20 @@ public class AutodetectProcessManager implements ClusterStateListener {
         AnnotationIndex.createAnnotationsIndexIfNecessary(client, clusterState, annotationsIndexUpdateHandler);
     }
 
+    /**
+     * Builds a {@link JobModelSnapshotUpgrader} for the given task and starts it. The upgrader
+     * is also given a check that holds only while this node has not been marked as dying.
+     */
+    private void runSnapshotUpgrade(SnapshotUpgradeTask task, Job job, AutodetectParams params, Consumer<Exception> handler) {
+        new JobModelSnapshotUpgrader(
+            task,
+            job,
+            params,
+            threadPool,
+            autodetectProcessFactory,
+            jobResultsPersister,
+            client,
+            nativeStorageProvider,
+            handler,
+            () -> nodeDying == false
+        ).start();
+    }
+
     private boolean createProcessAndSetRunning(ProcessContext processContext,
                                                Job job,
                                                AutodetectParams params,
@@ -715,17 +801,12 @@ public class AutodetectProcessManager implements ClusterStateListener {
 
+    /**
+     * Updates the persistent task state of the job task to the given state and reason.
+     * The outcome of the update is only logged: at info level on success, at error level on failure.
+     */
     void setJobState(JobTask jobTask, JobState state, String reason) {
         JobTaskState jobTaskState = new JobTaskState(state, jobTask.getAllocationId(), reason);
-        jobTask.updatePersistentTaskState(jobTaskState, new ActionListener<>() {
-            @Override
-            public void onResponse(PersistentTask<?> persistentTask) {
-                logger.info("Successfully set job state to [{}] for job [{}]", state, jobTask.getJobId());
-            }
-
-            @Override
-            public void onFailure(Exception e) {
-                logger.error("Could not set job state to [" + state + "] for job [" + jobTask.getJobId() + "]", e);
-            }
-        });
+        jobTask.updatePersistentTaskState(jobTaskState, ActionListener.wrap(
+            persistentTask -> logger.info("Successfully set job state to [{}] for job [{}]", state, jobTask.getJobId()),
+            e -> logger.error(
+                () -> new ParameterizedMessage("Could not set job state to [{}] for job [{}]", state, jobTask.getJobId()),
+                e)
+        ));
     }
 
     void setJobState(JobTask jobTask, JobState state) {
@@ -734,25 +815,21 @@ public class AutodetectProcessManager implements ClusterStateListener {
 
+    /**
+     * Updates the persistent task state of the job task and reports the outcome to {@code handler}:
+     * {@code null} on success, the exception on failure. An {@link IOException} thrown by the
+     * handler itself is logged as a warning and not propagated.
+     */
     void setJobState(JobTask jobTask, JobState state, String reason, CheckedConsumer<Exception, IOException> handler) {
         JobTaskState jobTaskState = new JobTaskState(state, jobTask.getAllocationId(), reason);
-        jobTask.updatePersistentTaskState(jobTaskState, new ActionListener<>() {
-            @Override
-            public void onResponse(PersistentTask<?> persistentTask) {
+        // NOTE(review): ActionListener.wrap is understood to route exceptions thrown by the response
+        // lambda into the failure lambda, so an unchecked exception from handler.accept(null) could
+        // lead to handler being invoked a second time. Confirm the handlers tolerate this.
+        jobTask.updatePersistentTaskState(jobTaskState, ActionListener.wrap(
+            persistentTask -> {
                 try {
                     handler.accept(null);
                 } catch (IOException e1) {
                     logger.warn("Error while delegating response", e1);
                 }
-            }
-
-            @Override
-            public void onFailure(Exception e) {
+            },
+            e -> {
                 try {
                     handler.accept(e);
                 } catch (IOException e1) {
                     logger.warn("Error while delegating exception [" + e.getMessage() + "]", e1);
                 }
-            }
-        });
+            }));
     }
 
     public Optional<Tuple<DataCounts, Tuple<ModelSizeStats, TimingStats>>> getStatistics(JobTask jobTask) {

+ 4 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/BlackHoleAutodetectProcess.java

@@ -118,6 +118,10 @@ public class BlackHoleAutodetectProcess implements AutodetectProcess {
     public void persistState() {
     }
 
+    @Override
+    public void persistState(long snapshotTimestamp, String snapshotId, String snapshotDescription) {
+        // No-op: this implementation does nothing when asked to persist state
+    }
+
     @Override
     public void flushStream() {
     }

+ 307 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/JobModelSnapshotUpgrader.java

@@ -0,0 +1,307 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.ml.job.process.autodetect;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.logging.log4j.message.ParameterizedMessage;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.ElasticsearchStatusException;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.common.CheckedSupplier;
+import org.elasticsearch.common.util.concurrent.AbstractRunnable;
+import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
+import org.elasticsearch.common.util.concurrent.ThreadContext;
+import org.elasticsearch.core.internal.io.IOUtils;
+import org.elasticsearch.persistent.PersistentTasksCustomMetadata.PersistentTask;
+import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.xpack.core.ml.job.config.AnalysisConfig;
+import org.elasticsearch.xpack.core.ml.job.config.Job;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeState;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
+import org.elasticsearch.xpack.ml.job.persistence.StateStreamer;
+import org.elasticsearch.xpack.ml.job.process.autodetect.output.JobSnapshotUpgraderResultProcessor;
+import org.elasticsearch.xpack.ml.job.process.autodetect.params.AutodetectParams;
+import org.elasticsearch.xpack.ml.job.snapshot.upgrader.SnapshotUpgradeTask;
+import org.elasticsearch.xpack.ml.process.NativeStorageProvider;
+import org.elasticsearch.xpack.ml.process.writer.LengthEncodedWriter;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeoutException;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
+
+import static org.elasticsearch.xpack.ml.MachineLearning.UTILITY_THREAD_POOL_NAME;
+
+public final class JobModelSnapshotUpgrader {
+
+    private static final Logger logger = LogManager.getLogger(JobModelSnapshotUpgrader.class);
+
+    private final SnapshotUpgradeTask task;
+    private final Job job;
+    private final String jobId;
+    private final String snapshotId;
+    private final AutodetectParams params;
+    private final Client client;
+    private final Consumer<Exception> onFinish;
+    private final Supplier<Boolean> continueRunning;
+    private final ThreadPool threadPool;
+    private final AutodetectProcessFactory autodetectProcessFactory;
+    private final JobResultsPersister jobResultsPersister;
+    private final NativeStorageProvider nativeStorageProvider;
+
+    /**
+     * Creates an upgrader for the snapshot referenced by {@code task}.
+     * <p>
+     * Every argument is required; passing {@code null} for any of them raises a
+     * {@link NullPointerException}. The job id and snapshot id are taken from {@code task}.
+     */
+    JobModelSnapshotUpgrader(SnapshotUpgradeTask task,
+                             Job job,
+                             AutodetectParams params,
+                             ThreadPool threadPool,
+                             AutodetectProcessFactory autodetectProcessFactory,
+                             JobResultsPersister jobResultsPersister,
+                             Client client,
+                             NativeStorageProvider nativeStorageProvider,
+                             Consumer<Exception> onFinish,
+                             Supplier<Boolean> continueRunning) {
+        // identity of the upgrade
+        this.task = Objects.requireNonNull(task);
+        this.jobId = task.getJobId();
+        this.snapshotId = task.getSnapshotId();
+        this.job = Objects.requireNonNull(job);
+        this.params = Objects.requireNonNull(params);
+        // collaborators, in parameter order
+        this.threadPool = Objects.requireNonNull(threadPool);
+        this.autodetectProcessFactory = Objects.requireNonNull(autodetectProcessFactory);
+        this.jobResultsPersister = Objects.requireNonNull(jobResultsPersister);
+        this.client = Objects.requireNonNull(client);
+        this.nativeStorageProvider = Objects.requireNonNull(nativeStorageProvider);
+        // callbacks
+        this.onFinish = Objects.requireNonNull(onFinish);
+        this.continueRunning = Objects.requireNonNull(continueRunning);
+    }
+
+    /**
+     * Creates the autodetect process for this snapshot (pipeline id {@code jobId-snapshotId}), starts the
+     * worker executor and the result processor on the job comms thread pool, and begins the upgrade.
+     * <p>
+     * If the process crashes, the crash callback marks the task as failed and removes the task's local
+     * temporary storage. If the thread pool rejects the work, the process is closed and {@code onFinish}
+     * receives the rejection. If the task should no longer run, everything that was started is shut down
+     * again and {@code onFinish} receives {@code null}.
+     */
+    void start() {
+        // A TP with no queue, so that we fail immediately if there are no threads available
+        ExecutorService autodetectExecutorService = threadPool.executor(MachineLearning.JOB_COMMS_THREAD_POOL_NAME);
+
+        AutodetectProcess process = autodetectProcessFactory.createAutodetectProcess(jobId + "-" + snapshotId,
+            job,
+            params,
+            autodetectExecutorService,
+            (reason) -> {
+                setTaskToFailed(reason, ActionListener.wrap(t -> {}, f -> {}));
+                try {
+                    nativeStorageProvider.cleanupLocalTmpStorage(task.getDescription());
+                } catch (IOException e) {
+                    logger.error(
+                        new ParameterizedMessage("[{}] [{}] failed to delete temporary files snapshot upgrade", jobId, snapshotId),
+                        e);
+                }
+            });
+        JobSnapshotUpgraderResultProcessor processor = new JobSnapshotUpgraderResultProcessor(
+            jobId,
+            snapshotId,
+            jobResultsPersister,
+            process);
+        AutodetectWorkerExecutorService autodetectWorkerExecutor;
+        try (ThreadContext.StoredContext ignore = threadPool.getThreadContext().stashContext()) {
+            autodetectWorkerExecutor = new AutodetectWorkerExecutorService(threadPool.getThreadContext());
+            autodetectExecutorService.submit(autodetectWorkerExecutor::start);
+            autodetectExecutorService.submit(processor::process);
+        } catch (EsRejectedExecutionException e) {
+            // If submitting the operation to read the results from the process fails we need to close
+            // the process too, so that other submitted operations to threadpool are stopped.
+            try {
+                IOUtils.close(process);
+            } catch (IOException ioe) {
+                logger.error("Can't close autodetect", ioe);
+            }
+            onFinish.accept(e);
+            return;
+        }
+
+        StateStreamer stateStreamer = new StateStreamer(client);
+        Executor executor = new Executor(stateStreamer, processor, autodetectWorkerExecutor, process);
+        if (continueRunning.get() == false) {
+            // The native process, the worker executor and the result reader are already running here.
+            // Use the regular shutdown path (which stops them and then calls onFinish) rather than
+            // returning and leaving them behind; restoreState handles the same condition this way.
+            executor.shutdown(null);
+            return;
+        }
+        executor.execute();
+    }
+
+    /**
+     * Moves the persistent task into the {@code FAILED} state with the given reason.
+     *
+     * @param reason   the reason stored in the task state
+     * @param listener receives the updated task on success; on failure the error is logged at warn
+     *                 level and then passed on to this listener
+     */
+    void setTaskToFailed(String reason, ActionListener<PersistentTask<?>> listener) {
+        final SnapshotUpgradeTaskState failedState =
+            new SnapshotUpgradeTaskState(SnapshotUpgradeState.FAILED, task.getAllocationId(), reason);
+        final ActionListener<PersistentTask<?>> loggingListener = ActionListener.wrap(
+            listener::onResponse,
+            updateFailure -> {
+                logger.warn(
+                    () -> new ParameterizedMessage("[{}] [{}] failed to set task to failed", task.getJobId(), task.getSnapshotId()),
+                    updateFailure);
+                listener.onFailure(updateFailure);
+            });
+        task.updatePersistentTaskState(failedState, loggingListener);
+    }
+
+    private class Executor {
+
+        private final StateStreamer stateStreamer;
+        private final JobSnapshotUpgraderResultProcessor processor;
+        private final ExecutorService autodetectWorkerExecutor;
+        private final AutodetectProcess process;
+
+        /**
+         * Stores the collaborators used to drive one snapshot upgrade; no work is started here.
+         *
+         * @param stateStreamer            passed to the process when restoring state, cancelled on shutdown
+         * @param processor                result processor reading the process output
+         * @param autodetectWorkerExecutor executor on which operations and the shutdown task are run
+         * @param process                  the autodetect process being driven
+         */
+        Executor(StateStreamer stateStreamer,
+                 JobSnapshotUpgraderResultProcessor processor,
+                 ExecutorService autodetectWorkerExecutor,
+                 AutodetectProcess process) {
+            this.stateStreamer = stateStreamer;
+            this.processor = processor;
+            this.autodetectWorkerExecutor = autodetectWorkerExecutor;
+            this.process = process;
+        }
+
+        /** Entry point of the upgrade: everything is driven from {@link #restoreState()}. */
+        void execute() {
+            restoreState();
+        }
+
+        /**
+         * Maps every field name to its position in a record: the time field is at position 0, the
+         * job's analysis fields follow in iteration order (the ML category field is skipped), and the
+         * control field takes the position after the last analysis field.
+         */
+        protected final Map<String, Integer> outputFieldIndexes() {
+            final Map<String, Integer> positions = new HashMap<>();
+            positions.put(job.getDataDescription().getTimeField(), 0);
+            int next = 1;
+            for (String analysisField : job.getAnalysisConfig().analysisFields()) {
+                if (AnalysisConfig.ML_CATEGORY_FIELD.equals(analysisField)) {
+                    // the category field gets no position
+                    continue;
+                }
+                positions.put(analysisField, next);
+                next++;
+            }
+            positions.put(LengthEncodedWriter.CONTROL_FIELD_NAME, next);
+            return positions;
+        }
+
+        /**
+         * Writes the header record to the process: one slot per entry of {@link #outputFieldIndexes()}
+         * (time field, analysis input fields, control field), each field name placed at its position.
+         *
+         * @throws IOException if writing the record to the process fails
+         */
+        void writeHeader() throws IOException {
+            final Map<String, Integer> positions = outputFieldIndexes();
+            final String[] header = new String[positions.size()];
+            positions.forEach((fieldName, position) -> header[position] = fieldName);
+            process.writeRecord(header);
+        }
+
+        /**
+         * Loads the old snapshot into the process and then asks the process to persist it again.
+         * <p>
+         * If restoring fails, the task is marked as failed and the upgrade is shut down with that
+         * exception. Otherwise the task state is moved to {@code SAVING_NEW_STATE}; once that update
+         * succeeds the header and the persist request are written on the worker executor, and the
+         * upgrade is shut down with the outcome of that operation. If the state update fails, the
+         * upgrade is shut down with an {@link ElasticsearchStatusException} wrapping the failure.
+         */
+        void restoreState() {
+            try {
+                process.restoreState(stateStreamer, params.modelSnapshot());
+            } catch (Exception e) {
+                logger.error(() -> new ParameterizedMessage("[{}] [{}] failed to write old state", jobId, snapshotId), e);
+                // Shut down with the original exception whether or not the task state update succeeds
+                setTaskToFailed("Failed to write old state due to: " + e.getMessage(),
+                    ActionListener.wrap(t -> shutdown(e), f -> shutdown(e)));
+                return;
+            }
+            task.updatePersistentTaskState(
+                new SnapshotUpgradeTaskState(SnapshotUpgradeState.SAVING_NEW_STATE, task.getAllocationId(), ""),
+                ActionListener.wrap(
+                    savingNewState -> {
+                        if (continueRunning.get() == false) {
+                            shutdown(null);
+                            return;
+                        }
+                        submitOperation(() -> {
+                            writeHeader();
+                            process.persistState(
+                                params.modelSnapshot().getTimestamp().getTime(),
+                                params.modelSnapshot().getSnapshotId(),
+                                params.modelSnapshot().getDescription());
+                            return null;
+                            // Execute callback in the UTILITY thread pool, as the current thread in the callback will be one in the
+                            // autodetectWorkerExecutor. Trying to run the callback in that executor will cause a dead lock as that
+                            // executor has a single processing queue.
+                        }, (aVoid, e) -> threadPool.executor(UTILITY_THREAD_POOL_NAME).execute(() -> shutdown(e)));
+                        // Include the job and snapshot ids like every other log statement in this class
+                        logger.info("[{}] [{}] asked for state to be persisted", jobId, snapshotId);
+                    },
+                    f -> {
+                        // The update that failed is the one to SAVING_NEW_STATE, so name that state
+                        logger.error(
+                            () -> new ParameterizedMessage(
+                                "[{}] [{}] failed to update snapshot upgrader task state to [{}]",
+                                jobId,
+                                snapshotId,
+                                SnapshotUpgradeState.SAVING_NEW_STATE),
+                            f);
+                        shutdown(new ElasticsearchStatusException(
+                            "Failed to start snapshot upgrade [{}] for job [{}]",
+                            RestStatus.INTERNAL_SERVER_ERROR,
+                            f,
+                            snapshotId,
+                            jobId));
+                    }
+                ));
+        }
+
+        /**
+         * Runs {@code operation} on the worker executor and reports the outcome to {@code handler}.
+         * <p>
+         * The handler receives the operation's result and a {@code null} exception on success. If the
+         * task should no longer run, it receives a conflict status exception instead; any other
+         * failure is logged at error level and passed on unchanged.
+         */
+        private <T> void submitOperation(CheckedSupplier<T, Exception> operation, BiConsumer<T, Exception> handler) {
+            autodetectWorkerExecutor.execute(new AbstractRunnable() {
+                @Override
+                public void onFailure(Exception failure) {
+                    if (continueRunning.get()) {
+                        logger.error(new ParameterizedMessage("[{}] Unexpected exception writing to process", job.getId()), failure);
+                        handler.accept(null, failure);
+                        return;
+                    }
+                    handler.accept(null, ExceptionsHelper.conflictStatusException(
+                        "[{}] Could not submit operation to process as it has been killed", job.getId()));
+                }
+
+                @Override
+                protected void doRun() throws Exception {
+                    if (continueRunning.get()) {
+                        checkProcessIsAlive();
+                        handler.accept(operation.get(), null);
+                        return;
+                    }
+                    handler.accept(null, ExceptionsHelper.conflictStatusException(
+                        "[{}] Could not submit operation to process as it has been killed", job.getId()));
+                }
+            });
+        }
+
+        /**
+         * Guard used before running an operation against the process.
+         *
+         * @throws ElasticsearchException carrying the process's error output if the process is not alive
+         */
+        private void checkProcessIsAlive() {
+            if (process.isProcessAlive()) {
+                return;
+            }
+            // Don't log here - it just causes double logging when the exception gets logged
+            throw new ElasticsearchException("[{}] Unexpected death of autodetect: {}", job.getId(), process.readError());
+        }
+
+        /**
+         * Stops the process and reports the outcome of the upgrade.
+         * <p>
+         * A task is queued on the worker executor that calls {@code onFinish} with {@code e} in every
+         * case: straight away if the process is no longer alive, otherwise after closing the process
+         * (when it is ready) or killing it and waiting for the result processor (when it is not).
+         * After queueing that task the worker executor is shut down and the state streamer cancelled.
+         *
+         * @param e the failure to report to {@code onFinish}, or {@code null} if there is none
+         */
+        void shutdown(Exception e) {
+            autodetectWorkerExecutor.execute(() -> {
+                if (process.isProcessAlive() == false) {
+                    onFinish.accept(e);
+                    return;
+                }
+                try {
+                    if (process.isReady()) {
+                        process.close();
+                    } else {
+                        // Mark the processor as killed before killing the process
+                        processor.setProcessKilled();
+                        process.kill(true);
+                        processor.awaitCompletion();
+                    }
+                } catch (IOException | TimeoutException exc) {
+                    // A failure to stop the process is only logged; onFinish still receives the original e
+                    logger.warn(() -> new ParameterizedMessage("[{}] [{}] failed to shutdown process", jobId, snapshotId), exc);
+                } finally {
+                    onFinish.accept(e);
+                }
+            });
+            // NOTE(review): presumably the task queued above still runs after shutdown() is requested - confirm
+            // against AutodetectWorkerExecutorService
+            autodetectWorkerExecutor.shutdown();
+            stateStreamer.cancel();
+        }
+    }
+}

+ 5 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/NativeAutodetectProcess.java

@@ -126,4 +126,9 @@ class NativeAutodetectProcess extends AbstractNativeProcess implements Autodetec
     private AutodetectControlMsgWriter newMessageWriter() {
         return new AutodetectControlMsgWriter(recordWriter(), numberOfFields());
     }
+
+    /**
+     * Writes a background persist control message carrying the given snapshot timestamp, id and
+     * description through a newly created control message writer.
+     */
+    @Override
+    public void persistState(long snapshotTimestamp, String snapshotId, String snapshotDescription) throws IOException {
+        AutodetectControlMsgWriter controlMsgWriter = newMessageWriter();
+        controlMsgWriter.writeStartBackgroundPersistMessage(snapshotTimestamp, snapshotId, snapshotDescription);
+    }
 }

+ 6 - 5
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/NativeAutodetectProcessFactory.java

@@ -71,17 +71,18 @@ public class NativeAutodetectProcessFactory implements AutodetectProcessFactory
     }
 
     @Override
-    public AutodetectProcess createAutodetectProcess(Job job,
+    public AutodetectProcess createAutodetectProcess(String pipelineId,
+                                                     Job job,
                                                      AutodetectParams params,
                                                      ExecutorService executorService,
                                                      Consumer<String> onProcessCrash) {
         List<Path> filesToDelete = new ArrayList<>();
         ProcessPipes processPipes = new ProcessPipes(env, NAMED_PIPE_HELPER, processConnectTimeout, AutodetectBuilder.AUTODETECT,
-            job.getId(), null, false, true, true, params.modelSnapshot() != null,
+            pipelineId, null, false, true, true, params.modelSnapshot() != null,
             AutodetectBuilder.DONT_PERSIST_MODEL_STATE_SETTING.get(settings) == false);
         createNativeProcess(job, params, processPipes, filesToDelete);
         boolean includeTokensField = MachineLearning.CATEGORIZATION_TOKENIZATION_IN_JAVA
-                && job.getAnalysisConfig().getCategorizationFieldName() != null;
+            && job.getAnalysisConfig().getCategorizationFieldName() != null;
         // The extra 1 is the control field
         int numberOfFields = job.allInputFields().size() + (includeTokensField ? 1 : 0) + 1;
 
@@ -89,8 +90,8 @@ public class NativeAutodetectProcessFactory implements AutodetectProcessFactory
         ProcessResultsParser<AutodetectResult> resultsParser = new ProcessResultsParser<>(AutodetectResult.PARSER,
             NamedXContentRegistry.EMPTY);
         NativeAutodetectProcess autodetect = new NativeAutodetectProcess(
-                job.getId(), nativeController, processPipes, numberOfFields,
-                filesToDelete, resultsParser, onProcessCrash);
+            job.getId(), nativeController, processPipes, numberOfFields,
+            filesToDelete, resultsParser, onProcessCrash);
         try {
             autodetect.start(executorService, stateProcessor);
             return autodetect;

+ 251 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/output/JobSnapshotUpgraderResultProcessor.java

@@ -0,0 +1,251 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.job.process.autodetect.output;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.logging.log4j.message.ParameterizedMessage;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.action.support.WriteRequest;
+import org.elasticsearch.xpack.core.ml.MachineLearningField;
+import org.elasticsearch.xpack.core.ml.annotations.Annotation;
+import org.elasticsearch.xpack.core.ml.job.process.autodetect.output.FlushAcknowledgement;
+import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.CategorizerStats;
+import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSizeStats;
+import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
+import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.Quantiles;
+import org.elasticsearch.xpack.core.ml.job.results.AnomalyRecord;
+import org.elasticsearch.xpack.core.ml.job.results.Bucket;
+import org.elasticsearch.xpack.core.ml.job.results.CategoryDefinition;
+import org.elasticsearch.xpack.core.ml.job.results.Forecast;
+import org.elasticsearch.xpack.core.ml.job.results.ForecastRequestStats;
+import org.elasticsearch.xpack.core.ml.job.results.Influencer;
+import org.elasticsearch.xpack.core.ml.job.results.ModelPlot;
+import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister;
+import org.elasticsearch.xpack.ml.job.process.autodetect.AutodetectProcess;
+import org.elasticsearch.xpack.ml.job.results.AutodetectResult;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+
+/**
+ * A runnable class that reads the autodetect process output in the
+ * {@link #process()} method and persists parsed
+ * results via the {@linkplain JobResultsPersister} passed in the constructor.
+ * <p>
+ * This is a single purpose result processor and only handles snapshot writes
+ */
+public class JobSnapshotUpgraderResultProcessor {
+
+    private static final Logger LOGGER = LogManager.getLogger(JobSnapshotUpgraderResultProcessor.class);
+    final CountDownLatch completionLatch = new CountDownLatch(1);
+    private final String jobId;
+    private final String snapshotId;
+    private final JobResultsPersister persister;
+    private final AutodetectProcess process;
+    private final JobResultsPersister.Builder bulkResultsPersister;
+    private volatile boolean processKilled;
+    private volatile boolean failed;
+
+    /**
+     * Creates a result processor for one snapshot upgrade. All arguments must be non-null.
+     *
+     * @param jobId             id of the job the snapshot belongs to
+     * @param snapshotId        id of the snapshot being upgraded
+     * @param persister         used to persist the model snapshot and to build the bulk persister
+     * @param autodetectProcess the process whose output is read
+     */
+    public JobSnapshotUpgraderResultProcessor(String jobId,
+                                              String snapshotId,
+                                              JobResultsPersister persister,
+                                              AutodetectProcess autodetectProcess) {
+        this.jobId = Objects.requireNonNull(jobId);
+        this.snapshotId = Objects.requireNonNull(snapshotId);
+        this.persister = Objects.requireNonNull(persister);
+        this.process = Objects.requireNonNull(autodetectProcess);
+        // The bulk persister uses isAlive() as its should-retry check
+        this.bulkResultsPersister = persister.bulkPersisterBuilder(jobId).shouldRetry(this::isAlive);
+    }
+
+    /**
+     * Reads all results from the process and, unless the process was marked as killed, executes the
+     * outstanding bulk request afterwards.
+     * <p>
+     * An exception escaping {@code readResults()} marks this processor as failed and is logged
+     * according to its likely cause; a failure of the bulk request is only logged. The completion
+     * latch is released in all cases.
+     */
+    public void process() {
+
+        // If a function call in this throws for some reason we don't want it
+        // to kill the results reader thread as autodetect will be blocked
+        // trying to write its output.
+        try {
+            readResults();
+            try {
+                if (processKilled == false) {
+                    bulkResultsPersister.executeRequest();
+                }
+            } catch (Exception e) {
+                LOGGER.warn(new ParameterizedMessage(
+                    "[{}] [{}] Error persisting model snapshot upgrade results", jobId, snapshotId), e);
+            }
+        } catch (Exception e) {
+            failed = true;
+
+            if (processKilled) {
+                // Don't log the stack trace in this case.  Log just enough to hint
+                // that it would have been better to close jobs before shutting down,
+                // but we now fully expect jobs to move between nodes without doing
+                // all their graceful close activities.
+                LOGGER.warn(
+                    "[{}] [{}] some model snapshot upgrade results not processed due to the process being killed",
+                    jobId,
+                    snapshotId);
+            } else if (process.isProcessAliveAfterWaiting() == false) {
+                // Don't log the stack trace to not shadow the root cause.
+                LOGGER.warn(
+                    "[{}] [{}] some model snapshot upgrade results not processed due to the termination of autodetect",
+                    jobId,
+                    snapshotId);
+            } else {
+                // We should only get here if the iterator throws in which
+                // case parsing the autodetect output has failed.
+                LOGGER.error(new ParameterizedMessage(
+                    "[{}] [{}] error parsing model snapshot upgrade output", jobId, snapshotId), e);
+            }
+        } finally {
+            // Release anyone blocked in awaitCompletion()
+            completionLatch.countDown();
+        }
+    }
+
+    /**
+     * Iterates over the results produced by the process and hands each one to
+     * {@link #processResult(AutodetectResult)}.
+     * <p>
+     * While the process is alive, a failure on a single result is logged and iteration continues;
+     * otherwise the exception is rethrown. The process output stream is consumed and closed in
+     * every case.
+     */
+    private void readResults() {
+        try {
+            final Iterator<AutodetectResult> results = process.readAutodetectResults();
+            while (results.hasNext()) {
+                try {
+                    processResult(results.next());
+                } catch (Exception ex) {
+                    if (isAlive()) {
+                        LOGGER.warn(
+                            new ParameterizedMessage("[{}] [{}] Error processing model snapshot upgrade result", jobId, snapshotId),
+                            ex);
+                    } else {
+                        throw ex;
+                    }
+                }
+            }
+        } finally {
+            process.consumeAndCloseOutputStream();
+        }
+    }
+
+    /**
+     * Marks the process as killed. From then on results are ignored, the final bulk request is
+     * skipped and {@code isAlive()} reports {@code false}.
+     */
+    public void setProcessKilled() {
+        processKilled = true;
+    }
+
+    /**
+     * Reports a result type that the snapshot upgrade does not handle by logging it at info level.
+     *
+     * @param resultType name of the unexpected result type
+     */
+    private void logUnexpectedResult(String resultType) {
+        final String msg = "[" + jobId + "] [" + snapshotId + "] unexpected result read [" + resultType + "]";
+        // This should never happen. NOTE(review): `assert true` can never fail, so even with -ea
+        // this method only logs; confirm whether `assert false` was intended here.
+        assert true : msg;
+        LOGGER.info(msg);
+    }
+
+    /**
+     * Handles one result read from the process.
+     * <p>
+     * Only a model snapshot is persisted (with an immediate refresh); every other populated result
+     * type is reported through {@code logUnexpectedResult}. Nothing is done once the process has
+     * been marked as killed.
+     *
+     * @param result the result to handle
+     */
+    void processResult(AutodetectResult result) {
+        if (processKilled) {
+            return;
+        }
+
+        Bucket bucket = result.getBucket();
+        if (bucket != null) {
+            logUnexpectedResult(Bucket.RESULT_TYPE_VALUE);
+        }
+        List<AnomalyRecord> records = result.getRecords();
+        if (records != null && records.isEmpty() == false) {
+            logUnexpectedResult(AnomalyRecord.RESULT_TYPE_VALUE);
+        }
+        List<Influencer> influencers = result.getInfluencers();
+        if (influencers != null && influencers.isEmpty() == false) {
+            logUnexpectedResult(Influencer.RESULT_TYPE_VALUE);
+        }
+        CategoryDefinition categoryDefinition = result.getCategoryDefinition();
+        if (categoryDefinition != null) {
+            logUnexpectedResult(CategoryDefinition.TYPE.getPreferredName());
+        }
+        CategorizerStats categorizerStats = result.getCategorizerStats();
+        if (categorizerStats != null) {
+            logUnexpectedResult(CategorizerStats.RESULT_TYPE_VALUE);
+        }
+        ModelPlot modelPlot = result.getModelPlot();
+        if (modelPlot != null) {
+            // Report the model plot under its own type name, not the model snapshot's
+            logUnexpectedResult(ModelPlot.RESULT_TYPE_VALUE);
+        }
+        Annotation annotation = result.getAnnotation();
+        if (annotation != null) {
+            logUnexpectedResult(Annotation.TYPE.getPreferredName());
+        }
+        Forecast forecast = result.getForecast();
+        if (forecast != null) {
+            logUnexpectedResult(Forecast.RESULT_TYPE_VALUE);
+        }
+        ForecastRequestStats forecastRequestStats = result.getForecastRequestStats();
+        if (forecastRequestStats != null) {
+            logUnexpectedResult(ForecastRequestStats.RESULT_TYPE_VALUE);
+        }
+        ModelSizeStats modelSizeStats = result.getModelSizeStats();
+        if (modelSizeStats != null) {
+            logUnexpectedResult(ModelSizeStats.RESULT_TYPE_VALUE);
+        }
+        ModelSnapshot modelSnapshot = result.getModelSnapshot();
+        if (modelSnapshot != null) {
+            // The one result this processor exists for: write the upgraded snapshot document
+            BulkResponse bulkResponse = persister.persistModelSnapshot(modelSnapshot, WriteRequest.RefreshPolicy.IMMEDIATE, this::isAlive);
+            assert bulkResponse.getItems().length == 1;
+        }
+        Quantiles quantiles = result.getQuantiles();
+        if (quantiles != null) {
+            logUnexpectedResult(Quantiles.TYPE.getPreferredName());
+        }
+        FlushAcknowledgement flushAcknowledgement = result.getFlushAcknowledgement();
+        if (flushAcknowledgement != null) {
+            logUnexpectedResult(FlushAcknowledgement.TYPE.getPreferredName());
+        }
+    }
+
+    /**
+     * Blocks until {@code process()} has finished, waiting at most
+     * {@code MachineLearningField.STATE_PERSIST_RESTORE_TIMEOUT} (in whole minutes), and then
+     * commits the job's state writes.
+     * <p>
+     * If the waiting thread is interrupted, the interrupt flag is restored, the interruption is
+     * logged and the method returns without committing the state writes.
+     *
+     * @throws TimeoutException if the processor did not finish within the timeout
+     */
+    public void awaitCompletion() throws TimeoutException {
+        try {
+            // Although the results won't take 30 minutes to finish, the pipe won't be closed
+            // until the state is persisted, and that can take a while
+            if (completionLatch.await(MachineLearningField.STATE_PERSIST_RESTORE_TIMEOUT.getMinutes(),
+                TimeUnit.MINUTES) == false) {
+                throw new TimeoutException(
+                    "Timed out waiting for model snapshot upgrader results processor to complete for job "
+                        + jobId
+                        + " and snapshot "
+                        + snapshotId);
+            }
+
+            // These lines ensure that the "completion" we're awaiting includes making the results searchable
+            persister.commitStateWrites(jobId);
+
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            LOGGER.info("[{}] [{}] Interrupted waiting for model snapshot upgrade results processor to complete", jobId, snapshotId);
+        }
+    }
+
+
+    /**
+     * If failed then there was an error parsing the results that cannot be recovered from.
+     * The flag is set by {@link #process()} when an exception escapes from reading the results.
+     *
+     * @return true if failed
+     */
+    public boolean isFailed() {
+        return failed;
+    }
+
+    /**
+     * @return {@code false} once the process has been marked as killed; otherwise whatever
+     *         {@code process.isProcessAliveAfterWaiting()} reports
+     */
+    private boolean isAlive() {
+        return processKilled == false && process.isProcessAliveAfterWaiting();
+    }
+
+
+}

+ 16 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/autodetect/writer/AutodetectControlMsgWriter.java

@@ -244,4 +244,20 @@ public class AutodetectControlMsgWriter extends AbstractControlMsgWriter {
         fillCommandBuffer();
         lengthEncodedWriter.flush();
     }
+
+    /**
+     * Writes the background persist control message and flushes it to the process. The message is the
+     * background persist message code immediately followed by the timestamp in whole seconds, then a
+     * space and the snapshot id and, when a description is given, another space and the description.
+     *
+     * @param snapshotTimestampMs The snapshot timestamp with MILLISECONDS resolution
+     * @param snapshotId The snapshot ID
+     * @param description The snapshot description; left out of the message when {@code null}
+     * @throws IOException if writing or flushing the message fails
+     */
+    public void writeStartBackgroundPersistMessage(long snapshotTimestampMs, String snapshotId, String description) throws IOException {
+        final long snapshotTimestampSeconds = snapshotTimestampMs / 1000;
+        StringBuilder message = new StringBuilder(BACKGROUND_PERSIST_MESSAGE_CODE);
+        message.append(snapshotTimestampSeconds);
+        message.append(" ");
+        message.append(snapshotId);
+        if (description != null) {
+            message.append(" ");
+            message.append(description);
+        }
+        writeMessage(message.toString());
+        fillCommandBuffer();
+        lengthEncodedWriter.flush();
+    }
 }

+ 4 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/normalizer/MultiplyingNormalizerProcess.java

@@ -84,6 +84,10 @@ public class MultiplyingNormalizerProcess implements NormalizerProcess {
         // Nothing to do
     }
 
+    @Override
+    public void persistState(long snapshotTimestamp, String snapshotId, String snapshotDescription) {
+        // Nothing to do
+    }
+
     @Override
     public void flushStream() {
         // Nothing to do

+ 4 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/process/normalizer/NativeNormalizerProcess.java

@@ -38,6 +38,10 @@ class NativeNormalizerProcess extends AbstractNativeProcess implements Normalize
         // nothing to persist
     }
 
+    @Override
+    public void persistState(long snapshotTimestamp, String snapshotId, String snapshotDescription) {
+        // nothing to persist
+    }
+
     @Override
     public NormalizerResultHandler createNormalizedResultsHandler() {
         return new NormalizerResultHandler(processOutStream());

+ 85 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradePredicate.java

@@ -0,0 +1,85 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.ml.job.snapshot.upgrader;
+
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.persistent.PersistentTasksCustomMetadata;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeState;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+
+import java.util.Optional;
+import java.util.function.Predicate;
+
+import static org.elasticsearch.xpack.ml.job.task.OpenJobPersistentTasksExecutor.checkAssignmentState;
+
+public class SnapshotUpgradePredicate implements Predicate<PersistentTasksCustomMetadata.PersistentTask<?>> {
+    private final boolean waitForCompletion;
+    private final Logger logger;
+    private volatile Exception exception;
+    private volatile String node = "";
+    private volatile boolean shouldCancel;
+    private volatile boolean isCompleted;
+
+    public SnapshotUpgradePredicate(boolean waitForCompletion, Logger logger) {
+        this.waitForCompletion = waitForCompletion;
+        this.logger = logger;
+    }
+
+    public Exception getException() {
+        return exception;
+    }
+
+    public String getNode() {
+        return node;
+    }
+
+    public boolean isShouldCancel() {
+        return shouldCancel;
+    }
+
+    public boolean isCompleted() {
+        return isCompleted;
+    }
+
+    @Override
+    public boolean test(PersistentTasksCustomMetadata.PersistentTask<?> persistentTask) {
+        // Persistent task being null means it has been removed from state, and is now complete
+        if (persistentTask == null) {
+            isCompleted = true;
+            return true;
+        }
+        SnapshotUpgradeTaskState snapshotUpgradeTaskState = (SnapshotUpgradeTaskState) persistentTask.getState();
+        SnapshotUpgradeState snapshotUpgradeState = snapshotUpgradeTaskState == null ?
+            SnapshotUpgradeState.STOPPED :
+            snapshotUpgradeTaskState.getState();
+        String reason = snapshotUpgradeTaskState == null ? "" : snapshotUpgradeTaskState.getReason();
+        PersistentTasksCustomMetadata.Assignment assignment = persistentTask.getAssignment();
+        // This logic is only appropriate when opening a job, not when reallocating following a failure,
+        // and this is why this class must only be used when opening a job
+        SnapshotUpgradeTaskParams params = (SnapshotUpgradeTaskParams) persistentTask.getParams();
+        Optional<ElasticsearchException> assignmentException = checkAssignmentState(assignment, params.getJobId(), logger);
+        if (assignmentException.isPresent()) {
+            exception = assignmentException.get();
+            shouldCancel = true;
+            return true;
+        }
+        if (snapshotUpgradeState == SnapshotUpgradeState.FAILED) {
+            exception = ExceptionsHelper.serverError("Unexpected state [" + snapshotUpgradeState
+                + "] while waiting for to be assigned to a node; recorded reason [" + reason + "]");
+            shouldCancel = true;
+            return true;
+        }
+        if (persistentTask.getExecutorNode() != null) {
+            node = persistentTask.getExecutorNode();
+            // If waitForCompletion is true, we need to wait for the task to be finished. Otherwise, return true once it is assigned
+            return waitForCompletion == false;
+        }
+        return false;
+    }
+}

+ 39 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradeTask.java

@@ -0,0 +1,39 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.ml.job.snapshot.upgrader;
+
+import org.elasticsearch.persistent.AllocatedPersistentTask;
+import org.elasticsearch.tasks.TaskId;
+import org.elasticsearch.xpack.core.ml.MlTasks;
+
+import java.util.Map;
+
+/**
+ * Allocated persistent task for a model snapshot upgrade. It carries the id of the job and the id of
+ * the snapshot being upgraded, and is described by the snapshot upgrade task id built from the two.
+ */
+public class SnapshotUpgradeTask extends AllocatedPersistentTask {
+
+    private final String jobId;
+    private final String snapshotId;
+
+    /**
+     * @param jobId      id of the job owning the snapshot
+     * @param snapshotId id of the snapshot to upgrade
+     * @param id         task id passed through to the parent class
+     * @param type       task type passed through to the parent class
+     * @param action     task action passed through to the parent class
+     * @param parentTask parent task id passed through to the parent class
+     * @param headers    task headers passed through to the parent class
+     */
+    public SnapshotUpgradeTask(String jobId,
+                               String snapshotId,
+                               long id,
+                               String type,
+                               String action,
+                               TaskId parentTask,
+                               Map<String, String> headers) {
+        super(id, type, action, MlTasks.snapshotUpgradeTaskId(jobId, snapshotId), parentTask, headers);
+        this.jobId = jobId;
+        this.snapshotId = snapshotId;
+    }
+
+    /**
+     * @return the id of the job owning the snapshot
+     */
+    public String getJobId() {
+        return jobId;
+    }
+
+    /**
+     * @return the id of the snapshot being upgraded
+     */
+    public String getSnapshotId() {
+        return snapshotId;
+    }
+}

+ 282 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradeTaskExecutor.java

@@ -0,0 +1,282 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.ml.job.snapshot.upgrader;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.logging.log4j.message.ParameterizedMessage;
+import org.elasticsearch.ElasticsearchStatusException;
+import org.elasticsearch.ResourceNotFoundException;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.persistent.AllocatedPersistentTask;
+import org.elasticsearch.persistent.PersistentTaskState;
+import org.elasticsearch.persistent.PersistentTasksCustomMetadata;
+import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.tasks.TaskId;
+import org.elasticsearch.xpack.core.ml.MlConfigIndex;
+import org.elasticsearch.xpack.core.ml.MlTasks;
+import org.elasticsearch.xpack.core.ml.job.persistence.AnomalyDetectorsIndex;
+import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
+import org.elasticsearch.xpack.core.ml.job.results.Result;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeState;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+import org.elasticsearch.xpack.ml.MachineLearning;
+import org.elasticsearch.xpack.ml.job.JobNodeSelector;
+import org.elasticsearch.xpack.ml.job.persistence.JobDataDeleter;
+import org.elasticsearch.xpack.ml.job.persistence.JobResultsProvider;
+import org.elasticsearch.xpack.ml.job.process.autodetect.AutodetectProcessManager;
+import org.elasticsearch.xpack.ml.notifications.AnomalyDetectionAuditor;
+import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
+import org.elasticsearch.xpack.ml.task.AbstractJobPersistentTasksExecutor;
+
+import java.util.Collections;
+import java.util.Map;
+import java.util.Optional;
+
+
+public class SnapshotUpgradeTaskExecutor extends AbstractJobPersistentTasksExecutor<SnapshotUpgradeTaskParams> {
+
+    private static final Logger logger = LogManager.getLogger(SnapshotUpgradeTaskExecutor.class);
+    private final AutodetectProcessManager autodetectProcessManager;
+    private final AnomalyDetectionAuditor auditor;
+    private final JobResultsProvider jobResultsProvider;
+    private volatile ClusterState clusterState;
+    private final Client client;
+
+    /**
+     * Creates the executor for snapshot upgrade persistent tasks. Tasks run on the ML utility thread pool.
+     * An auditor and a results provider are built from the supplied client.
+     */
+    public SnapshotUpgradeTaskExecutor(Settings settings,
+                                       ClusterService clusterService,
+                                       AutodetectProcessManager autodetectProcessManager,
+                                       MlMemoryTracker memoryTracker,
+                                       IndexNameExpressionResolver expressionResolver,
+                                       Client client) {
+        super(MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME,
+            MachineLearning.UTILITY_THREAD_POOL_NAME,
+            settings,
+            clusterService,
+            memoryTracker,
+            expressionResolver);
+        this.autodetectProcessManager = autodetectProcessManager;
+        this.auditor = new AnomalyDetectionAuditor(client, clusterService);
+        this.jobResultsProvider = new JobResultsProvider(client, settings, expressionResolver);
+        this.client = client;
+        // Keep hold of the most recent cluster state; nodeOperation needs it to create the state index and alias
+        clusterService.addListener(event -> clusterState = event.state());
+    }
+
+    /**
+     * Picks the assignment for a snapshot upgrade task. A preliminary assignment from
+     * {@code getPotentialAssignment} wins if there is one; otherwise a node is selected for the job.
+     */
+    @Override
+    public PersistentTasksCustomMetadata.Assignment getAssignment(SnapshotUpgradeTaskParams params, ClusterState clusterState) {
+        // Read the tracker's freshness up front, before the preliminary assignment checks run
+        final boolean memoryTrackerRecentlyRefreshed = memoryTracker.isRecentlyRefreshed();
+        return getPotentialAssignment(params, clusterState).orElseGet(() -> {
+            JobNodeSelector nodeSelector = new JobNodeSelector(
+                clusterState, params.getJobId(), MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME, memoryTracker, 0, node -> null);
+            return nodeSelector.selectNode(
+                Integer.MAX_VALUE, Integer.MAX_VALUE, maxMachineMemoryPercent, memoryTrackerRecentlyRefreshed, useAutoMemoryPercentage);
+        });
+    }
+
+    /**
+     * Runs the snapshot upgrade on the node the task was assigned to.
+     * <p>
+     * The persisted task state is inspected first: a task found in SAVING_NEW_STATE has its snapshot
+     * deleted and is failed, and a task found in FAILED is failed again with the recorded reason.
+     * Otherwise the state index and alias are created if necessary and the upgrade is handed to the
+     * {@link AutodetectProcessManager}; the task is marked completed or failed according to the outcome.
+     */
+    @Override
+    protected void nodeOperation(AllocatedPersistentTask task, SnapshotUpgradeTaskParams params, PersistentTaskState state) {
+        SnapshotUpgradeTaskState jobTaskState = (SnapshotUpgradeTaskState) state;
+        // No persisted state yet means there is no previous attempt to account for
+        SnapshotUpgradeState jobState = jobTaskState == null ? null : jobTaskState.getState();
+        logger.info("[{}] [{}] starting to execute task",
+            params.getJobId(),
+            params.getSnapshotId());
+
+        // This means that we have loaded the snapshot and possibly snapshot was partially updated
+        // This is no good, we should remove the snapshot
+        if (SnapshotUpgradeState.SAVING_NEW_STATE.equals(jobState)) {
+            deleteSnapshotAndFailTask(task, params.getJobId(), params.getSnapshotId());
+            return;
+        }
+        // if the task is failed, that means it was set that way purposefully. So, assuming there is no bad snapshot state
+        if (SnapshotUpgradeState.FAILED.equals(jobState)) {
+            logger.warn(
+                "[{}] [{}] upgrade task reassigned to another node while failed",
+                params.getJobId(),
+                params.getSnapshotId());
+            task.markAsFailed(new ElasticsearchStatusException(
+                "Task to upgrade job [{}] snapshot [{}] got reassigned while failed. Reason [{}]",
+                RestStatus.INTERNAL_SERVER_ERROR,
+                params.getJobId(),
+                params.getSnapshotId(),
+                jobTaskState.getReason() == null ? "__unknown__" : jobTaskState.getReason()));
+            return;
+        }
+
+        // Invoked once the state index and alias check has finished: on success run the upgrade and
+        // complete or fail the task based on its result, on failure record the failure and fail the task
+        ActionListener<Boolean> stateAliasHandler = ActionListener.wrap(
+           r -> autodetectProcessManager.upgradeSnapshot((SnapshotUpgradeTask)task, e -> {
+               // A null exception signals that the upgrade succeeded
+               if (e == null) {
+                   auditor.info(params.getJobId(), "Finished upgrading snapshot [" + params.getSnapshotId() + "]");
+                   logger.info("[{}] [{}] finished upgrading snapshot", params.getJobId(), params.getSnapshotId());
+                   task.markAsCompleted();
+               } else {
+                   logger.warn(
+                       () -> new ParameterizedMessage(
+                           "[{}] failed upgrading snapshot [{}]",
+                           params.getJobId(),
+                           params.getSnapshotId()),
+                       e);
+                   auditor.warning(params.getJobId(),
+                       "failed upgrading snapshot ["
+                           + params.getSnapshotId()
+                           + "] with exception "
+                           + ExceptionsHelper.unwrapCause(e).getMessage());
+                   task.markAsFailed(e);
+               }
+           }),
+           e -> {
+               logger.warn(
+                   () -> new ParameterizedMessage(
+                       "[{}] failed upgrading snapshot [{}] as ml state alias creation failed",
+                       params.getJobId(),
+                       params.getSnapshotId()),
+                   e);
+               auditor.warning(params.getJobId(),
+                   "failed upgrading snapshot ["
+                       + params.getSnapshotId()
+                       + "] with exception "
+                       + ExceptionsHelper.unwrapCause(e).getMessage());
+               // We need to update cluster state so the API caller can be notified and exit
+               // As we have not set the task state to STARTED, it might still be waiting.
+               task.updatePersistentTaskState(
+                   new SnapshotUpgradeTaskState(SnapshotUpgradeState.FAILED, -1, e.getMessage()),
+                   ActionListener.wrap(
+                       r -> task.markAsFailed(e),
+                       failure -> {
+                           // The task is failed with the original error even if the state update itself failed
+                           logger.warn(
+                               new ParameterizedMessage(
+                                   "[{}] [{}] failed to set task to failed",
+                                   params.getJobId(),
+                                   params.getSnapshotId()),
+                               failure);
+                           task.markAsFailed(e);
+                       }
+                   ));
+           }
+        );
+        AnomalyDetectorsIndex.createStateIndexAndAliasIfNecessary(client, clusterState, expressionResolver, stateAliasHandler);
+    }
+
+    /**
+     * Builds the {@link SnapshotUpgradeTask} for a persistent task, taking the job id and snapshot id
+     * from the persistent task's parameters.
+     */
+    @Override
+    protected AllocatedPersistentTask createTask(long id, String type, String action, TaskId parentTaskId,
+                                                 PersistentTasksCustomMetadata.PersistentTask<SnapshotUpgradeTaskParams> persistentTask,
+                                                 Map<String, String> headers) {
+        SnapshotUpgradeTaskParams taskParams = persistentTask.getParams();
+        String jobId = taskParams.getJobId();
+        String snapshotId = taskParams.getSnapshotId();
+        return new SnapshotUpgradeTask(jobId, snapshotId, id, type, action, parentTaskId, headers);
+    }
+
+    /**
+     * Snapshot upgrades do not tolerate missing indices: every index of interest must already exist.
+     */
+    @Override
+    protected boolean allowsMissingIndices() {
+        return false;
+    }
+
+    /**
+     * The indices this task depends on: the job state index pattern, the ML config index and the
+     * results write alias of the job named in {@code params}.
+     */
+    @Override
+    protected String[] indicesOfInterest(SnapshotUpgradeTaskParams params) {
+        return new String[]{
+            AnomalyDetectorsIndex.jobStateIndexPattern(),
+            MlConfigIndex.indexName(),
+            AnomalyDetectorsIndex.resultsWriteAlias(params.getJobId())
+        };
+    }
+
+    /**
+     * @return the job id carried by the task parameters
+     */
+    @Override
+    protected String getJobId(SnapshotUpgradeTaskParams params) {
+        return params.getJobId();
+    }
+
+    /**
+     * Looks up the given model snapshot, deletes it if it is found and, whatever the outcome, marks the
+     * task as failed with a message describing what happened to the snapshot.
+     *
+     * @param task       the task to fail
+     * @param jobId      id of the job owning the snapshot
+     * @param snapshotId id of the snapshot to remove
+     */
+    private void deleteSnapshotAndFailTask(AllocatedPersistentTask task, String jobId, String snapshotId) {
+        ActionListener<Result<ModelSnapshot>> snapshotLookupListener = ActionListener.wrap(
+            lookup -> {
+                // Nothing was found, so there is nothing left to clean up
+                if (lookup == null) {
+                    task.markAsFailed(reassignedWhileRunningException(jobId, snapshotId, "Snapshot is deleted"));
+                    return;
+                }
+                new JobDataDeleter(client, jobId).deleteModelSnapshots(Collections.singletonList(lookup.result), ActionListener.wrap(
+                    deleted -> {
+                        auditor.warning(
+                            jobId,
+                            "Task to upgrade snapshot exited in unknown state. Deleted snapshot [" + snapshotId + "]");
+                        task.markAsFailed(reassignedWhileRunningException(jobId, snapshotId, "Corrupted snapshot deleted"));
+                    },
+                    deleteFailure -> {
+                        logger.warn(
+                            () -> new ParameterizedMessage("[{}] [{}] failed to clean up potentially bad snapshot", jobId, snapshotId),
+                            deleteFailure);
+                        task.markAsFailed(
+                            reassignedWhileRunningException(jobId, snapshotId, "Unable to cleanup potentially corrupted snapshot"));
+                    }
+                ));
+            },
+            lookupFailure -> {
+                if (ExceptionsHelper.unwrapCause(lookupFailure) instanceof ResourceNotFoundException) {
+                    // The snapshot no longer exists, which is the state the cleanup was aiming for
+                    task.markAsFailed(reassignedWhileRunningException(jobId, snapshotId, "Snapshot is deleted"));
+                } else {
+                    logger.warn(
+                        () -> new ParameterizedMessage("[{}] [{}] failed to load bad snapshot for deletion", jobId, snapshotId),
+                        lookupFailure);
+                    task.markAsFailed(
+                        reassignedWhileRunningException(jobId, snapshotId, "Unable to cleanup potentially corrupted snapshot"));
+                }
+            }
+        );
+        jobResultsProvider.getModelSnapshot(jobId, snapshotId, snapshotLookupListener::onResponse, snapshotLookupListener::onFailure);
+    }
+
+    /**
+     * Builds the exception used to fail a task that was reassigned part way through an upgrade.
+     *
+     * @param outcome sentence appended to the message saying what happened to the snapshot
+     */
+    private static ElasticsearchStatusException reassignedWhileRunningException(String jobId, String snapshotId, String outcome) {
+        return new ElasticsearchStatusException(
+            "Task to upgrade job [{}] snapshot [{}] got reassigned while running leaving an unknown snapshot state. " + outcome,
+            RestStatus.INTERNAL_SERVER_ERROR,
+            jobId,
+            snapshotId);
+    }
+}

+ 100 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradeTaskParams.java

@@ -0,0 +1,100 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.ml.job.snapshot.upgrader;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.common.ParseField;
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.xcontent.ConstructingObjectParser;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.persistent.PersistentTaskParams;
+import org.elasticsearch.xpack.core.ml.job.config.Job;
+
+import java.io.IOException;
+import java.util.Objects;
+
+import static org.elasticsearch.xpack.core.ml.MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME;
+
+/**
+ * Parameters of the snapshot upgrade persistent task: the id of the job and the id of the model
+ * snapshot to upgrade. Instances can be streamed, parsed from and rendered to x-content.
+ */
+public class SnapshotUpgradeTaskParams implements PersistentTaskParams {
+
+    public static final ParseField SNAPSHOT_ID = new ParseField("snapshot_id");
+
+    public static final ConstructingObjectParser<SnapshotUpgradeTaskParams, Void> PARSER = new ConstructingObjectParser<>(
+        JOB_SNAPSHOT_UPGRADE_TASK_NAME,
+        true,
+        args -> new SnapshotUpgradeTaskParams((String) args[0], (String) args[1]));
+
+    static {
+        // Constructor argument order: job id first, then snapshot id
+        PARSER.declareString(ConstructingObjectParser.constructorArg(), Job.ID);
+        PARSER.declareString(ConstructingObjectParser.constructorArg(), SNAPSHOT_ID);
+    }
+
+    public static final String NAME = JOB_SNAPSHOT_UPGRADE_TASK_NAME;
+
+    private final String jobId;
+    private final String snapshotId;
+
+    /**
+     * Reads the parameters from a stream, in the order {@code writeTo} writes them.
+     */
+    public SnapshotUpgradeTaskParams(StreamInput in) throws IOException {
+        this(in.readString(), in.readString());
+    }
+
+    public SnapshotUpgradeTaskParams(String jobId, String snapshotId) {
+        this.jobId = jobId;
+        this.snapshotId = snapshotId;
+    }
+
+    public String getJobId() {
+        return jobId;
+    }
+
+    public String getSnapshotId() {
+        return snapshotId;
+    }
+
+    @Override
+    public String getWriteableName() {
+        return NAME;
+    }
+
+    @Override
+    public Version getMinimalSupportedVersion() {
+        return Version.V_7_11_0;
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        out.writeString(jobId);
+        out.writeString(snapshotId);
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        return builder.startObject()
+            .field(Job.ID.getPreferredName(), jobId)
+            .field(SNAPSHOT_ID.getPreferredName(), snapshotId)
+            .endObject();
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+        SnapshotUpgradeTaskParams that = (SnapshotUpgradeTaskParams) o;
+        return Objects.equals(jobId, that.jobId) && Objects.equals(snapshotId, that.snapshotId);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(jobId, snapshotId);
+    }
+}
+
+

+ 21 - 5
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/task/OpenJobPersistentTasksExecutor.java

@@ -65,7 +65,6 @@ public class OpenJobPersistentTasksExecutor extends AbstractJobPersistentTasksEx
     }
 
     private final AutodetectProcessManager autodetectProcessManager;
-    private final MlMemoryTracker memoryTracker;
     private final Client client;
     private final JobResultsProvider jobResultsProvider;
 
@@ -76,7 +75,6 @@ public class OpenJobPersistentTasksExecutor extends AbstractJobPersistentTasksEx
                                           Client client, IndexNameExpressionResolver expressionResolver) {
         super(MlTasks.JOB_TASK_NAME, MachineLearning.UTILITY_THREAD_POOL_NAME, settings, clusterService, memoryTracker, expressionResolver);
         this.autodetectProcessManager = Objects.requireNonNull(autodetectProcessManager);
-        this.memoryTracker = Objects.requireNonNull(memoryTracker);
         this.client = Objects.requireNonNull(client);
         this.jobResultsProvider = new JobResultsProvider(client, settings, expressionResolver);
         clusterService.addListener(event -> clusterState = event.state());
@@ -228,7 +226,25 @@ public class OpenJobPersistentTasksExecutor extends AbstractJobPersistentTasksEx
         return new JobTask(persistentTask.getParams().getJobId(), id, type, action, parentTaskId, headers);
     }
 
-    public static ElasticsearchException makeNoSuitableNodesException(Logger logger, String jobId, String explanation) {
+    /**
+     * Checks whether a persistent task assignment has failed and, if so, builds the exception to report.
+     *
+     * @param assignment the assignment to inspect; {@code null}, the initial assignment and any assigned
+     *                   assignment are not treated as failures
+     * @param jobId      id of the job, used when logging
+     * @param logger     logger the failure is written to
+     * @return the exception describing the failed assignment, or empty if the assignment has not failed
+     */
+    public static Optional<ElasticsearchException> checkAssignmentState(PersistentTasksCustomMetadata.Assignment assignment,
+                                                                        String jobId,
+                                                                        Logger logger) {
+        if (assignment == null
+            || assignment.equals(PersistentTasksCustomMetadata.INITIAL_ASSIGNMENT)
+            || assignment.isAssigned()) {
+            return Optional.empty();
+        }
+        // Assignment has failed on the master node despite passing our "fast fail" validation
+        final ElasticsearchException failure;
+        if (assignment.equals(AWAITING_UPGRADE)) {
+            failure = makeCurrentlyBeingUpgradedException(logger, jobId, assignment.getExplanation());
+        } else if (assignment.getExplanation().contains("[" + EnableAssignmentDecider.ALLOCATION_NONE_EXPLANATION + "]")) {
+            failure = makeAssignmentsNotAllowedException(logger, jobId);
+        } else {
+            failure = makeNoSuitableNodesException(logger, jobId, assignment.getExplanation());
+        }
+        return Optional.of(failure);
+    }
+
+    static ElasticsearchException makeNoSuitableNodesException(Logger logger, String jobId, String explanation) {
         String msg = "Could not open job because no suitable nodes were found, allocation explanation [" + explanation + "]";
         logger.warn("[{}] {}", jobId, msg);
         Exception detail = new IllegalStateException(msg);
@@ -236,14 +252,14 @@ public class OpenJobPersistentTasksExecutor extends AbstractJobPersistentTasksEx
             RestStatus.TOO_MANY_REQUESTS, detail);
     }
 
-    public static ElasticsearchException makeAssignmentsNotAllowedException(Logger logger, String jobId) {
+    static ElasticsearchException makeAssignmentsNotAllowedException(Logger logger, String jobId) {
         String msg = "Cannot open jobs because persistent task assignment is disabled by the ["
             + EnableAssignmentDecider.CLUSTER_TASKS_ALLOCATION_ENABLE_SETTING.getKey() + "] setting";
         logger.warn("[{}] {}", jobId, msg);
         return new ElasticsearchStatusException(msg, RestStatus.TOO_MANY_REQUESTS);
     }
 
-    public static ElasticsearchException makeCurrentlyBeingUpgradedException(Logger logger, String jobId, String explanation) {
+    static ElasticsearchException makeCurrentlyBeingUpgradedException(Logger logger, String jobId, String explanation) {
         String msg = "Cannot open jobs when upgrade mode is enabled";
         logger.warn("[{}] {}", jobId, msg);
         return new ElasticsearchStatusException(msg, RestStatus.TOO_MANY_REQUESTS);

+ 9 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/process/NativeProcess.java

@@ -36,6 +36,15 @@ public interface NativeProcess extends Closeable {
      */
     void persistState() throws IOException;
 
+    /**
+     * Ask the process to persist state, even if it is unchanged,
+     * using the supplied snapshot timestamp, id and description.
+     * @param snapshotTimestampMs The snapshot timestamp in milliseconds
+     * @param snapshotId The id of the snapshot to save
+     * @param snapshotDescription The snapshot description
+     * @throws IOException if writing the request fails
+     */
+    void persistState(long snapshotTimestampMs, String snapshotId, String snapshotDescription) throws IOException;
+
     /**
      * Flush the output data stream
      */

+ 60 - 0
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/modelsnapshots/RestUpgradeJobModelSnapshotAction.java

@@ -0,0 +1,60 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.xpack.ml.rest.modelsnapshots;
+
+import org.elasticsearch.client.node.NodeClient;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.rest.BaseRestHandler;
+import org.elasticsearch.rest.RestRequest;
+import org.elasticsearch.rest.action.RestToXContentListener;
+import org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction;
+import org.elasticsearch.xpack.core.ml.job.config.Job;
+import org.elasticsearch.xpack.ml.MachineLearning;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import static org.elasticsearch.rest.RestRequest.Method.POST;
+import static org.elasticsearch.xpack.core.ml.action.UpgradeJobModelSnapshotAction.Request.DEFAULT_TIMEOUT;
+
+/**
+ * REST handler for upgrading a model snapshot of an anomaly detection job. It accepts a POST to the
+ * {@code _upgrade} endpoint of a snapshot and forwards the request to {@link UpgradeJobModelSnapshotAction}.
+ */
+public class RestUpgradeJobModelSnapshotAction extends BaseRestHandler {
+
+    @Override
+    public List<Route> routes() {
+        String upgradePath = MachineLearning.BASE_PATH
+            + "anomaly_detectors/{" + Job.ID.getPreferredName()
+            + "}/model_snapshots/{" + UpgradeJobModelSnapshotAction.Request.SNAPSHOT_ID.getPreferredName()
+            + "}/_upgrade";
+        return Collections.singletonList(new Route(POST, upgradePath));
+    }
+
+    @Override
+    public String getName() {
+        return "ml_upgrade_job_model_snapshot_action";
+    }
+
+    /**
+     * Builds the upgrade request from the path and query parameters. The timeout falls back to the
+     * request's default timeout and waiting for completion defaults to {@code false}.
+     */
+    @Override
+    protected RestChannelConsumer prepareRequest(RestRequest restRequest, NodeClient client) throws IOException {
+        final String timeoutParamName = UpgradeJobModelSnapshotAction.Request.TIMEOUT.getPreferredName();
+        final String waitForCompletionParamName = UpgradeJobModelSnapshotAction.Request.WAIT_FOR_COMPLETION.getPreferredName();
+
+        String jobId = restRequest.param(Job.ID.getPreferredName());
+        String snapshotId = restRequest.param(UpgradeJobModelSnapshotAction.Request.SNAPSHOT_ID.getPreferredName());
+        String rawTimeout = restRequest.param(timeoutParamName, DEFAULT_TIMEOUT.getStringRep());
+        TimeValue timeout = TimeValue.parseTimeValue(rawTimeout, timeoutParamName);
+        boolean waitForCompletion = restRequest.paramAsBoolean(waitForCompletionParamName, false);
+
+        UpgradeJobModelSnapshotAction.Request request =
+            new UpgradeJobModelSnapshotAction.Request(jobId, snapshotId, timeout, waitForCompletion);
+        return channel -> client.execute(UpgradeJobModelSnapshotAction.INSTANCE, request, new RestToXContentListener<>(channel));
+    }
+}

+ 10 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/task/AbstractJobPersistentTasksExecutor.java

@@ -38,6 +38,7 @@ public abstract class AbstractJobPersistentTasksExecutor<Params extends Persiste
 
     public static List<String> verifyIndicesPrimaryShardsAreActive(ClusterState clusterState,
                                                                    IndexNameExpressionResolver expressionResolver,
+                                                                   boolean allowMissing,
                                                                    String... indicesOfInterest) {
         String[] indices = expressionResolver.concreteIndexNames(clusterState, IndicesOptions.lenientExpandOpen(), indicesOfInterest);
         List<String> unavailableIndices = new ArrayList<>(indices.length);
@@ -45,6 +46,9 @@ public abstract class AbstractJobPersistentTasksExecutor<Params extends Persiste
             // Indices are created on demand from templates.
             // It is not an error if the index doesn't exist yet
             if (clusterState.metadata().hasIndex(index) == false) {
+                if (allowMissing == false) {
+                    unavailableIndices.add(index);
+                }
                 continue;
             }
             IndexRoutingTable routingTable = clusterState.getRoutingTable().index(index);
@@ -89,6 +93,9 @@ public abstract class AbstractJobPersistentTasksExecutor<Params extends Persiste
 
     protected abstract String[] indicesOfInterest(Params params);
     protected abstract String getJobId(Params params);
+    /**
+     * Whether an index of interest that is absent from the cluster state is tolerated when
+     * {@code checkRequiredIndices} verifies that primary shards are active. The value is passed as the
+     * {@code allowMissing} argument of {@code verifyIndicesPrimaryShardsAreActive}; when it is {@code false}
+     * a missing index is reported as unavailable.
+     *
+     * @return {@code true} in this base implementation
+     */
+    protected boolean allowsMissingIndices() {
+        return true;
+    }
 
     public Optional<PersistentTasksCustomMetadata.Assignment> getPotentialAssignment(Params params, ClusterState clusterState) {
         // If we are waiting for an upgrade to complete, we should not assign to a node
@@ -127,10 +134,11 @@ public abstract class AbstractJobPersistentTasksExecutor<Params extends Persiste
     }
 
     public Optional<PersistentTasksCustomMetadata.Assignment> checkRequiredIndices(String jobId,
-                                                                            ClusterState clusterState,
-                                                                            String... indicesOfInterest) {
+                                                                                   ClusterState clusterState,
+                                                                                   String... indicesOfInterest) {
         List<String> unavailableIndices = verifyIndicesPrimaryShardsAreActive(clusterState,
             expressionResolver,
+            allowsMissingIndices(),
             indicesOfInterest);
         if (unavailableIndices.size() != 0) {
             String reason = "Not opening [" + jobId + "], because not all primary shards are active for the following indices [" +

+ 3 - 3
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/process/autodetect/AutodetectProcessManagerTests.java

@@ -273,7 +273,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
         when(autodetectProcess.isProcessAlive()).thenReturn(true);
         when(autodetectProcess.readAutodetectResults()).thenReturn(Collections.emptyIterator());
 
-        autodetectFactory = (j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
+        autodetectFactory = (pid, j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
         Settings.Builder settings = Settings.builder();
         settings.put(MachineLearning.MAX_OPEN_JOBS_PER_NODE.getKey(), 3);
         AutodetectProcessManager manager = createSpyManager(settings.build());
@@ -620,7 +620,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
         }).when(jobManager).getJob(eq("my_id"), any());
 
         AutodetectProcess autodetectProcess = mock(AutodetectProcess.class);
-        autodetectFactory = (j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
+        autodetectFactory = (pid, j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
         AutodetectProcessManager manager = createSpyManager();
         doCallRealMethod().when(manager).create(any(), any(), any(), any());
 
@@ -694,7 +694,7 @@ public class AutodetectProcessManagerTests extends ESTestCase {
         }).when(jobManager).getJob(eq(jobId), any());
 
         AutodetectProcess autodetectProcess = mock(AutodetectProcess.class);
-        autodetectFactory = (j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
+        autodetectFactory = (pid, j, autodetectParams, e, onProcessCrash) -> autodetectProcess;
         return createManager(Settings.EMPTY);
     }
 

+ 89 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/snapshot/upgrader/SnapshotUpgradePredicateTests.java

@@ -0,0 +1,89 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+
+package org.elasticsearch.xpack.ml.job.snapshot.upgrader;
+
+import org.elasticsearch.persistent.PersistentTasksCustomMetadata;
+import org.elasticsearch.persistent.PersistentTasksCustomMetadata.PersistentTask;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.xpack.core.ml.MlTasks;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeState;
+import org.elasticsearch.xpack.core.ml.job.snapshot.upgrade.SnapshotUpgradeTaskState;
+
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
+
+
+public class SnapshotUpgradePredicateTests extends ESTestCase {
+
+    /**
+     * Checks the predicate built with {@code waitForCompletion == true} against three inputs:
+     * no task, an assigned task without state, and an assigned task whose state is FAILED.
+     */
+    public void testWhenWaitForCompletionIsTrue() {
+        final boolean waitForCompletion = true;
+        final SnapshotUpgradeTaskParams params = new SnapshotUpgradeTaskParams("job", "snapshot");
+        final PersistentTasksCustomMetadata.Assignment assignment = new PersistentTasksCustomMetadata.Assignment("test-node", "");
+        final PersistentTask<SnapshotUpgradeTaskParams> runningTask =
+            new PersistentTask<>("task_id", MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME, params, 1, assignment);
+
+        // No task: the predicate is expected to match and to report completion.
+        SnapshotUpgradePredicate onMissingTask = new SnapshotUpgradePredicate(waitForCompletion, logger);
+        assertThat(onMissingTask.test(null), is(true));
+        assertThat(onMissingTask.isCompleted(), is(true));
+
+        // Assigned task without state: the predicate is expected to keep waiting.
+        SnapshotUpgradePredicate onAssignedTask = new SnapshotUpgradePredicate(waitForCompletion, logger);
+        assertThat(onAssignedTask.test(runningTask), is(false));
+        assertThat(onAssignedTask.isCompleted(), is(false));
+
+        // FAILED task: the predicate is expected to match, request cancellation and expose the failure reason.
+        SnapshotUpgradeTaskState failedState = new SnapshotUpgradeTaskState(SnapshotUpgradeState.FAILED, 1, "this reason");
+        PersistentTask<SnapshotUpgradeTaskParams> failedTask = new PersistentTask<>(runningTask, failedState);
+        SnapshotUpgradePredicate onFailedTask = new SnapshotUpgradePredicate(waitForCompletion, logger);
+        assertThat(onFailedTask.test(failedTask), is(true));
+        assertThat(onFailedTask.isCompleted(), is(false));
+        assertThat(onFailedTask.isShouldCancel(), is(true));
+        assertThat(onFailedTask.getException(), is(notNullValue()));
+        assertThat(onFailedTask.getException().getMessage(),
+            containsString("while waiting for to be assigned to a node; recorded reason [this reason]"));
+    }
+
+    /**
+     * Checks the predicate built with {@code waitForCompletion == false} against three inputs:
+     * no task, an assigned task without state, and an assigned task whose state is FAILED.
+     */
+    public void testWhenWaitForCompletionIsFalse() {
+        final boolean waitForCompletion = false;
+        final SnapshotUpgradeTaskParams params = new SnapshotUpgradeTaskParams("job", "snapshot");
+        final PersistentTasksCustomMetadata.Assignment assignment = new PersistentTasksCustomMetadata.Assignment("test-node", "");
+        final PersistentTask<SnapshotUpgradeTaskParams> runningTask =
+            new PersistentTask<>("task_id", MlTasks.JOB_SNAPSHOT_UPGRADE_TASK_NAME, params, 1, assignment);
+
+        // No task: the predicate is expected to match and to report completion.
+        SnapshotUpgradePredicate onMissingTask = new SnapshotUpgradePredicate(waitForCompletion, logger);
+        assertThat(onMissingTask.test(null), is(true));
+        assertThat(onMissingTask.isCompleted(), is(true));
+
+        // Assigned task without state: the predicate is expected to match without reporting completion.
+        SnapshotUpgradePredicate onAssignedTask = new SnapshotUpgradePredicate(waitForCompletion, logger);
+        assertThat(onAssignedTask.test(runningTask), is(true));
+        assertThat(onAssignedTask.isCompleted(), is(false));
+
+        // FAILED task: the predicate is expected to match, request cancellation and expose the failure reason.
+        SnapshotUpgradeTaskState failedState = new SnapshotUpgradeTaskState(SnapshotUpgradeState.FAILED, 1, "this reason");
+        PersistentTask<SnapshotUpgradeTaskParams> failedTask = new PersistentTask<>(runningTask, failedState);
+        SnapshotUpgradePredicate onFailedTask = new SnapshotUpgradePredicate(waitForCompletion, logger);
+        assertThat(onFailedTask.test(failedTask), is(true));
+        assertThat(onFailedTask.isCompleted(), is(false));
+        assertThat(onFailedTask.isShouldCancel(), is(true));
+        assertThat(onFailedTask.getException(), is(notNullValue()));
+        assertThat(onFailedTask.getException().getMessage(),
+            containsString("while waiting for to be assigned to a node; recorded reason [this reason]"));
+    }
+}

+ 4 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/process/AbstractNativeProcessTests.java

@@ -199,5 +199,9 @@ public class AbstractNativeProcessTests extends ESTestCase {
         @Override
         public void persistState() {
         }
+
+        @Override
+        public void persistState(long snapshotTimestamp, String snapshotId, String snapshotDescription) {
+            // No-op in this test implementation, like the no-argument persistState() above.
+        }
     }
 }

+ 2 - 1
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/task/AbstractJobPersistentTasksExecutorTests.java

@@ -49,7 +49,7 @@ public class AbstractJobPersistentTasksExecutorTests extends ESTestCase {
         csBuilder.metadata(metadata);
 
         ClusterState cs = csBuilder.build();
-        assertEquals(0, verifyIndicesPrimaryShardsAreActive(cs, resolver, ".ml-anomalies-shared",
+        assertEquals(0, verifyIndicesPrimaryShardsAreActive(cs, resolver, true, ".ml-anomalies-shared",
             AnomalyDetectorsIndex.jobStateIndexPattern(),
             MlMetaIndex.indexName(),
             MlConfigIndex.indexName()).size());
@@ -77,6 +77,7 @@ public class AbstractJobPersistentTasksExecutorTests extends ESTestCase {
         csBuilder.routingTable(routingTable.build());
         csBuilder.metadata(metadata);
         List<String> result = verifyIndicesPrimaryShardsAreActive(csBuilder.build(), resolver,
+            true,
             ".ml-anomalies-shared",
             AnomalyDetectorsIndex.jobStateIndexPattern(),
             MlMetaIndex.indexName(),

+ 41 - 0
x-pack/plugin/src/test/resources/rest-api-spec/api/ml.upgrade_job_snapshot.json

@@ -0,0 +1,41 @@
+{
+  "ml.upgrade_job_snapshot":{
+    "documentation":{
+      "url":"https://www.elastic.co/guide/en/elasticsearch/reference/current/ml-upgrade-job-model-snapshot.html",
+      "description":"Upgrades a given job snapshot to the current major version."
+    },
+    "stability":"stable",
+    "url":{
+      "paths":[
+        {
+          "path":"/_ml/anomaly_detectors/{job_id}/model_snapshots/{snapshot_id}/_upgrade",
+          "methods":[
+            "POST"
+          ],
+          "parts":{
+            "job_id":{
+              "type":"string",
+              "description":"The ID of the job"
+            },
+            "snapshot_id":{
+              "type":"string",
+              "description":"The ID of the snapshot"
+            }
+          }
+        }
+      ]
+    },
+    "params":{
+      "timeout":{
+        "type":"timevalue",
+        "required":false,
+        "description":"How long should the API wait for the job to be opened and the old snapshot to be loaded."
+      },
+      "wait_for_completion":{
+        "type":"boolean",
+        "required":false,
+        "description":"Should the request wait until the task is complete before responding to the caller. Default is false."
+      }
+    }
+  }
+}

+ 80 - 0
x-pack/plugin/src/test/resources/rest-api-spec/test/ml/upgrade_job_snapshot.yml

@@ -0,0 +1,80 @@
+setup:
+  - skip:
+      features: headers
+  # Create the anomaly detection job that the snapshot documents below belong to.
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+      ml.put_job:
+        job_id: upgrade-model-snapshot
+        body:  >
+          {
+            "analysis_config" : {
+                "detectors" :[{"function":"metric","field_name":"responsetime","by_field_name":"airline"}]
+            },
+            "data_description" : {
+                "format":"xcontent",
+                "time_field":"time"
+            }
+          }
+
+  # Index model snapshot document "snapshot-1" directly; its min_version is 8.0.0.
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+        Content-Type: application/json
+      index:
+        index:  .ml-anomalies-upgrade-model-snapshot
+        id:     "upgrade-model-snapshot_model_snapshot_snapshot-1"
+        body: >
+          {
+            "job_id" : "upgrade-model-snapshot",
+            "timestamp": "2016-06-02T00:00:00Z",
+            "snapshot_id": "snapshot-1",
+            "snapshot_doc_count": 3,
+            "min_version": "8.0.0",
+            "retain": false
+          }
+
+  # Index model snapshot document "snapshot-2" directly; its min_version is 7.0.0.
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+        Content-Type: application/json
+      index:
+        index:  .ml-anomalies-upgrade-model-snapshot
+        id:     "upgrade-model-snapshot_model_snapshot_snapshot-2"
+        body: >
+          {
+            "job_id": "upgrade-model-snapshot",
+            "timestamp": "2016-06-01T00:00:00Z",
+            "snapshot_id": "snapshot-2",
+            "description": "snapshot 2 description",
+            "snapshot_doc_count": 2,
+            "min_version": "7.0.0"
+          }
+
+  # Refresh so the snapshot documents indexed above are searchable by the tests.
+  - do:
+      headers:
+        Authorization: "Basic eF9wYWNrX3Jlc3RfdXNlcjp4LXBhY2stdGVzdC1wYXNzd29yZA==" # run as x_pack_rest_user, i.e. the test setup superuser
+      indices.refresh:
+        index: [.ml-anomalies-upgrade-model-snapshot]
+
+---
+"Test with unknown job id":
+  # Setup only creates the job "upgrade-model-snapshot", so this job id is expected to be reported as missing.
+  - do:
+      catch: missing
+      ml.upgrade_job_snapshot:
+        job_id: "non-existent-job"
+        snapshot_id: "san"
+---
+"Test with unknown snapshot id":
+  # The job exists, but setup only indexes "snapshot-1" and "snapshot-2", so this snapshot id is expected to be missing.
+  - do:
+      catch: missing
+      ml.upgrade_job_snapshot:
+        job_id: "upgrade-model-snapshot"
+        snapshot_id: "snapshot-9999"
+---
+"Test with already upgraded snapshot":
+  # "snapshot-1" was indexed in setup with min_version 8.0.0; upgrading it is expected to be rejected as a conflict.
+  - do:
+      catch: conflict
+      ml.upgrade_job_snapshot:
+        job_id: "upgrade-model-snapshot"
+        snapshot_id: "snapshot-1"

+ 275 - 0
x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MlJobSnapshotUpgradeIT.java

@@ -0,0 +1,275 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License;
+ * you may not use this file except in compliance with the Elastic License.
+ */
+package org.elasticsearch.upgrades;
+
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.client.MachineLearningClient;
+import org.elasticsearch.client.Request;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.RestClient;
+import org.elasticsearch.client.RestHighLevelClient;
+import org.elasticsearch.client.ml.CloseJobRequest;
+import org.elasticsearch.client.ml.CloseJobResponse;
+import org.elasticsearch.client.ml.FlushJobRequest;
+import org.elasticsearch.client.ml.FlushJobResponse;
+import org.elasticsearch.client.ml.GetJobRequest;
+import org.elasticsearch.client.ml.GetJobResponse;
+import org.elasticsearch.client.ml.GetJobStatsRequest;
+import org.elasticsearch.client.ml.GetModelSnapshotsRequest;
+import org.elasticsearch.client.ml.GetModelSnapshotsResponse;
+import org.elasticsearch.client.ml.OpenJobRequest;
+import org.elasticsearch.client.ml.OpenJobResponse;
+import org.elasticsearch.client.ml.PostDataRequest;
+import org.elasticsearch.client.ml.PostDataResponse;
+import org.elasticsearch.client.ml.PutJobRequest;
+import org.elasticsearch.client.ml.PutJobResponse;
+import org.elasticsearch.client.ml.RevertModelSnapshotRequest;
+import org.elasticsearch.client.ml.UpgradeJobModelSnapshotRequest;
+import org.elasticsearch.client.ml.job.config.AnalysisConfig;
+import org.elasticsearch.client.ml.job.config.DataDescription;
+import org.elasticsearch.client.ml.job.config.Detector;
+import org.elasticsearch.client.ml.job.config.Job;
+import org.elasticsearch.client.ml.job.process.DataCounts;
+import org.elasticsearch.client.ml.job.process.ModelSnapshot;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.bytes.BytesArray;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.xcontent.XContentType;
+import org.elasticsearch.common.xcontent.json.JsonXContent;
+import org.elasticsearch.xpack.test.rest.XPackRestTestConstants;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.function.BiFunction;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+
+public class MlJobSnapshotUpgradeIT extends AbstractUpgradeTestCase {
+
+    private static final String JOB_ID = "ml-snapshots-upgrade-job";
+
+    /**
+     * Minimal {@link RestHighLevelClient} subclass wrapping the test's low-level {@link RestClient}.
+     * It passes {@code RestClient::close} as the close action and an empty list as the third constructor
+     * argument (presumably the additional named XContent entries; confirm against RestHighLevelClient).
+     */
+    private static class HLRC extends RestHighLevelClient {
+        HLRC(RestClient restClient) {
+            super(restClient, RestClient::close, new ArrayList<>());
+        }
+    }
+
+    private MachineLearningClient hlrc;
+
+    /**
+     * @return the union of the ML post-6.6.0 templates and the templates the parent class waits for
+     */
+    @Override
+    protected Collection<String> templatesToWaitFor() {
+        Stream<String> mlTemplates = XPackRestTestConstants.ML_POST_V660_TEMPLATES.stream();
+        Stream<String> inheritedTemplates = super.templatesToWaitFor().stream();
+        return Stream.concat(mlTemplates, inheritedTemplates).collect(Collectors.toSet());
+    }
+
+    /**
+     * Delegates to {@code waitForPendingTasks} on the admin client with a task-name filter that is
+     * {@code true} for every task whose name does not start with {@code xpack/ml/job/snapshot/upgrade}.
+     * NOTE(review): presumably the filter selects tasks to ignore, so only snapshot upgrade tasks are
+     * awaited; confirm against the waitForPendingTasks contract.
+     */
+    protected static void waitForPendingUpgraderTasks() throws Exception {
+        waitForPendingTasks(adminClient(), taskName -> taskName.startsWith("xpack/ml/job/snapshot/upgrade") == false);
+    }
+
+    /**
+     * Drives the snapshot upgrade scenario through the rolling upgrade phases: in the OLD cluster the job and
+     * its model snapshots are created, nothing is done in the MIXED cluster, and in the UPGRADED cluster one
+     * snapshot is upgraded and the pending upgrader tasks are awaited.
+     */
+    public void testSnapshotUpgrader() throws Exception {
+        hlrc = new HLRC(client()).machineLearning();
+        //assumeTrue("Snapshot upgrader should only upgrade from the last major", UPGRADE_FROM_VERSION.major < 7);
+        // Switch the ML plugin logger to trace via a transient cluster setting, in every phase.
+        Request adjustLoggingLevels = new Request("PUT", "/_cluster/settings");
+        adjustLoggingLevels.setJsonEntity(
+            "{\"transient\": {" +
+                "\"logger.org.elasticsearch.xpack.ml\": \"trace\"" +
+                "}}");
+        client().performRequest(adjustLoggingLevels);
+        switch (CLUSTER_TYPE) {
+            case OLD:
+                createJobAndSnapshots();
+                break;
+            case MIXED:
+                // Add mixed cluster test after backported
+                break;
+            case UPGRADED:
+                // Wait for all three nodes and at least yellow health before exercising the upgrader.
+                ensureHealth((request -> {
+                    request.addParameter("timeout", "70s");
+                    request.addParameter("wait_for_nodes", "3");
+                    request.addParameter("wait_for_status", "yellow");
+                }));
+                testSnapshotUpgrade();
+                waitForPendingUpgraderTasks();
+                break;
+            default:
+                throw new UnsupportedOperationException("Unknown cluster type [" + CLUSTER_TYPE + "]");
+        }
+    }
+
+    /**
+     * Upgrades the model snapshot that is not the job's current one, then checks that the upgraded snapshot
+     * keeps its latest record timestamp and that the job can be reverted to it and reopened.
+     */
+    private void testSnapshotUpgrade() throws Exception {
+        Job job = getJob(JOB_ID).jobs().get(0);
+        String currentSnapshot = job.getModelSnapshotId();
+
+        // Both snapshots are expected to carry a major version 7 min_version before the upgrade.
+        GetModelSnapshotsResponse modelSnapshots = getModelSnapshots(job.getId());
+        assertThat(modelSnapshots.snapshots(), hasSize(2));
+        assertThat(modelSnapshots.snapshots().get(0).getMinVersion().major, equalTo((byte)7));
+        assertThat(modelSnapshots.snapshots().get(1).getMinVersion().major, equalTo((byte)7));
+
+        // Pick the first snapshot whose id differs from the job's current snapshot id.
+        ModelSnapshot snapshot = modelSnapshots.snapshots()
+            .stream()
+            .filter(s -> s.getSnapshotId().equals(currentSnapshot) == false)
+            .findFirst()
+            .orElseThrow(() -> new ElasticsearchException("Not found snapshot other than " + currentSnapshot));
+
+        // NOTE(review): the trailing arguments are presumably (timeout = null, waitForCompletion = true) - confirm.
+        assertThat(hlrc.upgradeJobSnapshot(
+            new UpgradeJobModelSnapshotRequest(JOB_ID, snapshot.getSnapshotId(), null, true),
+            RequestOptions.DEFAULT).isCompleted(), is(true));
+
+        // The snapshot must still be retrievable by id and keep the same latest record timestamp.
+        List<ModelSnapshot> snapshots = getModelSnapshots(job.getId(), snapshot.getSnapshotId()).snapshots();
+        assertThat(snapshots, hasSize(1));
+        assertThat(snapshot.getLatestRecordTimeStamp(), equalTo(snapshots.get(0).getLatestRecordTimeStamp()));
+
+        // Does the snapshot still work?
+        // Before reverting, the job's data counts are ahead of the chosen snapshot.
+        assertThat(hlrc.getJobStats(new GetJobStatsRequest(JOB_ID), RequestOptions.DEFAULT)
+                .jobStats()
+                .get(0)
+                .getDataCounts().getLatestRecordTimeStamp(),
+            greaterThan(snapshot.getLatestRecordTimeStamp()));
+        RevertModelSnapshotRequest revertModelSnapshotRequest = new RevertModelSnapshotRequest(JOB_ID, snapshot.getSnapshotId());
+        revertModelSnapshotRequest.setDeleteInterveningResults(true);
+        assertThat(hlrc.revertModelSnapshot(revertModelSnapshotRequest, RequestOptions.DEFAULT).getModel().getSnapshotId(),
+            equalTo(snapshot.getSnapshotId()));
+        // After reverting and reopening, the job's latest record timestamp must equal the snapshot's.
+        assertThat(openJob(JOB_ID).isOpened(), is(true));
+        assertThat(hlrc.getJobStats(new GetJobStatsRequest(JOB_ID), RequestOptions.DEFAULT)
+                .jobStats()
+                .get(0)
+                .getDataCounts().getLatestRecordTimeStamp(),
+            equalTo(snapshot.getLatestRecordTimeStamp()));
+        closeJob(JOB_ID);
+    }
+
+    /**
+     * Creates the job and runs two open / post data / flush / close cycles of ten buckets each, then asserts
+     * that exactly two model snapshots exist and that both report a major version 7 min_version.
+     */
+    private void createJobAndSnapshots() throws Exception {
+        TimeValue bucketSpan = TimeValue.timeValueHours(1);
+        long startTime = 1491004800000L;
+
+        PutJobResponse jobResponse = buildAndPutJob(JOB_ID, bucketSpan);
+        Job job = jobResponse.getResponse();
+        openJob(job.getId());
+        // First cycle: ten buckets for series "foo", with value 100.0 in bucket 5 and 10.0 elsewhere.
+        DataCounts dataCounts = postData(job.getId(),
+            generateData(startTime,
+                bucketSpan,
+                10,
+                Arrays.asList("foo"),
+                (bucketIndex, series) -> bucketIndex == 5 ? 100.0 : 10.0).stream().collect(Collectors.joining()))
+            .getDataCounts();
+        assertThat(dataCounts.getInvalidDateCount(), equalTo(0L));
+        assertThat(dataCounts.getBucketCount(), greaterThan(0L));
+        final long lastCount = dataCounts.getBucketCount();
+        flushJob(job.getId());
+        closeJob(job.getId());
+
+        // We need to wait a second to ensure the second time around model snapshot will have a different ID (it depends on epoch seconds)
+        // NOTE(review): the condition never becomes true, so this presumably just blocks for the 2 second timeout.
+        waitUntil(() -> false, 2, TimeUnit.SECONDS);
+
+        openJob(job.getId());
+        // Second cycle: ten more buckets starting right after the first ten, all with value 10.0.
+        dataCounts = postData(job.getId(),
+            generateData(
+                startTime + 10 * bucketSpan.getMillis(),
+                bucketSpan,
+                10,
+                Arrays.asList("foo"),
+                (bucketIndex, series) -> 10.0).stream().collect(Collectors.joining()))
+            .getDataCounts();
+        assertThat(dataCounts.getInvalidDateCount(), equalTo(0L));
+        assertThat(dataCounts.getBucketCount(), greaterThan(lastCount));
+        flushJob(job.getId());
+        closeJob(job.getId());
+
+        GetModelSnapshotsResponse modelSnapshots = getModelSnapshots(job.getId());
+        assertThat(modelSnapshots.snapshots(), hasSize(2));
+        assertThat(modelSnapshots.snapshots().get(0).getMinVersion().major, equalTo((byte)7));
+        assertThat(modelSnapshots.snapshots().get(1).getMinVersion().major, equalTo((byte)7));
+    }
+
+    /**
+     * Builds a job with a single {@code mean(value)} detector partitioned by {@code series}, the given
+     * bucket span and a default data description, and submits it via {@link #putJob(Job)}.
+     *
+     * @param jobId      id of the job to create
+     * @param bucketSpan bucket span set on the analysis config
+     * @return the put-job response
+     */
+    private PutJobResponse buildAndPutJob(String jobId, TimeValue bucketSpan) throws Exception {
+        Detector.Builder meanDetector = new Detector.Builder("mean", "value");
+        meanDetector.setPartitionFieldName("series");
+
+        AnalysisConfig.Builder analysis = new AnalysisConfig.Builder(Arrays.asList(meanDetector.build()));
+        analysis.setBucketSpan(bucketSpan);
+
+        Job.Builder jobBuilder = new Job.Builder(jobId);
+        jobBuilder.setAnalysisConfig(analysis);
+        jobBuilder.setDataDescription(new DataDescription.Builder());
+        return putJob(jobBuilder.build());
+    }
+
+    private static List<String> generateData(long timestamp, TimeValue bucketSpan, int bucketCount, List<String> series,
+                                             BiFunction<Integer, String, Double> timeAndSeriesToValueFunction) throws IOException {
+        List<String> data = new ArrayList<>();
+        long now = timestamp;
+        for (int i = 0; i < bucketCount; i++) {
+            for (String field : series) {
+                Map<String, Object> record = new HashMap<>();
+                record.put("time", now);
+                record.put("value", timeAndSeriesToValueFunction.apply(i, field));
+                record.put("series", field);
+                data.add(createJsonRecord(record));
+
+                record = new HashMap<>();
+                record.put("time", now + bucketSpan.getMillis() / 2);
+                record.put("value", timeAndSeriesToValueFunction.apply(i, field));
+                record.put("series", field);
+                data.add(createJsonRecord(record));
+            }
+            now += bucketSpan.getMillis();
+        }
+        return data;
+    }
+
+    /** Fetches the job with the given id through the high-level client using default request options. */
+    protected GetJobResponse getJob(String jobId) throws IOException {
+        GetJobRequest request = new GetJobRequest(jobId);
+        return hlrc.getJob(request, RequestOptions.DEFAULT);
+    }
+
+    /** Creates the given job through the high-level client using default request options. */
+    protected PutJobResponse putJob(Job job) throws IOException {
+        PutJobRequest request = new PutJobRequest(job);
+        return hlrc.putJob(request, RequestOptions.DEFAULT);
+    }
+
+    /** Opens the job with the given id through the high-level client using default request options. */
+    protected OpenJobResponse openJob(String jobId) throws IOException {
+        OpenJobRequest request = new OpenJobRequest(jobId);
+        return hlrc.openJob(request, RequestOptions.DEFAULT);
+    }
+
+    /** Posts the given string as JSON content to the job through the high-level client. */
+    protected PostDataResponse postData(String jobId, String data) throws IOException {
+        BytesArray payload = new BytesArray(data);
+        PostDataRequest request = new PostDataRequest(jobId, XContentType.JSON, payload);
+        return hlrc.postData(request, RequestOptions.DEFAULT);
+    }
+
+    /** Flushes the job with the given id through the high-level client using default request options. */
+    protected FlushJobResponse flushJob(String jobId) throws IOException {
+        FlushJobRequest request = new FlushJobRequest(jobId);
+        return hlrc.flushJob(request, RequestOptions.DEFAULT);
+    }
+
+    /** Closes the job with the given id through the high-level client using default request options. */
+    protected CloseJobResponse closeJob(String jobId) throws IOException {
+        CloseJobRequest request = new CloseJobRequest(jobId);
+        return hlrc.closeJob(request, RequestOptions.DEFAULT);
+    }
+
+    /** Fetches the job's model snapshots without a snapshot id, by delegating with a {@code null} snapshot id. */
+    protected GetModelSnapshotsResponse getModelSnapshots(String jobId) throws IOException {
+        return getModelSnapshots(jobId, null);
+    }
+
+    /**
+     * Fetches model snapshots of the given job through the high-level client.
+     *
+     * @param jobId      id of the job
+     * @param snapshotId snapshot id set on the request; the single-argument overload passes {@code null}
+     */
+    protected GetModelSnapshotsResponse getModelSnapshots(String jobId, String snapshotId) throws IOException {
+        GetModelSnapshotsRequest request = new GetModelSnapshotsRequest(jobId);
+        request.setSnapshotId(snapshotId);
+        return hlrc.getModelSnapshots(request, RequestOptions.DEFAULT);
+    }
+
+    /** Serializes the map as a JSON object and appends a trailing newline. */
+    protected static String createJsonRecord(Map<String, Object> keyValueMap) throws IOException {
+        String json = Strings.toString(JsonXContent.contentBuilder().map(keyValueMap));
+        return json + "\n";
+    }
+
+}