
[ML] handle timeouts in native inference correctly (#78757)

Native inference was not timing out on writes to the process; consequently, if the named pipe queue was full, the write action would simply block.

This commit does the following:
 
- Spawns a scheduled future for timing out the native inference, so the action can time out both while writing to the process and while waiting for the result (a framework-free sketch of the pattern follows this list)
- Removes the task-level timeout, so we rely on either the node communication timing out OR our native action timeout
- Bubbles up 429 on timeout to the inference processor
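
A framework-free sketch of the timeout pattern, using plain java.util.concurrent rather than the Elasticsearch ThreadPool/ActionListener types in the actual change (all names below are hypothetical):

    import java.util.concurrent.ScheduledExecutorService;
    import java.util.concurrent.ScheduledFuture;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicBoolean;
    import java.util.function.Consumer;

    // Schedule the timeout before doing any potentially blocking work, and use
    // an AtomicBoolean so whichever of {result, failure, timeout} happens first
    // is the only outcome the caller ever sees.
    final class TimedInferenceSketch {
        private final AtomicBoolean notified = new AtomicBoolean();

        void infer(ScheduledExecutorService scheduler, long timeoutMillis,
                   Consumer<String> onResult, Consumer<Exception> onFailure) {
            ScheduledFuture<?> timeoutHandle = scheduler.schedule(() -> {
                if (notified.compareAndSet(false, true)) {
                    onFailure.accept(new Exception("timeout waiting for inference result"));
                }
            }, timeoutMillis, TimeUnit.MILLISECONDS);

            try {
                // Stand-in for writing to the named pipe and waiting for the
                // native process; this is the call that used to block forever.
                String result = writeRequestAndAwaitResult();
                timeoutHandle.cancel(false);
                if (notified.compareAndSet(false, true)) {
                    onResult.accept(result);
                }
            } catch (Exception e) {
                timeoutHandle.cancel(false);
                if (notified.compareAndSet(false, true)) {
                    onFailure.accept(e);
                }
            }
        }

        private String writeRequestAndAwaitResult() {
            return "ok"; // placeholder for the real pipe round trip
        }
    }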
Benjamin Trent 4 years ago
Parent
Commit
7c45032530

+ 34 - 21
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/InferTrainedModelDeploymentAction.java

@@ -14,13 +14,14 @@ import org.elasticsearch.action.support.tasks.BaseTasksResponse;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.core.Nullable;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.tasks.Task;
 import org.elasticsearch.xcontent.ObjectParser;
 import org.elasticsearch.xcontent.ParseField;
 import org.elasticsearch.xcontent.ToXContentObject;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xcontent.XContentParser;
-import org.elasticsearch.core.TimeValue;
-import org.elasticsearch.tasks.Task;
 import org.elasticsearch.xpack.core.ml.inference.results.InferenceResults;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.EmptyConfigUpdate;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.InferenceConfigUpdate;
@@ -46,6 +47,13 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
         super(NAME, InferTrainedModelDeploymentAction.Response::new);
     }
 
+    /**
+     * Request for inference against the deployment.
+     *
+     * The task gets routed to a node that indicates its local model allocation is started
+     *
+     * For indicating timeout, the caller should call `setInferenceTimeout` and not the base class `setTimeout` method
+     */
     public static class Request extends BaseTasksRequest<Request> {
 
         public static final ParseField DEPLOYMENT_ID = new ParseField("deployment_id");
@@ -59,7 +67,7 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
         static {
             PARSER.declareString(Request.Builder::setDeploymentId, DEPLOYMENT_ID);
             PARSER.declareObjectArray(Request.Builder::setDocs, (p, c) -> p.mapOrdered(), DOCS);
-            PARSER.declareString(Request.Builder::setTimeout, TIMEOUT);
+            PARSER.declareString(Request.Builder::setInferenceTimeout, TIMEOUT);
             PARSER.declareNamedObject(
                 Request.Builder::setUpdate,
                 ((p, c, name) -> p.namedObject(InferenceConfigUpdate.class, name, c)),
@@ -67,22 +75,24 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
             );
         }
 
-        public static Request parseRequest(String deploymentId, XContentParser parser) {
+        public static Request.Builder parseRequest(String deploymentId, XContentParser parser) {
             Request.Builder builder = PARSER.apply(parser, null);
             if (deploymentId != null) {
                 builder.setDeploymentId(deploymentId);
             }
-            return builder.build();
+            return builder;
         }
 
         private final String deploymentId;
         private final List<Map<String, Object>> docs;
         private final InferenceConfigUpdate update;
+        private final TimeValue inferenceTimeout;
 
-        public Request(String deploymentId, InferenceConfigUpdate update, List<Map<String, Object>> docs) {
+        public Request(String deploymentId, InferenceConfigUpdate update, List<Map<String, Object>> docs, TimeValue inferenceTimeout) {
             this.deploymentId = ExceptionsHelper.requireNonNull(deploymentId, DEPLOYMENT_ID);
             this.docs = ExceptionsHelper.requireNonNull(Collections.unmodifiableList(docs), DOCS);
             this.update = update;
+            this.inferenceTimeout = inferenceTimeout;
         }
 
         public Request(StreamInput in) throws IOException {
@@ -90,6 +100,7 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
             deploymentId = in.readString();
             docs = Collections.unmodifiableList(in.readList(StreamInput::readMap));
             update = in.readOptionalNamedWriteable(InferenceConfigUpdate.class);
+            inferenceTimeout = in.readOptionalTimeValue();
         }
 
         public String getDeploymentId() {
@@ -104,13 +115,18 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
             return Optional.ofNullable(update).orElse(new EmptyConfigUpdate());
         }
 
+        public TimeValue getInferenceTimeout() {
+            return inferenceTimeout == null ? DEFAULT_TIMEOUT : inferenceTimeout;
+        }
+
+        /**
+         * This is always null as we want the inference call to handle the timeout, not the tasks framework
+         * @return null
+         */
         @Override
+        @Nullable
         public TimeValue getTimeout() {
-            TimeValue tv = super.getTimeout();
-            if (tv == null) {
-                return DEFAULT_TIMEOUT;
-            }
-            return tv;
+            return null;
         }
 
         @Override
@@ -139,6 +155,7 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
             out.writeString(deploymentId);
             out.writeCollection(docs, StreamOutput::writeMap);
             out.writeOptionalNamedWriteable(update);
+            out.writeOptionalTimeValue(inferenceTimeout);
         }
 
         @Override
@@ -154,12 +171,12 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
             return Objects.equals(deploymentId, that.deploymentId)
                 && Objects.equals(docs, that.docs)
                 && Objects.equals(update, that.update)
-                && Objects.equals(getTimeout(), that.getTimeout());
+                && Objects.equals(inferenceTimeout, that.inferenceTimeout);
         }
 
         @Override
         public int hashCode() {
-            return Objects.hash(deploymentId, update, docs, getTimeout());
+            return Objects.hash(deploymentId, update, docs, inferenceTimeout);
         }
 
         public static class Builder {
@@ -181,7 +198,7 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
                 return this;
             }
 
-            public Builder setTimeout(TimeValue timeout) {
+            public Builder setInferenceTimeout(TimeValue timeout) {
                 this.timeout = timeout;
                 return this;
             }
@@ -191,16 +208,12 @@ public class InferTrainedModelDeploymentAction extends ActionType<InferTrainedMo
                 return this;
             }
 
-            private Builder setTimeout(String timeout) {
-                return setTimeout(TimeValue.parseTimeValue(timeout, TIMEOUT.getPreferredName()));
+            private Builder setInferenceTimeout(String timeout) {
+                return setInferenceTimeout(TimeValue.parseTimeValue(timeout, TIMEOUT.getPreferredName()));
             }
 
             public Request build() {
-                Request request = new Request(deploymentId, update, docs);
-                if (timeout != null) {
-                    request.setTimeout(timeout);
-                }
-                return request;
+                return new Request(deploymentId, update, docs, timeout);
             }
         }
     }
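
With this change the inference timeout travels in the request body instead of the tasks framework, and getTimeout() deliberately returns null so the tasks framework never applies its own timeout. A caller-side sketch (Request and Builder come from the diff above; the deployment id and docs payload are illustrative):

    InferTrainedModelDeploymentAction.Request.Builder builder =
        new InferTrainedModelDeploymentAction.Request.Builder();
    builder.setDeploymentId("my-deployment");
    builder.setDocs(List.of(Map.of("input", "some text")));
    builder.setInferenceTimeout(TimeValue.timeValueSeconds(10));
    InferTrainedModelDeploymentAction.Request request = builder.build();

    assert request.getTimeout() == null;  // tasks framework timeout is disabled
    assert request.getInferenceTimeout().equals(TimeValue.timeValueSeconds(10));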

+ 9 - 7
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/InferTrainedModelDeploymentRequestsTests.java

@@ -9,6 +9,7 @@ package org.elasticsearch.xpack.core.ml.action;
 
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
 import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.core.Tuple;
 import org.elasticsearch.test.AbstractWireSerializingTestCase;
 import org.elasticsearch.xpack.core.ml.inference.MlInferenceNamedXContentProvider;
@@ -37,15 +38,12 @@ public class InferTrainedModelDeploymentRequestsTests extends AbstractWireSerial
         List<Map<String, Object>> docs = randomList(5, () -> randomMap(1, 3,
             () -> Tuple.tuple(randomAlphaOfLength(7), randomAlphaOfLength(7))));
 
-        InferTrainedModelDeploymentAction.Request request = new InferTrainedModelDeploymentAction.Request(
+        return new InferTrainedModelDeploymentAction.Request(
             randomAlphaOfLength(4),
             randomBoolean() ? null : randomInferenceConfigUpdate(),
-            docs
+            docs,
+            randomBoolean() ? null : TimeValue.parseTimeValue(randomTimeValue(), "timeout")
         );
-        if (randomBoolean()) {
-            request.setTimeout(randomTimeValue());
-        }
-        return request;
     }
 
     @Override
@@ -56,6 +54,10 @@ public class InferTrainedModelDeploymentRequestsTests extends AbstractWireSerial
     }
 
     public void testTimeoutNotNull() {
-        assertNotNull(createTestInstance().getTimeout());
+        assertNotNull(createTestInstance().getInferenceTimeout());
+    }
+
+    public void testTimeoutNull() {
+        assertNull(createTestInstance().getTimeout());
     }
 }

+ 20 - 0
x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/PyTorchModelIT.java

@@ -15,6 +15,7 @@ import org.elasticsearch.common.CheckedBiConsumer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
+import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.test.SecuritySettingsSourceField;
 import org.elasticsearch.test.rest.ESRestTestCase;
 import org.elasticsearch.xpack.core.ml.inference.allocation.AllocationStatus;
@@ -185,6 +186,17 @@ public class PyTorchModelIT extends ESRestTestCase {
         stopDeployment(modelId);
     }
 
+    public void testEvaluateWithMinimalTimeout() throws IOException {
+        String modelId = "test_evaluate_timeout";
+        createTrainedModel(modelId);
+        putModelDefinition(modelId);
+        putVocabulary(List.of("these", "are", "my", "words"), modelId);
+        startDeployment(modelId);
+        ResponseException ex = expectThrows(ResponseException.class, () -> infer("my words", modelId, TimeValue.ZERO));
+        assertThat(ex.getResponse().getStatusLine().getStatusCode(), equalTo(429));
+        stopDeployment(modelId);
+    }
+
     public void testDeleteFailureDueToDeployment() throws IOException {
         String modelId = "test_deployed_model_delete";
         createTrainedModel(modelId);
@@ -449,6 +461,14 @@ public class PyTorchModelIT extends ESRestTestCase {
         return client().performRequest(request);
     }
 
+    private Response infer(String input, String modelId, TimeValue timeout) throws IOException {
+        Request request = new Request("POST", "/_ml/trained_models/" + modelId + "/deployment/_infer?timeout=" + timeout.toString());
+        request.setJsonEntity("{  " +
+            "\"docs\": [{\"input\":\"" + input + "\"}]\n" +
+            "}");
+        return client().performRequest(request);
+    }
+
     private Response infer(String input, String modelId) throws IOException {
         Request request = new Request("POST", "/_ml/trained_models/" + modelId + "/deployment/_infer");
         request.setJsonEntity("{  " +
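
The new integration test above exercises this end to end; the same call with the low-level REST client looks roughly like the following (client setup, model id, and the 1ms value are illustrative):

    // A deliberately tiny timeout should surface as HTTP 429 once the native
    // inference times out instead of blocking on the named pipe write.
    Request request = new Request("POST",
        "/_ml/trained_models/test_model/deployment/_infer?timeout=1ms");
    request.setJsonEntity("{\"docs\": [{\"input\": \"my words\"}]}");
    ResponseException ex = expectThrows(ResponseException.class, () -> client().performRequest(request));
    assertThat(ex.getResponse().getStatusLine().getStatusCode(), equalTo(429));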

+ 3 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportInferTrainedModelDeploymentAction.java

@@ -83,10 +83,11 @@ public class TransportInferTrainedModelDeploymentAction extends TransportTasksAc
         task.infer(
             request.getDocs().get(0),
             request.getUpdate(),
-            request.getTimeout(),
+            request.getInferenceTimeout(),
             ActionListener.wrap(
                 pyTorchResult -> listener.onResponse(new InferTrainedModelDeploymentAction.Response(pyTorchResult)),
-                listener::onFailure)
+                listener::onFailure
+            )
         );
     }
 }

+ 17 - 2
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportInternalInferModelAction.java

@@ -6,6 +6,8 @@
  */
 package org.elasticsearch.xpack.ml.action;
 
+import org.elasticsearch.ElasticsearchStatusException;
+import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.support.ActionFilters;
 import org.elasticsearch.action.support.HandledTransportAction;
@@ -14,6 +16,7 @@ import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.license.LicenseUtils;
 import org.elasticsearch.license.XPackLicenseState;
+import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.TransportService;
@@ -158,10 +161,22 @@ public class TransportInternalInferModelAction extends HandledTransportAction<Re
         executeAsyncWithOrigin(client,
             ML_ORIGIN,
             InferTrainedModelDeploymentAction.INSTANCE,
-            new InferTrainedModelDeploymentAction.Request(modelId, inferenceConfigUpdate, Collections.singletonList(doc)),
+            new InferTrainedModelDeploymentAction.Request(modelId, inferenceConfigUpdate, Collections.singletonList(doc), null),
             ActionListener.wrap(
                 r -> listener.onResponse(r.getResults()),
-                e -> listener.onResponse(new WarningInferenceResults(e.getMessage()))
+                e -> {
+                    Throwable unwrapped = ExceptionsHelper.unwrapCause(e);
+                    if (unwrapped instanceof ElasticsearchStatusException) {
+                        ElasticsearchStatusException ex = (ElasticsearchStatusException) unwrapped;
+                        if (ex.status().equals(RestStatus.TOO_MANY_REQUESTS)) {
+                            listener.onFailure(ex);
+                        } else {
+                            listener.onResponse(new WarningInferenceResults(ex.getMessage()));
+                        }
+                    } else {
+                        listener.onResponse(new WarningInferenceResults(e.getMessage()));
+                    }
+                }
             )
         );
     }
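
The failure handler above encodes the routing rule from the commit message: a 429 (the inference timed out, i.e. the process queue is saturated) is bubbled up as a real failure so callers can back off, while any other error keeps the previous behavior of degrading to a WarningInferenceResults. Condensed, the rule is (classes as in the diff; the listener is assumed):

    Throwable unwrapped = ExceptionsHelper.unwrapCause(e);
    boolean timedOut = unwrapped instanceof ElasticsearchStatusException
        && ((ElasticsearchStatusException) unwrapped).status() == RestStatus.TOO_MANY_REQUESTS;
    if (timedOut) {
        listener.onFailure((ElasticsearchStatusException) unwrapped);
    } else {
        listener.onResponse(new WarningInferenceResults(e.getMessage()));
    }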

+ 140 - 70
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/deployment/DeploymentManager.java

@@ -27,6 +27,7 @@ import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.index.query.IdsQueryBuilder;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.search.SearchHit;
+import org.elasticsearch.threadpool.Scheduler;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.xpack.core.ml.action.GetTrainedModelsAction;
 import org.elasticsearch.xpack.core.ml.inference.TrainedModelConfig;
@@ -58,6 +59,7 @@ import java.util.Optional;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.ExecutorService;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Consumer;
 
@@ -74,6 +76,7 @@ public class DeploymentManager {
     private final PyTorchProcessFactory pyTorchProcessFactory;
     private final ExecutorService executorServiceForDeployment;
     private final ExecutorService executorServiceForProcess;
+    private final ThreadPool threadPool;
     private final ConcurrentMap<Long, ProcessContext> processContextByAllocation = new ConcurrentHashMap<>();
 
     public DeploymentManager(Client client, NamedXContentRegistry xContentRegistry,
@@ -81,6 +84,7 @@ public class DeploymentManager {
         this.client = Objects.requireNonNull(client);
         this.xContentRegistry = Objects.requireNonNull(xContentRegistry);
         this.pyTorchProcessFactory = Objects.requireNonNull(pyTorchProcessFactory);
+        this.threadPool = Objects.requireNonNull(threadPool);
         this.executorServiceForDeployment = threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME);
         this.executorServiceForProcess = threadPool.executor(MachineLearning.JOB_COMMS_THREAD_POOL_NAME);
     }
@@ -92,8 +96,8 @@ public class DeploymentManager {
     public Optional<ModelStats> getStats(TrainedModelDeploymentTask task) {
         return Optional.ofNullable(processContextByAllocation.get(task.getId()))
             .map(processContext ->
-                new ModelStats(processContext.resultProcessor.getTimingStats(),
-                    processContext.resultProcessor.getLastUsed())
+                new ModelStats(processContext.getResultProcessor().getTimingStats(),
+                    processContext.getResultProcessor().getLastUsed())
             );
     }
 
@@ -117,7 +121,7 @@ public class DeploymentManager {
 
         ActionListener<Boolean> modelLoadedListener = ActionListener.wrap(
             success -> {
-                executorServiceForProcess.execute(() -> processContext.resultProcessor.process(processContext.process.get()));
+                executorServiceForProcess.execute(() -> processContext.getResultProcessor().process(processContext.process.get()));
                 listener.onResponse(task);
             },
             listener::onFailure
@@ -226,83 +230,145 @@ public class DeploymentManager {
             return;
         }
 
-        final String requestId = String.valueOf(requestIdCounter.getAndIncrement());
+        final long requestId = requestIdCounter.getAndIncrement();
+        executorServiceForProcess.execute(new InferenceAction(requestId, timeout, processContext, config, doc, threadPool, listener));
+    }
 
-        executorServiceForProcess.execute(new AbstractRunnable() {
-            @Override
-            public void onFailure(Exception e) {
-                listener.onFailure(e);
-            }
+    static class InferenceAction extends AbstractRunnable {
+        private final long requestId;
+        private final TimeValue timeout;
+        private final Scheduler.Cancellable timeoutHandler;
+        private final ProcessContext processContext;
+        private final InferenceConfig config;
+        private final Map<String, Object> doc;
+        private final ActionListener<InferenceResults> listener;
+        private final AtomicBoolean notified = new AtomicBoolean();
+
+        InferenceAction(
+            long requestId,
+            TimeValue timeout,
+            ProcessContext processContext,
+            InferenceConfig config,
+            Map<String, Object> doc,
+            ThreadPool threadPool,
+            ActionListener<InferenceResults> listener
+        ) {
+            this.requestId = requestId;
+            this.timeout = timeout;
+            this.processContext = processContext;
+            this.config = config;
+            this.doc = doc;
+            this.listener = listener;
+            this.timeoutHandler = threadPool.schedule(
+                this::onTimeout,
+                ExceptionsHelper.requireNonNull(timeout, "timeout"),
+                MachineLearning.UTILITY_THREAD_POOL_NAME
+            );
+        }
 
-            @Override
-            protected void doRun() {
-                try {
-                    // The request builder expect a list of inputs which are then batched.
-                    // TODO batching was implemented for expected use-cases such as zero-shot
-                    // classification but is not used here.
-                    List<String> text = Collections.singletonList(NlpTask.extractInput(processContext.modelInput.get(), doc));
-                    NlpTask.Processor processor = processContext.nlpTaskProcessor.get();
-                    processor.validateInputs(text);
-                    assert config instanceof NlpConfig;
-                    NlpTask.Request request = processor.getRequestBuilder((NlpConfig) config).buildRequest(text, requestId);
-                    logger.trace(() -> "Inference Request "+ request.processInput.utf8ToString());
-                    PyTorchResultProcessor.PendingResult pendingResult = processContext.resultProcessor.registerRequest(requestId);
-                    processContext.process.get().writeInferenceRequest(request.processInput);
-                    waitForResult(
-                        processContext,
-                        pendingResult,
-                        request.tokenization,
-                        requestId,
-                        timeout,
-                        processor.getResultProcessor((NlpConfig) config),
-                        listener
-                    );
-                } catch (IOException e) {
-                    logger.error(new ParameterizedMessage("[{}] error writing to process", processContext.task.getModelId()), e);
-                    onFailure(ExceptionsHelper.serverError("error writing to process", e));
-                } catch (Exception e) {
-                    onFailure(e);
-                } finally {
-                    processContext.resultProcessor.requestAccepted(requestId);
-                }
+        void onTimeout() {
+            if (notified.compareAndSet(false, true)) {
+                processContext.getResultProcessor().requestIgnored(String.valueOf(requestId));
+                listener.onFailure(
+                    new ElasticsearchStatusException("timeout [{}] waiting for inference result", RestStatus.TOO_MANY_REQUESTS, timeout)
+                );
+                return;
             }
-        });
-    }
+            logger.debug("request [{}] received timeout after [{}] but listener already alerted", requestId, timeout);
+        }
 
-    private void waitForResult(ProcessContext processContext,
-                               PyTorchResultProcessor.PendingResult pendingResult,
-                               TokenizationResult tokenization,
-                               String requestId,
-                               TimeValue timeout,
-                               NlpTask.ResultProcessor inferenceResultsProcessor,
-                               ActionListener<InferenceResults> listener) {
-        try {
-            PyTorchResult pyTorchResult = processContext.resultProcessor.waitForResult(
-                processContext.process.get(),
-                requestId,
-                pendingResult,
-                timeout
-            );
-            if (pyTorchResult == null) {
-                listener.onFailure(new ElasticsearchStatusException("timeout [{}] waiting for inference result",
-                    RestStatus.TOO_MANY_REQUESTS, timeout));
+        void onSuccess(InferenceResults inferenceResults) {
+            timeoutHandler.cancel();
+            if (notified.compareAndSet(false, true)) {
+                listener.onResponse(inferenceResults);
                 return;
             }
+            logger.debug("request [{}] received inference response but listener already notified", requestId);
+        }
 
-            if (pyTorchResult.isError()) {
-                listener.onFailure(new ElasticsearchStatusException(pyTorchResult.getError(),
-                    RestStatus.INTERNAL_SERVER_ERROR));
+        @Override
+        public void onFailure(Exception e) {
+            timeoutHandler.cancel();
+            if (notified.compareAndSet(false, true)) {
+                listener.onFailure(e);
                 return;
             }
+            logger.debug(
+                () -> new ParameterizedMessage("request [{}] received failure but listener already notified", requestId),
+                e
+            );
+        }
 
-            logger.debug(() -> new ParameterizedMessage(
-                "[{}] retrieved result for request [{}]", processContext.task.getModelId(), requestId));
-            InferenceResults results = inferenceResultsProcessor.processResult(tokenization, pyTorchResult);
-            logger.debug(() -> new ParameterizedMessage(
-                "[{}] processed result for request [{}]", processContext.task.getModelId(), requestId));
-            listener.onResponse(results);
-        } catch (InterruptedException e) {
-            listener.onFailure(e);
+        @Override
+        protected void doRun() throws Exception {
+            final String requestIdStr = String.valueOf(requestId);
+            try {
+                // The request builder expect a list of inputs which are then batched.
+                // TODO batching was implemented for expected use-cases such as zero-shot
+                // classification but is not used here.
+                List<String> text = Collections.singletonList(NlpTask.extractInput(processContext.modelInput.get(), doc));
+                NlpTask.Processor processor = processContext.nlpTaskProcessor.get();
+                processor.validateInputs(text);
+                assert config instanceof NlpConfig;
+                NlpTask.Request request = processor.getRequestBuilder((NlpConfig) config).buildRequest(text, requestIdStr);
+                logger.trace(() -> "Inference Request "+ request.processInput.utf8ToString());
+                PyTorchResultProcessor.PendingResult pendingResult = processContext.getResultProcessor().registerRequest(requestIdStr);
+                processContext.process.get().writeInferenceRequest(request.processInput);
+                waitForResult(
+                    processContext,
+                    pendingResult,
+                    request.tokenization,
+                    requestIdStr,
+                    timeout,
+                    processor.getResultProcessor((NlpConfig) config),
+                    ActionListener.wrap(this::onSuccess,this::onFailure)
+                );
+            } catch (IOException e) {
+                logger.error(new ParameterizedMessage("[{}] error writing to process", processContext.task.getModelId()), e);
+                onFailure(ExceptionsHelper.serverError("error writing to process", e));
+            } catch (Exception e) {
+                onFailure(e);
+            } finally {
+                processContext.getResultProcessor().requestIgnored(String.valueOf(requestId));
+            }
+        }
+
+        private void waitForResult(ProcessContext processContext,
+                                   PyTorchResultProcessor.PendingResult pendingResult,
+                                   TokenizationResult tokenization,
+                                   String requestId,
+                                   TimeValue timeout,
+                                   NlpTask.ResultProcessor inferenceResultsProcessor,
+                                   ActionListener<InferenceResults> listener) {
+            try {
+                PyTorchResult pyTorchResult = processContext.getResultProcessor().waitForResult(
+                    processContext.process.get(),
+                    requestId,
+                    pendingResult,
+                    timeout
+                );
+                if (pyTorchResult == null) {
+                    listener.onFailure(
+                        new ElasticsearchStatusException("timeout [{}] waiting for inference result", RestStatus.TOO_MANY_REQUESTS, timeout)
+                    );
+                    return;
+                }
+
+                if (pyTorchResult.isError()) {
+                    listener.onFailure(new ElasticsearchStatusException(pyTorchResult.getError(),
+                        RestStatus.INTERNAL_SERVER_ERROR));
+                    return;
+                }
+
+                logger.debug(() -> new ParameterizedMessage(
+                    "[{}] retrieved result for request [{}]", processContext.task.getModelId(), requestId));
+                InferenceResults results = inferenceResultsProcessor.processResult(tokenization, pyTorchResult);
+                logger.debug(() -> new ParameterizedMessage(
+                    "[{}] processed result for request [{}]", processContext.task.getModelId(), requestId));
+                listener.onResponse(results);
+            } catch (InterruptedException e) {
+                listener.onFailure(e);
+            }
         }
     }
 
@@ -321,6 +387,10 @@ public class DeploymentManager {
             this.stateStreamer = new PyTorchStateStreamer(client, executorService, xContentRegistry);
         }
 
+        PyTorchResultProcessor getResultProcessor() {
+            return resultProcessor;
+        }
+
         synchronized void startProcess() {
             process.set(pyTorchProcessFactory.createProcess(task, executorServiceForProcess, onProcessCrash()));
         }

+ 12 - 4
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/pytorch/process/PyTorchResultProcessor.java

@@ -44,8 +44,16 @@ public class PyTorchResultProcessor {
         return pendingResults.computeIfAbsent(requestId, k -> new PendingResult());
     }
 
-    public void requestAccepted(String requestId) {
-        pendingResults.remove(requestId);
+    /**
+     * Call this method when the caller is no longer waiting on the request response.
+     *
+     * @param requestId The request ID that is no longer being waited on
+     */
+    public void requestIgnored(String requestId) {
+        PendingResult pendingResult = pendingResults.remove(requestId);
+        if (pendingResult != null) {
+            pendingResult.latch.countDown();
+        }
     }
 
     public void process(NativePyTorchProcess process) {
@@ -55,9 +63,9 @@ public class PyTorchResultProcessor {
                 PyTorchResult result = iterator.next();
                 logger.trace(() -> new ParameterizedMessage("[{}] Parsed result with id [{}]", deploymentId, result.getRequestId()));
                 processResult(result);
-                PendingResult pendingResult = pendingResults.get(result.getRequestId());
+                PendingResult pendingResult = pendingResults.remove(result.getRequestId());
                 if (pendingResult == null) {
-                    logger.warn(() -> new ParameterizedMessage("[{}] no pending result for [{}]", deploymentId, result.getRequestId()));
+                    logger.debug(() -> new ParameterizedMessage("[{}] no pending result for [{}]", deploymentId, result.getRequestId()));
                 } else {
                     pendingResult.result.set(result);
                     pendingResult.latch.countDown();
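
Note the interplay with requestIgnored above: removing a pending result must also count down its latch, otherwise a thread already blocked waiting on that result would sleep out its full timeout. A self-contained sketch of the rendezvous (plain java.util.concurrent, all names hypothetical):

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;
    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicReference;

    final class PendingResults {
        static final class PendingResult {
            final CountDownLatch latch = new CountDownLatch(1);
            final AtomicReference<String> result = new AtomicReference<>();
        }

        private final ConcurrentMap<String, PendingResult> pending = new ConcurrentHashMap<>();

        PendingResult register(String requestId) {
            return pending.computeIfAbsent(requestId, k -> new PendingResult());
        }

        // Result-reader thread: publish the result and release the waiter.
        void deliver(String requestId, String result) {
            PendingResult p = pending.remove(requestId);
            if (p != null) {
                p.result.set(result);
                p.latch.countDown();
            }
        }

        // Caller no longer cares (e.g. it timed out): remove AND count down,
        // so a waiter blocked in await() below wakes up immediately.
        void ignore(String requestId) {
            PendingResult p = pending.remove(requestId);
            if (p != null) {
                p.latch.countDown();
            }
        }

        // Returns null when nothing arrived within the timeout.
        String await(PendingResult p, long millis) throws InterruptedException {
            return p.latch.await(millis, TimeUnit.MILLISECONDS) ? p.result.get() : null;
        }
    }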

+ 7 - 3
x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/rest/inference/RestInferTrainedModelDeploymentAction.java

@@ -45,15 +45,19 @@ public class RestInferTrainedModelDeploymentAction extends BaseRestHandler {
         if (restRequest.hasContent() == false) {
             throw ExceptionsHelper.badRequestException("requires body");
         }
-        InferTrainedModelDeploymentAction.Request request =
+        InferTrainedModelDeploymentAction.Request.Builder request =
             InferTrainedModelDeploymentAction.Request.parseRequest(deploymentId, restRequest.contentParser());
 
         if (restRequest.hasParam(InferTrainedModelDeploymentAction.Request.TIMEOUT.getPreferredName())) {
             TimeValue inferTimeout = restRequest.paramAsTime(InferTrainedModelDeploymentAction.Request.TIMEOUT.getPreferredName(),
                 InferTrainedModelDeploymentAction.Request.DEFAULT_TIMEOUT);
-            request.setTimeout(inferTimeout);
+            request.setInferenceTimeout(inferTimeout);
         }
 
-        return channel -> client.execute(InferTrainedModelDeploymentAction.INSTANCE, request, new RestToXContentListener<>(channel));
+        return channel -> client.execute(
+            InferTrainedModelDeploymentAction.INSTANCE,
+            request.build(),
+            new RestToXContentListener<>(channel)
+        );
     }
 }

+ 137 - 0
x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/deployment/DeploymentManagerTests.java

@@ -0,0 +1,137 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.inference.deployment;
+
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.test.ESTestCase;
+import org.elasticsearch.threadpool.ScalingExecutorBuilder;
+import org.elasticsearch.threadpool.TestThreadPool;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.xpack.core.ml.inference.results.InferenceResults;
+import org.elasticsearch.xpack.core.ml.inference.results.WarningInferenceResults;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.PassThroughConfig;
+import org.elasticsearch.xpack.ml.inference.pytorch.process.NativePyTorchProcess;
+import org.elasticsearch.xpack.ml.inference.pytorch.process.PyTorchResultProcessor;
+import org.junit.After;
+import org.junit.Before;
+
+import java.util.Map;
+
+import static org.elasticsearch.xpack.ml.MachineLearning.JOB_COMMS_THREAD_POOL_NAME;
+import static org.elasticsearch.xpack.ml.MachineLearning.UTILITY_THREAD_POOL_NAME;
+import static org.hamcrest.Matchers.equalTo;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class DeploymentManagerTests extends ESTestCase {
+
+    private DeploymentManager deploymentManager;
+    private ThreadPool tp;
+
+    @Before
+    public void managerSetup() {
+        tp = new TestThreadPool(
+            "DeploymentManagerTests",
+            new ScalingExecutorBuilder(UTILITY_THREAD_POOL_NAME,1, 4, TimeValue.timeValueMinutes(10), "xpack.ml.utility_thread_pool"),
+            new ScalingExecutorBuilder(JOB_COMMS_THREAD_POOL_NAME,1, 4, TimeValue.timeValueMinutes(10), "xpack.ml.job_comms_thread_pool")
+        );
+        deploymentManager = new DeploymentManager(
+            mock(Client.class),
+            xContentRegistry(),
+            tp,
+            (task, executorService, onProcessCrash) -> mock(NativePyTorchProcess.class)
+        );
+    }
+
+    @After
+    public void shutdownThreadpool() {
+        tp.shutdown();
+    }
+
+    public void testInferListenerOnlyCalledOnce() {
+        PyTorchResultProcessor resultProcessor = new PyTorchResultProcessor("1");
+        DeploymentManager.ProcessContext processContext = mock(DeploymentManager.ProcessContext.class);
+        when(processContext.getResultProcessor()).thenReturn(resultProcessor);
+
+        ListenerCounter listener = new ListenerCounter();
+        DeploymentManager.InferenceAction action = new DeploymentManager.InferenceAction(
+            1,
+            TimeValue.MAX_VALUE,
+            processContext,
+            new PassThroughConfig(null, null, null),
+            Map.of(),
+            tp,
+            listener
+        );
+
+        action.onSuccess(new WarningInferenceResults("foo"));
+        for (int i = 0; i < 10; i++) {
+            action.onSuccess(new WarningInferenceResults("foo"));
+            action.onFailure(new Exception("foo"));
+            action.onTimeout();
+        }
+        assertThat(listener.failureCounts, equalTo(0));
+        assertThat(listener.responseCounts, equalTo(1));
+
+        action = new DeploymentManager.InferenceAction(
+            1,
+            TimeValue.MAX_VALUE,
+            processContext,
+            new PassThroughConfig(null, null, null),
+            Map.of(),
+            tp,
+            listener
+        );
+
+        action.onTimeout();
+        for (int i = 0; i < 10; i++) {
+            action.onSuccess(new WarningInferenceResults("foo"));
+            action.onFailure(new Exception("foo"));
+            action.onTimeout();
+        }
+        assertThat(listener.failureCounts, equalTo(1));
+        assertThat(listener.responseCounts, equalTo(1));
+
+        action = new DeploymentManager.InferenceAction(
+            1,
+            TimeValue.MAX_VALUE,
+            processContext,
+            new PassThroughConfig(null, null, null),
+            Map.of(),
+            tp,
+            listener
+        );
+
+        action.onFailure(new Exception("bar"));
+        for (int i = 0; i < 10; i++) {
+            action.onSuccess(new WarningInferenceResults("foo"));
+            action.onFailure(new Exception("foo"));
+            action.onTimeout();
+        }
+        assertThat(listener.failureCounts, equalTo(2));
+        assertThat(listener.responseCounts, equalTo(1));
+    }
+
+    private static class ListenerCounter implements ActionListener<InferenceResults> {
+        private int responseCounts;
+        private int failureCounts;
+
+        @Override
+        public void onResponse(InferenceResults inferenceResults) {
+            responseCounts++;
+        }
+
+        @Override
+        public void onFailure(Exception e) {
+            failureCounts++;
+        }
+    }
+
+}