
[ML] Increase the default timeout for start trained model deployment (#92328)

A 30-second timeout is in line with the default value used in most ML APIs.
David Kyle, 2 years ago
parent commit fbb6abd2f4

+ 5 - 0
docs/changelog/92328.yaml

@@ -0,0 +1,5 @@
+pr: 92328
+summary: Increase the default timeout for the start trained model deployment API
+area: Machine Learning
+type: enhancement
+issues: []

+ 25 - 25
docs/reference/ml/trained-models/apis/start-trained-model-deployment.asciidoc

@@ -28,19 +28,19 @@ in an ingest pipeline or directly in the <<infer-trained-model>> API.
 Scaling inference performance can be achieved by setting the parameters
 `number_of_allocations` and `threads_per_allocation`.
 
 
-Increasing `threads_per_allocation` means more threads are used when an 
-inference request is processed on a node. This can improve inference speed for 
+Increasing `threads_per_allocation` means more threads are used when an
+inference request is processed on a node. This can improve inference speed for
 certain models. It may also result in improvement to throughput.
 
 
-Increasing `number_of_allocations` means more threads are used to process 
-multiple inference requests in parallel resulting in throughput improvement. 
-Each model allocation uses a number of threads defined by 
+Increasing `number_of_allocations` means more threads are used to process
+multiple inference requests in parallel resulting in throughput improvement.
+Each model allocation uses a number of threads defined by
 `threads_per_allocation`.
 
 
-Model allocations are distributed across {ml} nodes. All allocations assigned to 
-a node share the same copy of the model in memory. To avoid thread 
-oversubscription which is detrimental to performance, model allocations are 
-distributed in such a way that the total number of used threads does not surpass 
+Model allocations are distributed across {ml} nodes. All allocations assigned to
+a node share the same copy of the model in memory. To avoid thread
+oversubscription which is detrimental to performance, model allocations are
+distributed in such a way that the total number of used threads does not surpass
 the node's allocated processors.

 [[start-trained-model-deployment-path-params]]
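
The allocation distribution rule in the hunk above boils down to a per-node thread budget. A minimal sketch of that constraint, with all numbers being made-up example values rather than anything taken from the actual assignment planner:

```java
// Illustrative only: the threads used by the allocations placed on a node
// must not exceed that node's allocated processors.
public class AllocationBudgetSketch {
    public static void main(String[] args) {
        int allocatedProcessors = 16;   // example node size
        int numberOfAllocations = 4;    // allocations assigned to this node
        int threadsPerAllocation = 4;   // threads each allocation uses

        int threadsUsed = numberOfAllocations * threadsPerAllocation;
        if (threadsUsed > allocatedProcessors) {
            throw new IllegalStateException("allocations would oversubscribe the node's processors");
        }
        System.out.println("threads used: " + threadsUsed + " of " + allocatedProcessors);
    }
}
```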
@@ -55,9 +55,9 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=model-id]
 
 
 `cache_size`::
 (Optional, <<byte-units,byte value>>)
-The inference cache size (in memory outside the JVM heap) per node for the 
-model. The default value is the size of the model as reported by the 
-`model_size_bytes` field in the <<get-trained-models-stats>>. To disable the 
+The inference cache size (in memory outside the JVM heap) per node for the
+model. The default value is the size of the model as reported by the
+`model_size_bytes` field in the <<get-trained-models-stats>>. To disable the
 cache, `0b` can be provided.

 `number_of_allocations`::
@@ -71,17 +71,17 @@ The priority of the deployment. The default value is `normal`.
 There are two priority settings:
 +
 --
-* `normal`: Use this for deployments in production. The deployment allocations 
+* `normal`: Use this for deployments in production. The deployment allocations
 are distributed so that node processors are not oversubscribed.
-* `low`: Use this for testing model functionality. The intention is that these 
-deployments are not sent a high volume of input. The deployment is required to 
-have a single allocation with just one thread. Low priority deployments may be 
-assigned on nodes that already utilize all their processors but will be given a 
-lower CPU priority than normal deployments. Low priority deployments may be 
+* `low`: Use this for testing model functionality. The intention is that these
+deployments are not sent a high volume of input. The deployment is required to
+have a single allocation with just one thread. Low priority deployments may be
+assigned on nodes that already utilize all their processors but will be given a
+lower CPU priority than normal deployments. Low priority deployments may be
 unassigned in order to satisfy more allocations of normal priority deployments.
 --
 
 
-WARNING: Heavy usage of low priority deployments may impact performance of 
+WARNING: Heavy usage of low priority deployments may impact performance of
 normal priority deployments.

 `queue_capacity`::
@@ -89,20 +89,20 @@ normal priority deployments.
 Controls how many inference requests are allowed in the queue at a time.
 Every machine learning node in the cluster where the model can be allocated
 has a queue of this size; when the number of requests exceeds the total value,
-new requests are rejected with a 429 error. Defaults to 1024. Max allowed value 
+new requests are rejected with a 429 error. Defaults to 1024. Max allowed value
 is 1000000.

 `threads_per_allocation`::
 (Optional, integer)
-Sets the number of threads used by each model allocation during inference. This 
-generally increases the speed per inference request. The inference process is a 
-compute-bound process; `threads_per_allocations` must not exceed the number of 
-available allocated processors per node. Defaults to 1. Must be a power of 2. 
+Sets the number of threads used by each model allocation during inference. This
+generally increases the speed per inference request. The inference process is a
+compute-bound process; `threads_per_allocations` must not exceed the number of
+available allocated processors per node. Defaults to 1. Must be a power of 2.
 Max allowed value is 32.

 `timeout`::
 (Optional, time)
-Controls the amount of time to wait for the model to deploy. Defaults to 20 
+Controls the amount of time to wait for the model to deploy. Defaults to 30
 seconds.

 `wait_for`::
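
For reference, a minimal sketch of the parameter limits documented in this file: `threads_per_allocation` must be a power of 2 no greater than 32, and `queue_capacity` may not exceed 1000000. The class and method names below are invented for illustration; the real validation lives in `StartTrainedModelDeploymentAction.Request` and may differ in detail.

```java
// Illustrative check of the documented limits, not the production validation code.
public class DeploymentParamsSketch {
    static void validate(int threadsPerAllocation, int queueCapacity) {
        // threads_per_allocation: a power of 2, at most 32 (defaults to 1)
        boolean powerOfTwo = threadsPerAllocation > 0 && Integer.bitCount(threadsPerAllocation) == 1;
        if (!powerOfTwo || threadsPerAllocation > 32) {
            throw new IllegalArgumentException("threads_per_allocation must be a power of 2 no greater than 32");
        }
        // queue_capacity: at most 1,000,000 (defaults to 1024)
        if (queueCapacity < 1 || queueCapacity > 1_000_000) {
            throw new IllegalArgumentException("queue_capacity must be between 1 and 1000000");
        }
    }

    public static void main(String[] args) {
        validate(4, 1024);                     // ok: defaults-compatible values
        try {
            validate(3, 1024);                 // rejected: 3 is not a power of 2
        } catch (IllegalArgumentException e) {
            System.out.println("rejected: " + e.getMessage());
        }
    }
}
```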

+ 1 - 1
x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/action/StartTrainedModelDeploymentAction.java

@@ -47,7 +47,7 @@ public class StartTrainedModelDeploymentAction extends ActionType<CreateTrainedM
     public static final StartTrainedModelDeploymentAction INSTANCE = new StartTrainedModelDeploymentAction();
     public static final String NAME = "cluster:admin/xpack/ml/trained_models/deployment/start";
 
 
-    public static final TimeValue DEFAULT_TIMEOUT = new TimeValue(20, TimeUnit.SECONDS);
+    public static final TimeValue DEFAULT_TIMEOUT = new TimeValue(30, TimeUnit.SECONDS);
 
 
     /**
      * This has been found to be approximately 300MB on linux by manual testing.
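
A minimal client-side sketch of what the constant change means in practice. The single-argument `Request` constructor and `getTimeout()` are taken from the test below; the import paths, the `setTimeout()` call, and the model id are assumptions for illustration only.

```java
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.xpack.core.ml.action.StartTrainedModelDeploymentAction;

public class DefaultTimeoutSketch {
    public static void main(String[] args) {
        // A request that does not set an explicit timeout now falls back to the
        // new 30-second DEFAULT_TIMEOUT instead of 20 seconds.
        StartTrainedModelDeploymentAction.Request request =
            new StartTrainedModelDeploymentAction.Request("my-model");   // hypothetical model id
        System.out.println(request.getTimeout());                        // 30s with the new default

        // Callers that need longer, e.g. for very large models, can still override it.
        request.setTimeout(TimeValue.timeValueMinutes(2));               // setter name assumed
    }
}
```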

+ 1 - 1
x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/action/StartTrainedModelDeploymentRequestTests.java

@@ -221,7 +221,7 @@ public class StartTrainedModelDeploymentRequestTests extends AbstractXContentSer
 
 
     public void testDefaults() {
         Request request = new Request(randomAlphaOfLength(10));
-        assertThat(request.getTimeout(), equalTo(TimeValue.timeValueSeconds(20)));
+        assertThat(request.getTimeout(), equalTo(TimeValue.timeValueSeconds(30)));
         assertThat(request.getWaitForState(), equalTo(AllocationStatus.State.STARTED));
         assertThat(request.getNumberOfAllocations(), equalTo(1));
         assertThat(request.getThreadsPerAllocation(), equalTo(1));