@@ -28,19 +28,19 @@ in an ingest pipeline or directly in the <<infer-trained-model>> API.
Scaling inference performance can be achieved by setting the parameters
`number_of_allocations` and `threads_per_allocation`.

-Increasing `threads_per_allocation` means more threads are used when an
-inference request is processed on a node. This can improve inference speed for
+Increasing `threads_per_allocation` means more threads are used when an
+inference request is processed on a node. This can improve inference speed for
certain models. It may also improve throughput.

-Increasing `number_of_allocations` means more threads are used to process
-multiple inference requests in parallel resulting in throughput improvement.
-Each model allocation uses a number of threads defined by
+Increasing `number_of_allocations` means more threads are used to process
+multiple inference requests in parallel, resulting in improved throughput.
+Each model allocation uses a number of threads defined by
`threads_per_allocation`.

-Model allocations are distributed across {ml} nodes. All allocations assigned to
-a node share the same copy of the model in memory. To avoid thread
-oversubscription which is detrimental to performance, model allocations are
-distributed in such a way that the total number of used threads does not surpass
+Model allocations are distributed across {ml} nodes. All allocations assigned to
+a node share the same copy of the model in memory. To avoid thread
+oversubscription, which is detrimental to performance, model allocations are
+distributed in such a way that the total number of used threads does not surpass
the node's allocated processors.
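
For example, a request along the following lines starts a deployment with two
allocations of four threads each, so the deployment can process two inference
requests in parallel using 2 * 4 = 8 threads in total. This is a sketch; the
model ID `my_model` is a placeholder, and the parameters are passed as query
string parameters:

[source,console]
--------------------------------------------------
POST _ml/trained_models/my_model/deployment/_start?number_of_allocations=2&threads_per_allocation=4
--------------------------------------------------
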
[[start-trained-model-deployment-path-params]]
@@ -55,9 +55,9 @@ include::{es-repo-dir}/ml/ml-shared.asciidoc[tag=model-id]

`cache_size`::
(Optional, <<byte-units,byte value>>)
-The inference cache size (in memory outside the JVM heap) per node for the
-model. The default value is the size of the model as reported by the
-`model_size_bytes` field in the <<get-trained-models-stats>>. To disable the
+The inference cache size (in memory outside the JVM heap) per node for the
+model. The default value is the size of the model as reported by the
+`model_size_bytes` field in the <<get-trained-models-stats>> API. To disable the
cache, `0b` can be provided.
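
As a sketch (again with the placeholder model ID `my_model`), the inference
cache could be disabled at deployment time like this:

[source,console]
--------------------------------------------------
POST _ml/trained_models/my_model/deployment/_start?cache_size=0b
--------------------------------------------------
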
`number_of_allocations`::
@@ -71,17 +71,17 @@ The priority of the deployment. The default value is `normal`.
There are two priority settings:
+
--
-* `normal`: Use this for deployments in production. The deployment allocations
+* `normal`: Use this for deployments in production. The deployment allocations
are distributed so that node processors are not oversubscribed.
-* `low`: Use this for testing model functionality. The intention is that these
-deployments are not sent a high volume of input. The deployment is required to
-have a single allocation with just one thread. Low priority deployments may be
-assigned on nodes that already utilize all their processors but will be given a
-lower CPU priority than normal deployments. Low priority deployments may be
+* `low`: Use this for testing model functionality. The intention is that these
+deployments are not sent a high volume of input. The deployment is required to
+have a single allocation with just one thread. Low priority deployments may be
+assigned to nodes that already utilize all their processors, but will be given a
+lower CPU priority than normal deployments. Low priority deployments may be
unassigned in order to satisfy more allocations of normal priority deployments.
--

-WARNING: Heavy usage of low priority deployments may impact performance of
+WARNING: Heavy usage of low priority deployments may impact the performance of
normal priority deployments.
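
For instance, a deployment intended only for functional testing of a model
could be started with low priority (the model ID `my_model` is a placeholder):

[source,console]
--------------------------------------------------
POST _ml/trained_models/my_model/deployment/_start?priority=low
--------------------------------------------------
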
`queue_capacity`::
@@ -89,20 +89,20 @@ normal priority deployments.
Controls how many inference requests are allowed in the queue at a time.
Every machine learning node in the cluster where the model can be allocated
has a queue of this size; when the number of requests exceeds the total value,
-new requests are rejected with a 429 error. Defaults to 1024. Max allowed value
+new requests are rejected with a 429 (Too Many Requests) error. Defaults to 1024. The max allowed value
is 1000000.
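
As an illustrative sketch, a deployment that allows a deeper per-node request
queue could be started as follows (`my_model` is a placeholder model ID and
`2048` an arbitrary example value):

[source,console]
--------------------------------------------------
POST _ml/trained_models/my_model/deployment/_start?queue_capacity=2048
--------------------------------------------------
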
`threads_per_allocation`::
(Optional, integer)
-Sets the number of threads used by each model allocation during inference. This
-generally increases the speed per inference request. The inference process is a
-compute-bound process; `threads_per_allocations` must not exceed the number of
-available allocated processors per node. Defaults to 1. Must be a power of 2.
+Sets the number of threads used by each model allocation during inference. This
+generally increases the speed per inference request. The inference process is
+compute-bound; `threads_per_allocation` must not exceed the number of
+available allocated processors per node. Defaults to 1. Must be a power of 2.
Max allowed value is 32.

`timeout`::
(Optional, time)
-Controls the amount of time to wait for the model to deploy. Defaults to 20
+Controls the amount of time to wait for the model to deploy. Defaults to 30
seconds.
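
For example, to allow up to one minute for the deployment to start
(`my_model` is a placeholder model ID; `1m` is a standard time value):

[source,console]
--------------------------------------------------
POST _ml/trained_models/my_model/deployment/_start?timeout=1m
--------------------------------------------------
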
`wait_for`::