|
@@ -87,7 +87,10 @@ public class ElasticInferenceService extends SenderService {
|
|
|
public static final String NAME = "elastic";
|
|
|
public static final String ELASTIC_INFERENCE_SERVICE_IDENTIFIER = "Elastic Inference Service";
|
|
|
public static final Integer DENSE_TEXT_EMBEDDINGS_DIMENSIONS = 1024;
|
|
|
- public static final Integer SPARSE_TEXT_EMBEDDING_MAX_BATCH_SIZE = 512;
|
|
|
+ // The maximum batch size for sparse text embeddings.
|
|
|
+ // This value was reduced from 512 due to memory constraints; batch sizes above 32 can cause GPU out-of-memory errors.
|
|
|
+ // A batch size of 16 provides optimal throughput and stability, especially on lower-tier instance types.
|
|
|
+ public static final Integer SPARSE_TEXT_EMBEDDING_MAX_BATCH_SIZE = 16;
|
|
|
|
|
|
private static final EnumSet<TaskType> IMPLEMENTED_TASK_TYPES = EnumSet.of(
|
|
|
TaskType.SPARSE_EMBEDDING,
|
|
@@ -97,7 +100,10 @@ public class ElasticInferenceService extends SenderService {
|
|
|
);
|
|
|
private static final String SERVICE_NAME = "Elastic";
|
|
|
|
|
|
- private static final Integer DENSE_TEXT_EMBEDDINGS_MAX_BATCH_SIZE = 32;
|
|
|
+ // TODO: revisit this value once EIS supports dense models
|
|
|
+ // The maximum batch size for dense text embeddings is proactively set to 16.
|
|
|
+ // This mirrors the memory constraints observed with sparse embeddings.
|
|
|
+ private static final Integer DENSE_TEXT_EMBEDDINGS_MAX_BATCH_SIZE = 16;
|
|
|
|
|
|
// rainbow-sprinkles
|
|
|
static final String DEFAULT_CHAT_COMPLETION_MODEL_ID_V1 = "rainbow-sprinkles";
|