|
@@ -11,8 +11,8 @@ import numpy as np
|
|
|
# An inference engine should work the same for any number of Shards, as long as the Shards are contiguous.
|
|
|
async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str, n_layers: int):
|
|
|
prompt = "In a single word only, what is the last name of the current president of the USA?"
|
|
|
- resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=n_layers - 1, n_layers=n_layers), prompt=prompt)
|
|
|
- next_resp_full, _next_inference_state_full, _ = await inference_engine_1.infer_tensor(
|
|
|
+ resp_full = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=n_layers - 1, n_layers=n_layers), prompt=prompt)
|
|
|
+ next_resp_full = await inference_engine_1.infer_tensor(
|
|
|
"A",
|
|
|
shard=Shard(model_id=model_id, start_layer=0, end_layer=n_layers - 1, n_layers=n_layers),
|
|
|
input_data=resp_full,
|
|
@@ -20,20 +20,20 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
|
|
|
)
|
|
|
|
|
|
pp = n_layers // 2
|
|
|
- resp1, inference_state_1, _ = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=n_layers), prompt=prompt)
|
|
|
- resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(
|
|
|
+ resp1 = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=n_layers), prompt=prompt)
|
|
|
+ resp2 = await inference_engine_2.infer_tensor(
|
|
|
"B",
|
|
|
shard=Shard(model_id=model_id, start_layer=pp + 1, end_layer=n_layers - 1, n_layers=n_layers),
|
|
|
input_data=resp1,
|
|
|
inference_state=inference_state_1,
|
|
|
)
|
|
|
- resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(
|
|
|
+ resp3 = await inference_engine_1.infer_tensor(
|
|
|
"B",
|
|
|
shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=n_layers),
|
|
|
input_data=resp2,
|
|
|
inference_state=inference_state_2,
|
|
|
)
|
|
|
- resp4, _inference_state_4, _ = await inference_engine_2.infer_tensor(
|
|
|
+ resp4 = await inference_engine_2.infer_tensor(
|
|
|
"B",
|
|
|
shard=Shard(model_id=model_id, start_layer=pp + 1, end_layer=n_layers - 1, n_layers=n_layers),
|
|
|
input_data=resp3,
|