@@ -16,25 +16,25 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
- resp_full = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+ resp_full, _ = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
token_full = await inference_engine_1.sample(resp_full)
 
- next_resp_full = await inference_engine_1.infer_tensor(
+ next_resp_full, _ = await inference_engine_1.infer_tensor(
"A",
shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32),
input_data=token_full,
)
 
- resp1 = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
- resp2 = await inference_engine_2.infer_tensor(
+ resp1, _ = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
+ resp2, _ = await inference_engine_2.infer_tensor(
"B",
shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
input_data=resp1,
)
token2 = await inference_engine_2.sample(resp2)
- resp3 = await inference_engine_1.infer_tensor(
+ resp3, _ = await inference_engine_1.infer_tensor(
"B",
shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32),
input_data=token2,
)
- resp4 = await inference_engine_2.infer_tensor(
+ resp4, _ = await inference_engine_2.infer_tensor(
"B",
shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
input_data=resp3,