@@ -1,17 +1,16 @@
-from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
 from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
 from exo.download.hf.hf_shard_download import HFShardDownloader
 from exo.inference.inference_engine import InferenceEngine
 from exo.inference.shard import Shard
+from exo.helpers import DEBUG
 import asyncio
 import numpy as np
-
+from transformers import AutoTokenizer
 
 # An inference engine should work the same for any number of Shards, as long as the Shards are continuous.
-async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str):
+async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str, tokenizer: AutoTokenizer):
   prompt = "In a single word only, what is the last name of the current president of the USA?"
   resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
-
   next_resp_full, _next_inference_state_full, _ = await inference_engine_1.infer_tensor(
     "A",
     shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32),
@@ -19,22 +18,23 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
     inference_state=inference_state_full,
   )
 
-  resp1, inference_state_1, _ = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
+  pp = 15
+  resp1, inference_state_1, _ = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=32), prompt=prompt)
   resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(
     "B",
-    shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
+    shard=Shard(model_id=model_id, start_layer=pp+1, end_layer=31, n_layers=32),
     input_data=resp1,
     inference_state=inference_state_1,
   )
   resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(
     "B",
-    shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32),
+    shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=32),
     input_data=resp2,
     inference_state=inference_state_2,
   )
   resp4, _inference_state_4, _ = await inference_engine_2.infer_tensor(
     "B",
-    shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
+    shard=Shard(model_id=model_id, start_layer=pp+1, end_layer=31, n_layers=32),
     input_data=resp3,
     inference_state=inference_state_3,
   )
@@ -42,7 +42,6 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
   assert np.array_equal(resp_full, resp2)
   assert np.array_equal(next_resp_full, resp4)
 
-
 asyncio.run(
   test_inference_engine(
     MLXDynamicShardInferenceEngine(HFShardDownloader()),
@@ -51,9 +50,14 @@ asyncio.run(
   )
 )
 
-# TODO: Need more memory or a smaller model
-# asyncio.run(test_inference_engine(
-#   TinygradDynamicShardInferenceEngine(HFShardDownloader()),
-#   TinygradDynamicShardInferenceEngine(HFShardDownloader()),
-#   "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R",
-# ))
+import os
+if os.getenv("RUN_TINYGRAD", default="0") == "1":
+  import tinygrad
+  from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
+  tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))
+  asyncio.run(test_inference_engine(
+    TinygradDynamicShardInferenceEngine(HFShardDownloader()),
+    TinygradDynamicShardInferenceEngine(HFShardDownloader()),
+    "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R",
+    AutoTokenizer.from_pretrained("TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R")
+  ))
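
The comment in the test states the property being exercised: an inference engine should produce the same output for any continuous sharding of the model. A minimal sketch of that continuity invariant for the split used above, assuming only that Shard exposes the fields it is constructed with (the "dummy-model" id is a placeholder, not a real model):

from exo.inference.shard import Shard

# Same split as the test: engine 1 runs layers 0..pp, engine 2 runs layers pp+1..31.
pp = 15
first = Shard(model_id="dummy-model", start_layer=0, end_layer=pp, n_layers=32)
second = Shard(model_id="dummy-model", start_layer=pp + 1, end_layer=31, n_layers=32)

# Continuity: the second shard starts exactly where the first ends,
# and together the two shards cover every layer from 0 to n_layers - 1.
assert second.start_layer == first.end_layer + 1
assert first.start_layer == 0 and second.end_layer == first.n_layers - 1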
|