
Merge branch 'main' into non_blocking

Alex Cheema, 8 months ago
parent commit de19f0ab42
2 changed files with 5 additions and 1 deletion
  1. exo/inference/tokenizers.py (+1, -1)
  2. exo/models.py (+4, -0)

exo/inference/tokenizers.py (+1, -1)

@@ -11,7 +11,7 @@ async def resolve_tokenizer(model_id: str):
   local_path = await get_local_snapshot_dir(model_id)
   if DEBUG >= 2: print(f"Checking if local path exists to load tokenizer from local {local_path=}")
   try:
-    if await aios.path.exists(local_path):
+    if local_path and await aios.path.exists(local_path):
       if DEBUG >= 2: print(f"Resolving tokenizer for {model_id=} from {local_path=}")
       return await _resolve_tokenizer(local_path)
   except:
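
For context, a minimal sketch of the failure this guard avoids, assuming get_local_snapshot_dir returns None when no snapshot has been downloaded (the main() wrapper and prints here are illustrative):

import asyncio
from aiofiles import os as aios

async def main():
    local_path = None  # assumption: get_local_snapshot_dir returns None when nothing is cached
    # Before the fix, aios.path.exists(None) raised a TypeError (os.stat
    # rejects None), which the bare `except` in resolve_tokenizer silently
    # swallowed. The added `local_path and` short-circuits that call instead:
    if local_path and await aios.path.exists(local_path):
        print(f"loading tokenizer from local snapshot {local_path}")
    else:
        print("no local snapshot; resolving tokenizer remotely")

asyncio.run(main())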

exo/models.py (+4, -0)

@@ -10,6 +10,10 @@ model_base_shards = {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
     "TinygradDynamicShardInferenceEngine": Shard(model_id="NousResearch/Meta-Llama-3.1-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
   },
+  "llama-3.1-70b-bf16": {
+    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-70B-Instruct-bf16", start_layer=0, end_layer=0, n_layers=80),
+    "TinygradDynamicShardInferenceEngine": Shard(model_id="NousResearch/Meta-Llama-3.1-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
+  },
   "llama-3.1-405b": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-405B-4bit", start_layer=0, end_layer=0, n_layers=126),},
   "llama-3-8b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),