separate hf_helpers into its own module, add extra/ dir with a download_hf script, and unify downloading so tinygrad uses the same download method and interoperable model formats as mlx

Alex Cheema · 9 months ago
commit 545a486ed3
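
For orientation, the unified flow after this commit: both inference engines resolve a model by calling download_all_files from exo/inference/hf_helpers.py, which snapshots the repo into a local cache directory and reports progress through async callbacks. A minimal usage sketch, assuming the post-commit API (the repo id and callback body are illustrative):

import asyncio
from exo.inference.hf_helpers import download_all_files, HFRepoProgressEvent

async def demo():
    # Async progress callback; fields match the HFRepoProgressEvent dataclass introduced below
    async def on_progress(event: HFRepoProgressEvent):
        print(f"{event.completed_files}/{event.total_files} files, "
              f"{event.downloaded_bytes}/{event.total_bytes} bytes, ETA {event.overall_eta}")

    # download_all_files returns the local snapshot directory containing the model files
    snapshot_dir = await download_all_files("mlx-community/Meta-Llama-3-8B-Instruct", progress_callback=on_progress)
    print(snapshot_dir)

asyncio.run(demo())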

+ 2 - 19
exo/api/chatgpt_api.py

@@ -25,11 +25,11 @@ shard_mappings = {
   },
   "llama-3-8b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="llama3-8b-sfr", start_layer=0, end_layer=0, n_layers=32),
+    "TinygradDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct", start_layer=0, end_layer=0, n_layers=32),
   },
   "llama-3-70b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="llama3-70b-sfr", start_layer=0, end_layer=0, n_layers=80),
+    "TinygradDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
   },
   ### mistral
   "mistral-nemo": {
@@ -76,14 +76,6 @@ class ChatCompletionRequest:
         }
 
 
-def resolve_tinygrad_tokenizer(model_id: str):
-  if model_id == "llama3-8b-sfr":
-    return AutoTokenizer.from_pretrained("TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R")
-  elif model_id == "llama3-70b-sfr":
-    return AutoTokenizer.from_pretrained("TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R")
-  else:
-    raise ValueError(f"tinygrad doesnt currently support arbitrary model downloading. unsupported model: {model_id}")
-
 
 async def resolve_tokenizer(model_id: str):
   try:
@@ -111,15 +103,6 @@ async def resolve_tokenizer(model_id: str):
 
     if DEBUG >= 2: print(traceback.format_exc())
 
-  try:
-    if DEBUG >= 2: print(f"Trying tinygrad tokenizer for {model_id}")
-    return resolve_tinygrad_tokenizer(model_id)
-  except Exception as e:
-    if DEBUG >= 2: print(f"Failed again to load tokenizer for {model_id}. Falling back to mlx tokenizer. Error: {e}")
-    import traceback
-
-    if DEBUG >= 2: print(traceback.format_exc())
-
   if DEBUG >= 2: print(f"Trying mlx tokenizer for {model_id}")
   from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
 

+ 95 - 114
hf_async.py → exo/inference/hf_helpers.py

@@ -1,36 +1,17 @@
 import asyncio
 import aiohttp
 import os
-import argparse
 from urllib.parse import urljoin
-from typing import Callable, Optional, Coroutine, Any
+from typing import Callable, Optional, Coroutine, Any, Dict, List, Union, Literal
 from datetime import datetime, timedelta
 from fnmatch import fnmatch
 from pathlib import Path
-from typing import Generator, Iterable, List, TypeVar, Union
+from typing import Generator, Iterable, TypeVar, TypedDict
+from dataclasses import dataclass
+from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+from exo.helpers import DEBUG
 
 T = TypeVar("T")
-
-DEFAULT_ALLOW_PATTERNS = [
-    "*.json",
-    "*.py",
-    "tokenizer.model",
-    "*.tiktoken",
-    "*.txt",
-    "*.safetensors",
-]
-# Always ignore `.git` and `.cache/huggingface` folders in commits
-DEFAULT_IGNORE_PATTERNS = [
-    ".git",
-    ".git/*",
-    "*/.git",
-    "**/.git/**",
-    ".cache/huggingface",
-    ".cache/huggingface/*",
-    "*/.cache/huggingface",
-    "**/.cache/huggingface/**",
-]
-
 def filter_repo_objects(
     items: Iterable[T],
     *,
@@ -117,7 +98,35 @@ async def fetch_file_list(session, repo_id, revision, path=""):
         else:
             raise Exception(f"Failed to fetch file list: {response.status}")
 
-async def download_file(session, repo_id, revision, file_path, save_directory, progress_callback: Optional[Callable[[str, int, int, float, timedelta], Coroutine[Any, Any, None]]] = None):
+
+@dataclass
+class HFRepoFileProgressEvent:
+    file_path: str
+    downloaded: int
+    total: int
+    speed: float
+    eta: timedelta
+    status: Literal["not_started", "in_progress", "complete"]
+
+@dataclass
+class HFRepoProgressEvent:
+    completed_files: int
+    total_files: int
+    downloaded_bytes: int
+    total_bytes: int
+    overall_eta: timedelta
+    file_progress: Dict[str, HFRepoFileProgressEvent]
+
+HFRepoFileProgressCallback = Callable[[HFRepoFileProgressEvent], Coroutine[Any, Any, None]]
+HFRepoProgressCallback = Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]
+
+@retry(
+    stop=stop_after_attempt(5),
+    wait=wait_exponential(multiplier=1, min=4, max=60),
+    retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError, aiohttp.ClientResponseError)),
+    reraise=True
+)
+async def download_file(session: aiohttp.ClientSession, repo_id: str, revision: str, file_path: str, save_directory: str, progress_callback: Optional[HFRepoFileProgressCallback] = None, use_range_request: bool = True):
     base_url = f"https://huggingface.co/{repo_id}/resolve/{revision}/"
     url = urljoin(base_url, file_path)
     local_path = os.path.join(save_directory, file_path)
@@ -125,64 +134,72 @@ async def download_file(session, repo_id, revision, file_path, save_directory, p
     os.makedirs(os.path.dirname(local_path), exist_ok=True)
 
     # Check if file already exists and get its size
-    if os.path.exists(local_path):
-        local_file_size = os.path.getsize(local_path)
-    else:
-        local_file_size = 0
+    local_file_size = os.path.getsize(local_path) if os.path.exists(local_path) else 0
+
+    headers = get_auth_headers()
+    if use_range_request:
+        headers["Range"] = f"bytes={local_file_size}-"
 
-    headers = {"Range": f"bytes={local_file_size}-"}
-    headers.update(get_auth_headers())
     async with session.get(url, headers=headers) as response:
+        total_size = int(response.headers.get('Content-Length', 0))
+        downloaded_size = local_file_size
+        mode = 'ab' if use_range_request else 'wb'
+        # Content-Length is only the full size on a 200; range responses report the remaining bytes
+        if response.status == 200 and downloaded_size == total_size:
+            if DEBUG >= 2: print(f"File already downloaded: {file_path}")
+            if progress_callback:
+                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, 0, timedelta(0), "complete"))
+            return
+
         if response.status == 200:
-            # File doesn't support range requests, start from beginning
+            # File doesn't support range requests or we're not using them, start from beginning
             mode = 'wb'
-            total_size = int(response.headers.get('Content-Length', 0))
             downloaded_size = 0
         elif response.status == 206:
             # Partial content, resume download
-            mode = 'ab'
-            content_range = response.headers.get('Content-Range')
-            total_size = int(content_range.split('/')[-1])
-            downloaded_size = local_file_size
+            content_range = response.headers.get('Content-Range', '')
+            try:
+                total_size = int(content_range.split('/')[-1])
+            except ValueError:
+                if DEBUG >= 1: print(f"Failed to parse Content-Range header: {content_range}. Starting download from scratch...")
+                return await download_file(session, repo_id, revision, file_path, save_directory, progress_callback, use_range_request=False)
         elif response.status == 416:
             # Range not satisfiable, get the actual file size
-            if response.headers.get('Content-Type', '').startswith('text/html'):
-                content = await response.text()
-                print(f"Response content (HTML):\n{content}")
-            else:
-                print(response)
-            print("Return header: ", response.headers)
-            print("Return header: ", response.headers.get('Content-Range').split('/')[-1])
-            total_size = int(response.headers.get('Content-Range', '').split('/')[-1])
-            if local_file_size == total_size:
-                print(f"File already fully downloaded: {file_path}")
-                return
-            else:
-                # Start the download from the beginning
-                mode = 'wb'
-                downloaded_size = 0
+            content_range = response.headers.get('Content-Range', '')
+            try:
+                total_size = int(content_range.split('/')[-1])
+                if downloaded_size == total_size:
+                    if DEBUG >= 2: print(f"File fully downloaded on first pass: {file_path}")
+                    if progress_callback:
+                        await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, 0, timedelta(0), "complete"))
+                    return
+                # Size mismatch on a 416 (e.g. corrupt partial file): restart from scratch without a range request
+                if DEBUG >= 1: print(f"Range not satisfiable for {file_path} ({downloaded_size}/{total_size} bytes). Starting download from scratch...")
+                return await download_file(session, repo_id, revision, file_path, save_directory, progress_callback, use_range_request=False)
+            except ValueError:
+                if DEBUG >= 1: print(f"Failed to parse Content-Range header: {content_range}. Starting download from scratch...")
+                return await download_file(session, repo_id, revision, file_path, save_directory, progress_callback, use_range_request=False)
         else:
-            print(f"Failed to download {file_path}: {response.status}")
-            return
+            raise aiohttp.ClientResponseError(response.request_info, response.history, status=response.status, message=f"Failed to download {file_path}: {response.status}")
 
         if downloaded_size == total_size:
             print(f"File already downloaded: {file_path}")
+            if progress_callback:
+                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, 0, timedelta(0), "complete"))
             return
 
+        DOWNLOAD_CHUNK_SIZE = 32768
         start_time = datetime.now()
-        new_downloaded_size = 0
         with open(local_path, mode) as f:
-            async for chunk in response.content.iter_chunked(8192):
+            async for chunk in response.content.iter_chunked(DOWNLOAD_CHUNK_SIZE):
                 f.write(chunk)
-                new_downloaded_size += len(chunk)
-                if progress_callback:
+                downloaded_size += len(chunk)
+                if progress_callback and total_size:
                     elapsed_time = (datetime.now() - start_time).total_seconds()
-                    speed = new_downloaded_size / elapsed_time if elapsed_time > 0 else 0
-                    eta = timedelta(seconds=(total_size - downloaded_size - new_downloaded_size) / speed) if speed > 0 else timedelta(0)
-                    await progress_callback(file_path, new_downloaded_size, total_size - downloaded_size, speed, eta)
-        print(f"Downloaded: {file_path}")
-
-async def download_all_files(repo_id, revision="main", progress_callback: Optional[Callable[[int, int, int, int, timedelta, dict], Coroutine[Any, Any, None]]] = None, allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None):
+                    speed = downloaded_size / elapsed_time if elapsed_time > 0 else 0
+                    remaining_size = total_size - downloaded_size
+                    eta = timedelta(seconds=remaining_size / speed) if speed > 0 else timedelta(0)
+                    status = "in_progress" if downloaded_size < total_size else "complete"
+                    await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, speed, eta, status))
+        if DEBUG >= 2: print(f"Downloaded: {file_path}")
+
+async def download_all_files(repo_id: str, revision: str = "main", progress_callback: Optional[HFRepoProgressCallback] = None, allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None):
     repo_root = get_repo_root(repo_id)
     refs_dir = repo_root / "refs"
     snapshots_dir = repo_root / "snapshots"
@@ -197,7 +214,7 @@ async def download_all_files(repo_id, revision="main", progress_callback: Option
         headers = get_auth_headers()
         async with session.get(api_url, headers=headers) as response:
             if response.status != 200:
-                raise Exception(f"Failed to fetch revision info: {response.status}")
+                raise Exception(f"Failed to fetch revision info from {api_url}: {response.status}")
             revision_info = await response.json()
             commit_hash = revision_info['sha']
 
@@ -215,68 +232,32 @@ async def download_all_files(repo_id, revision="main", progress_callback: Option
         completed_files = 0
         total_bytes = sum(file["size"] for file in filtered_file_list)
         downloaded_bytes = 0
-        new_downloaded_bytes = 0
-        file_progress = {file["path"]: {"status": "not_started", "downloaded": 0, "total": file["size"]} for file in filtered_file_list}
+        file_progress: Dict[str, HFRepoFileProgressEvent] = {file["path"]: HFRepoFileProgressEvent(file["path"], 0, file["size"], 0, timedelta(0), "not_started") for file in filtered_file_list}
         start_time = datetime.now()
 
         async def download_with_progress(file_info):
-            nonlocal completed_files, downloaded_bytes, new_downloaded_bytes, file_progress
-
-            async def file_progress_callback(path, file_downloaded, file_total, speed, file_eta):
-                nonlocal downloaded_bytes, new_downloaded_bytes, file_progress
-                new_downloaded_bytes += file_downloaded - file_progress[path]['downloaded']
-                downloaded_bytes += file_downloaded - file_progress[path]['downloaded']
-                file_progress[path].update({
-                    'status': 'in_progress',
-                    'downloaded': file_downloaded,
-                    'total': file_total,
-                    'speed': speed,
-                    'eta': file_eta
-                })
+            nonlocal completed_files, downloaded_bytes, file_progress
+
+            async def file_progress_callback(event: HFRepoFileProgressEvent):
+                nonlocal downloaded_bytes, file_progress
+                downloaded_bytes += event.downloaded - file_progress[event.file_path].downloaded
+                file_progress[event.file_path] = event
                 if progress_callback:
                     elapsed_time = (datetime.now() - start_time).total_seconds()
-                    overall_speed = new_downloaded_bytes / elapsed_time if elapsed_time > 0 else 0
+                    overall_speed = downloaded_bytes / elapsed_time if elapsed_time > 0 else 0
                     overall_eta = timedelta(seconds=(total_bytes - downloaded_bytes) / overall_speed) if overall_speed > 0 else timedelta(0)
-                    await progress_callback(completed_files, total_files, new_downloaded_bytes, total_bytes, overall_eta, file_progress)
+                    await progress_callback(HFRepoProgressEvent(completed_files, total_files, downloaded_bytes, total_bytes, overall_eta, file_progress))
 
             await download_file(session, repo_id, revision, file_info["path"], snapshot_dir, file_progress_callback)
             completed_files += 1
-            file_progress[file_info["path"]]['status'] = 'complete'
+            file_progress[file_info["path"]] = HFRepoFileProgressEvent(file_info["path"], file_info["size"], file_info["size"], 0, timedelta(0), "complete")
             if progress_callback:
                 elapsed_time = (datetime.now() - start_time).total_seconds()
-                overall_speed = new_downloaded_bytes / elapsed_time if elapsed_time > 0 else 0
+                overall_speed = downloaded_bytes / elapsed_time if elapsed_time > 0 else 0
                 overall_eta = timedelta(seconds=(total_bytes - downloaded_bytes) / overall_speed) if overall_speed > 0 else timedelta(0)
-                await progress_callback(completed_files, total_files, new_downloaded_bytes, total_bytes, overall_eta, file_progress)
+                await progress_callback(HFRepoProgressEvent(completed_files, total_files, downloaded_bytes, total_bytes, overall_eta, file_progress))
 
         tasks = [download_with_progress(file_info) for file_info in filtered_file_list]
         await asyncio.gather(*tasks)
 
-async def main(repo_id, revision="main", allow_patterns=None, ignore_patterns=None):
-    async def progress_callback(completed_files, total_files, downloaded_bytes, total_bytes, overall_eta, file_progress):
-        print(f"Overall Progress: {completed_files}/{total_files} files, {downloaded_bytes}/{total_bytes} bytes")
-        print(f"Estimated time remaining: {overall_eta}")
-        print("File Progress:")
-        for file_path, progress in file_progress.items():
-            status_icon = {
-                'not_started': '⚪',
-                'in_progress': '🔵',
-                'complete': '✅'
-            }[progress['status']]
-            eta_str = str(progress.get('eta', 'N/A'))
-            print(f"{status_icon} {file_path}: {progress.get('downloaded', 0)}/{progress['total']} bytes, "
-                  f"Speed: {progress.get('speed', 0):.2f} B/s, ETA: {eta_str}")
-        print("\n")
-
-    await download_all_files(repo_id, revision, progress_callback, allow_patterns, ignore_patterns)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Download files from a Hugging Face model repository.")
-    parser.add_argument("--repo-id", help="The repository ID (e.g., 'meta-llama/Meta-Llama-3.1-8B-Instruct')")
-    parser.add_argument("--revision", default="main", help="The revision to download (branch, tag, or commit hash)")
-    parser.add_argument("--allow-patterns", nargs="*", default=DEFAULT_ALLOW_PATTERNS, help="Patterns of files to allow (e.g., '*.json' '*.safetensors')")
-    parser.add_argument("--ignore-patterns", nargs="*", default=DEFAULT_IGNORE_PATTERNS, help="Patterns of files to ignore (e.g., '.*')")
-
-    args = parser.parse_args()
-
-    asyncio.run(main(args.repo_id, args.revision, args.allow_patterns, args.ignore_patterns))
+    return snapshot_dir
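
The @retry decorator above uses tenacity (pinned in setup.py below) to retry transient network failures with exponential backoff. A self-contained sketch of the same pattern, using a hypothetical fetch_bytes helper rather than this module's download_file:

import asyncio
import aiohttp
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

@retry(
    stop=stop_after_attempt(5),                          # give up after five attempts
    wait=wait_exponential(multiplier=1, min=4, max=60),  # back off 4s, 8s, ... capped at 60s
    retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)),
    reraise=True,                                        # surface the last exception instead of a RetryError
)
async def fetch_bytes(url: str) -> bytes:
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response.raise_for_status()  # ClientResponseError subclasses ClientError, so HTTP errors are retried too
            return await response.read()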

+ 3 - 3
exo/inference/inference_engine.py

@@ -1,9 +1,9 @@
 import numpy as np
 
-from typing import Tuple, Optional, Callable
+from typing import Tuple, Optional, Callable, Coroutine, Any
 from abc import ABC, abstractmethod
 from .shard import Shard
-
+from exo.inference.hf_helpers import HFRepoProgressEvent
 
 class InferenceEngine(ABC):
   @abstractmethod
@@ -15,5 +15,5 @@ class InferenceEngine(ABC):
     pass
 
   @abstractmethod
-  def set_on_download_progress(self, on_download_progress: Callable[[int, int], None]):
+  def set_progress_callback(self, progress_callback: Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]):
     pass
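
Under the new contract, a progress callback is a single async function taking an HFRepoProgressEvent. A minimal sketch of what a conforming engine stores (DummyEngine is hypothetical and elides the abstract infer methods):

from typing import Any, Callable, Coroutine, Optional
from exo.inference.hf_helpers import HFRepoProgressEvent

ProgressCallback = Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]

class DummyEngine:
    def __init__(self, progress_callback: Optional[ProgressCallback] = None):
        self.progress_callback = progress_callback

    def set_progress_callback(self, progress_callback: ProgressCallback):
        # Stored here, then threaded through to download_all_files(..., progress_callback=...)
        self.progress_callback = progress_callback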

+ 6 - 5
exo/inference/mlx/sharded_inference_engine.py

@@ -5,12 +5,13 @@ from .sharded_model import StatefulShardedModel
 from .sharded_utils import load_shard, get_image_from_str
 from ..shard import Shard
 from typing import Optional, Callable
+from exo.inference.hf_helpers import HFRepoProgressCallback
 
 
 class MLXDynamicShardInferenceEngine(InferenceEngine):
-  def __init__(self, on_download_progress: Callable[[int, int], None] = None):
+  def __init__(self, progress_callback: Optional[HFRepoProgressCallback] = None):
     self.shard = None
-    self.on_download_progress = on_download_progress
+    self.progress_callback = progress_callback
 
   async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
     await self.ensure_shard(shard)
@@ -33,9 +34,9 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
     if self.shard == shard:
       return
 
-    model_shard, self.tokenizer = await load_shard(shard.model_id, shard, on_download_progress=self.on_download_progress)
+    model_shard, self.tokenizer = await load_shard(shard.model_id, shard, progress_callback=self.progress_callback)
     self.stateful_sharded_model = StatefulShardedModel(shard, model_shard)
     self.shard = shard
 
-  def set_on_download_progress(self, on_download_progress: Callable[[int, int], None]):
-    self.on_download_progress = on_download_progress
+  def set_progress_callback(self, progress_callback: HFRepoProgressCallback):
+    self.progress_callback = progress_callback

+ 7 - 54
exo/inference/mlx/sharded_utils.py

@@ -12,10 +12,7 @@ from typing import Optional, Tuple, Union, List, Callable
 from PIL import Image
 from io import BytesIO
 import base64
-import os
-import concurrent.futures
 
-from exo import DEBUG
 import mlx.core as mx
 import mlx.nn as nn
 from huggingface_hub import snapshot_download, list_repo_tree, get_paths_info
@@ -28,6 +25,8 @@ from transformers import AutoProcessor
 from mlx_lm.tokenizer_utils import load_tokenizer, TokenizerWrapper
 from mlx_lm.tuner.utils import apply_lora_layers
 
+from exo import DEBUG
+from exo.inference.hf_helpers import download_all_files, HFRepoProgressCallback
 from ..shard import Shard
 
 
@@ -164,52 +163,6 @@ def load_model_shard(
   return model
 
 
-async def get_repo_size(repo_id: str, revision: Optional[str] = None, allow_patterns: Optional[Union[List[str], str]] = None, repo_type: Optional[str] = None):
-  it = await asyncio.to_thread(list_repo_tree, repo_id, revision=revision, repo_type=repo_type)
-  files = list(filter_repo_objects(it, allow_patterns=allow_patterns, key=lambda f: f.path))
-  return sum(file.size for file in files if hasattr(file, "size") and file.size is not None)
-
-async def monitor_progress(dir, total_size, print_progress=False, on_progress: Callable[[int, int], None] = None):
-    while True:
-      try:
-        await asyncio.sleep(0.1)
-        current_size = sum(os.path.getsize(os.path.join(root, file))
-                            for root, _, files in os.walk(dir)
-                            for file in files)
-        progress = min(current_size / total_size * 100, 100)
-        if print_progress:
-          print(f"\rProgress: {progress:.2f}% ({current_size}/{total_size} bytes)", end="", flush=True)
-        if on_progress:
-          on_progress(current_size, total_size)
-        if progress >= 100:
-          if print_progress:
-            print("\nDownload complete!")
-          break
-      except Exception as e:
-        print(f"Error monitoring progress: {e}")
-
-async def download_repo(repo_id: str, revision: Optional[str] = None, allow_patterns: Optional[Union[List[str], str]] = None, repo_type: Optional[str] = None):
-    with concurrent.futures.ThreadPoolExecutor() as pool:
-        return await asyncio.get_event_loop().run_in_executor(
-            pool,
-            partial(snapshot_download, repo_id=repo_id, revision=revision, allow_patterns=allow_patterns, repo_type=repo_type)
-        )
-
-async def download_async_with_progress(repo_id: str, revision: Optional[str] = None, allow_patterns: Optional[Union[List[str], str]] = None, repo_type: Optional[str] = None, on_progress: Callable[[int, int], None] = None):
-  storage_folder = os.path.join(HF_HUB_CACHE, repo_folder_name(repo_id=repo_id, repo_type="model"))
-  # os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
-  # os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
-
-  total_size = await get_repo_size(repo_id)
-
-  # Create tasks for download and progress checking
-  download_task = asyncio.create_task(download_repo(repo_id, revision=revision, allow_patterns=allow_patterns, repo_type=repo_type))
-  progress_task = asyncio.create_task(monitor_progress(storage_folder, total_size, on_progress=on_progress))
-
-  # Wait for both tasks to complete
-  result = await asyncio.gather(download_task, progress_task, return_exceptions=True)
-  return result[0]  # Return the result from download_task
-
 repo_id_safetensors_layers = {
   "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit": {
     "model.safetensors": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
@@ -313,7 +266,7 @@ def get_safetensors_allow_patterns(repo_id: str, shard: Optional[Shard] = None):
 
     return allow_patterns if len(allow_patterns) > 0 else ["*.safetensors"]
 
-async def get_model_path(path_or_hf_repo: str, shard: Optional[Shard] = None, revision: Optional[str] = None, on_download_progress: Callable[[int, int], None] = None) -> Path:
+async def get_model_path(path_or_hf_repo: str, shard: Optional[Shard] = None, revision: str = "main", progress_callback: Optional[HFRepoProgressCallback] = None) -> Path:
   """
   Ensures the model is available locally. If the path does not exist locally,
   it is downloaded from the Hugging Face Hub.
@@ -329,7 +282,7 @@ async def get_model_path(path_or_hf_repo: str, shard: Optional[Shard] = None, re
   if not model_path.exists():
     try:
       model_path = Path(
-        await download_async_with_progress(
+        await download_all_files(
           repo_id=path_or_hf_repo,
           revision=revision,
           allow_patterns=[
@@ -339,7 +292,7 @@ async def get_model_path(path_or_hf_repo: str, shard: Optional[Shard] = None, re
             "*.tiktoken",
             "*.txt",
           ] + get_safetensors_allow_patterns(path_or_hf_repo, shard),
-          on_progress=on_download_progress,
+          progress_callback=progress_callback,
         )
       )
     except RepositoryNotFoundError:
@@ -360,7 +313,7 @@ async def load_shard(
   model_config={},
   adapter_path: Optional[str] = None,
   lazy: bool = False,
-  on_download_progress: Callable[[int, int], None] = None,
+  progress_callback: Optional[HFRepoProgressCallback] = None,
 ) -> Tuple[nn.Module, TokenizerWrapper]:
   """
   Load the model and tokenizer from a given path or a huggingface repository.
@@ -383,7 +336,7 @@ async def load_shard(
    FileNotFoundError: If config file or safetensors are not found.
    ValueError: If model class or args class are not found.
   """
-  model_path = await get_model_path(path_or_hf_repo, shard, on_download_progress=on_download_progress)
+  model_path = await get_model_path(path_or_hf_repo, shard, progress_callback=progress_callback)
 
   model = load_model_shard(model_path, shard, lazy, model_config)
   if adapter_path is not None:

+ 13 - 12
exo/inference/test_inference_engine.py

@@ -1,3 +1,4 @@
+from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
 from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
 from exo.inference.inference_engine import InferenceEngine
 from exo.inference.shard import Shard
@@ -40,17 +41,17 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
   assert np.array_equal(next_resp_full, resp4)
 
 
-asyncio.run(
-  test_inference_engine(
-    MLXDynamicShardInferenceEngine(),
-    MLXDynamicShardInferenceEngine(),
-    "mlx-community/Meta-Llama-3-8B-Instruct-4bit",
-  )
-)
+# asyncio.run(
+#   test_inference_engine(
+#     MLXDynamicShardInferenceEngine(),
+#     MLXDynamicShardInferenceEngine(),
+#     "mlx-community/Meta-Llama-3-8B-Instruct-4bit",
+#   )
+# )
 
 # TODO: Need more memory or a smaller model
-# asyncio.run(test_inference_engine(
-#     TinygradDynamicShardInferenceEngine(),
-#     TinygradDynamicShardInferenceEngine(),
-#     "llama3-8b-sfr",
-# ))
+asyncio.run(test_inference_engine(
+    TinygradDynamicShardInferenceEngine(),
+    TinygradDynamicShardInferenceEngine(),
+    "mlx-community/Meta-Llama-3-8B-Instruct",
+))

+ 9 - 72
exo/inference/tinygrad/inference.py

@@ -1,9 +1,7 @@
-import asyncio
 from functools import partial
 from pathlib import Path
-from typing import List, Optional, Union, Callable
+from typing import List, Optional, Union, Callable, Coroutine, Any
 import json
-import tiktoken
 from tiktoken.load import load_tiktoken_bpe
 from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16
 from tinygrad.nn.state import safe_load, torch_load, load_state_dict
@@ -12,7 +10,7 @@ from tinygrad.helpers import DEBUG, tqdm, _cache_dir, fetch
 from exo.inference.shard import Shard
 from exo.inference.inference_engine import InferenceEngine
 import numpy as np
-import os
+from exo.inference.hf_helpers import HFRepoProgressCallback, HFRepoProgressEvent, download_all_files, get_repo_root
 
 MODEL_PARAMS = {
   "8B": {
@@ -46,15 +44,6 @@ MODEL_PARAMS = {
 
 
 # **** helper functions ****
-async def fetch_async(
-  url: str,
-  name: Optional[Union[Path, str]] = None,
-  subdir: Optional[str] = None,
-  allow_caching=not os.getenv("DISABLE_HTTP_CACHE"),
-) -> Path:
-  func = partial(fetch, url, name, subdir, allow_caching)
-  return await asyncio.get_event_loop().run_in_executor(None, func)
-
 
 def concat_weights(models, device=None):
   def convert(name) -> Tensor:
@@ -159,8 +148,9 @@ def prefill(model, toks, start_pos=0):
 
 
 class TinygradDynamicShardInferenceEngine(InferenceEngine):
-  def __init__(self):
+  def __init__(self, progress_callback: Optional[HFRepoProgressCallback] = None):
     self.shard = None
+    self.progress_callback = progress_callback
 
   async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
     # TODO: we need to refactor models/llamaa to handle per-request-kv-cache. right now it's shared between requests.
@@ -199,62 +189,9 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
     if self.shard == shard:
       return
 
-    model_path = Path(shard.model_id)
-    models_dir = Path(_cache_dir) / "tinygrad" / "downloads"
-    model_path = models_dir / shard.model_id
-    size = "8B"
-    if Path(model_path / "tokenizer_config.json").exists():
-      model = model_path
-    else:
-
-      if DEBUG >= 2: print(f"Downloading tinygrad model {shard.model_id}...")
-      if shard.model_id.lower().find("llama3-8b-sfr") != -1:
-        num_files = 4
-        for i in range(num_files):
-          await fetch_async(
-            f"https://huggingface.co/mlx-community/Meta-Llama-3-8B-Instruct/resolve/main/model-{(i+1):05d}-of-{num_files:05d}.safetensors",
-            f"model-{(i+1):05d}-of-{num_files:05d}.safetensors",
-            subdir=shard.model_id,
-          )
-        await fetch_async(
-          "https://huggingface.co/mlx-community/Meta-Llama-3-8B-Instruct/resolve/main/config.json",
-          "config.json",
-          subdir=shard.model_id,
-        )
-        model = await fetch_async(
-          "https://huggingface.co/mlx-community/Meta-Llama-3-8B-Instruct/raw/main/model.safetensors.index.json",
-          "model.safetensors.index.json",
-          subdir=shard.model_id,
-        )
-        await fetch_async(
-          "https://huggingface.co/mlx-community/Meta-Llama-3-8B-Instruct/resolve/main/special_tokens_map.json",
-          "special_tokens_map.json",
-          subdir=shard.model_id,
-        )
-        await fetch_async(
-          "https://huggingface.co/mlx-community/Meta-Llama-3-8B-Instruct/resolve/main/tokenizer.json",
-          "tokenizer.json",
-          subdir=shard.model_id,
-        )
-        await fetch_async(
-          "https://huggingface.co/mlx-community/Meta-Llama-3-8B-Instruct/resolve/main/tokenizer_config.json",
-          "tokenizer_config.json",
-          subdir=shard.model_id,
-        )
-        size = "8B"
-      elif shard.model_id.lower().find("llama3-70b-sfr") != -1:
-        raise NotImplementedError("llama3-70b-sfr is not implemented for tinygrad")
-        # fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-70B/resolve/main/original/tokenizer.model", "tokenizer.model", subdir=shard.model_id)
-        # fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R/resolve/main/model-00001-of-00004.safetensors", "model-00001-of-00004.safetensors", subdir=shard.model_id)
-        # fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R/resolve/main/model-00002-of-00004.safetensors", "model-00002-of-00004.safetensors", subdir=shard.model_id)
-        # fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R/resolve/main/model-00003-of-00004.safetensors", "model-00003-of-00004.safetensors", subdir=shard.model_id)
-        # fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R/resolve/main/model-00004-of-00004.safetensors", "model-00004-of-00004.safetensors", subdir=shard.model_id)
-        # model = fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R/raw/main/model.safetensors.index.json", "model.safetensors.index.json", subdir=shard.model_id)
-        # size = "70B"
-      else:
-        raise ValueError(f"tinygrad doesnt currently support arbitrary model downloading. unsupported model: {shard.model_id}")
-
-    model = build_transformer(model_path, shard=shard, model_size=size)
+    model_path = await download_all_files(shard.model_id, progress_callback=self.progress_callback)
+    if DEBUG >= 2: print(f"{model_path=}")
+    model = build_transformer(model_path, shard=shard, model_size="70B" if "70b" in shard.model_id.lower() else "8B")
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(str((model_path if model_path.is_dir() else model_path.parent)))
 
@@ -262,5 +199,5 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
     self.model = model
     self.tokenizer = tokenizer
 
-  def set_on_download_progress(self, on_download_progress: Callable[[int, int], None]):
-    pass
+  def set_progress_callback(self, progress_callback: Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]):
+    self.progress_callback = progress_callback

+ 53 - 0
extra/download_hf.py

@@ -0,0 +1,53 @@
+import argparse
+import asyncio
+from exo.inference.hf_helpers import download_all_files, HFRepoProgressEvent, HFRepoFileProgressEvent
+
+DEFAULT_ALLOW_PATTERNS = [
+    "*.json",
+    "*.py",
+    "tokenizer.model",
+    "*.tiktoken",
+    "*.txt",
+    "*.safetensors",
+]
+# Always ignore `.git` and `.cache/huggingface` folders in commits
+DEFAULT_IGNORE_PATTERNS = [
+    ".git",
+    ".git/*",
+    "*/.git",
+    "**/.git/**",
+    ".cache/huggingface",
+    ".cache/huggingface/*",
+    "*/.cache/huggingface",
+    "**/.cache/huggingface/**",
+]
+
+async def main(repo_id, revision="main", allow_patterns=None, ignore_patterns=None):
+    async def progress_callback(event: HFRepoProgressEvent):
+        print(f"Overall Progress: {event.completed_files}/{event.total_files} files, {event.downloaded_bytes}/{event.total_bytes} bytes")
+        print(f"Estimated time remaining: {event.overall_eta}")
+        print("File Progress:")
+        for file_path, progress in event.file_progress.items():
+            status_icon = {
+                'not_started': '⚪',
+                'in_progress': '🔵',
+                'complete': '✅'
+            }[progress.status]
+            eta_str = str(progress.eta)
+            print(f"{status_icon} {file_path}: {progress.downloaded}/{progress.total} bytes, "
+                  f"Speed: {progress.speed:.2f} B/s, ETA: {eta_str}")
+        print("\n")
+
+    await download_all_files(repo_id, revision, progress_callback, allow_patterns, ignore_patterns)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download files from a Hugging Face model repository.")
+    parser.add_argument("--repo-id", required=True, help="The repository ID (e.g., 'meta-llama/Meta-Llama-3.1-8B-Instruct')")
+    parser.add_argument("--revision", default="main", help="The revision to download (branch, tag, or commit hash)")
+    parser.add_argument("--allow-patterns", nargs="*", default=DEFAULT_ALLOW_PATTERNS, help="Patterns of files to allow (e.g., '*.json' '*.safetensors')")
+    parser.add_argument("--ignore-patterns", nargs="*", default=DEFAULT_IGNORE_PATTERNS, help="Patterns of files to ignore (e.g., '.*')")
+
+    args = parser.parse_args()
+
+    asyncio.run(main(args.repo_id, args.revision, args.allow_patterns, args.ignore_patterns))
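
Typical invocations from the repo root (the repo id is an example; with the defaults above, JSON, tokenizer, and safetensors files are downloaded):

python extra/download_hf.py --repo-id mlx-community/Meta-Llama-3-8B-Instruct-4bit
python extra/download_hf.py --repo-id mlx-community/Meta-Llama-3-8B-Instruct --allow-patterns "*.json" "*.safetensors"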

+ 1 - 1
main.py

@@ -60,7 +60,7 @@ node.on_token.register("main_log").on_next(lambda _, tokens, __: print(inference
 if args.prometheus_client_port:
     from exo.stats.metrics import start_metrics_server
     start_metrics_server(node, args.prometheus_client_port)
-inference_engine.set_on_download_progress(lambda current, total: asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "current": current, "total": total}))))
+inference_engine.set_progress_callback(lambda event: asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "current": event.downloaded_bytes, "total": event.total_bytes}))))
 
 async def shutdown(signal, loop):
     """Gracefully shutdown the server and close the asyncio loop."""

+ 2 - 0
setup.py

@@ -6,6 +6,7 @@ from setuptools import find_packages, setup
 install_requires = [
     "aiohttp==3.9.5",
     "aiohttp_cors==0.7.0",
+    "aiofiles==24.1.0",
     "blobfile==2.1.1",
     "grpcio==1.64.1",
     "grpcio-tools==1.64.1",
@@ -21,6 +22,7 @@ install_requires = [
     "requests==2.32.3",
     "rich==13.7.1",
     "safetensors==0.4.3",
+    "tenacity==9.0.0",
     "tiktoken==0.7.0",
     "tokenizers==0.19.1",
     "tqdm==4.66.4",