
Make a separate ShardDownloader abstract class with an HFShardDownloader implementation. This opens up plugging in different methods of downloading model shards, e.g. #79 / #16

Alex Cheema, 8 months ago
commit 476a714bbb

+ 1 - 4
exo/api/chatgpt_api.py

@@ -103,10 +103,7 @@ async def resolve_tokenizer(model_id: str):
     if DEBUG >= 4: print(f"Failed to load tokenizer for {model_id}. Falling back to tinygrad tokenizer. Error: {e}")
     if DEBUG >= 4: print(traceback.format_exc())
 
-  if DEBUG >= 4: print(f"Trying mlx tokenizer for {model_id}")
-  from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
-
-  return load_tokenizer(await get_model_path(model_id))
+  raise ValueError(f"[TODO] Unsupported model: {model_id}")
 
 
 def generate_completion(

+ 74 - 0
exo/download/download_progress.py

@@ -0,0 +1,74 @@
+from typing import Dict, Callable, Coroutine, Any, Literal
+from dataclasses import dataclass
+from datetime import timedelta
+
+@dataclass
+class RepoFileProgressEvent:
+    file_path: str
+    downloaded: int
+    downloaded_this_session: int
+    total: int
+    speed: int
+    eta: timedelta
+    status: Literal["not_started", "in_progress", "complete"]
+
+    def to_dict(self):
+        return {
+            "file_path": self.file_path,
+            "downloaded": self.downloaded,
+            "downloaded_this_session": self.downloaded_this_session,
+            "total": self.total,
+            "speed": self.speed,
+            "eta": self.eta.total_seconds(),
+            "status": self.status
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        # Convert eta from seconds back to timedelta
+        if 'eta' in data:
+            data['eta'] = timedelta(seconds=data['eta'])
+        return cls(**data)
+
+@dataclass
+class RepoProgressEvent:
+    completed_files: int
+    total_files: int
+    downloaded_bytes: int
+    downloaded_bytes_this_session: int
+    total_bytes: int
+    overall_speed: int
+    overall_eta: timedelta
+    file_progress: Dict[str, RepoFileProgressEvent]
+    status: Literal["not_started", "in_progress", "complete"]
+
+    def to_dict(self):
+        return {
+            "completed_files": self.completed_files,
+            "total_files": self.total_files,
+            "downloaded_bytes": self.downloaded_bytes,
+            "downloaded_bytes_this_session": self.downloaded_bytes_this_session,
+            "total_bytes": self.total_bytes,
+            "overall_speed": self.overall_speed,
+            "overall_eta": self.overall_eta.total_seconds(),
+            "file_progress": {k: v.to_dict() for k, v in self.file_progress.items()},
+            "status": self.status
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        # Convert overall_eta from seconds back to timedelta
+        if 'overall_eta' in data:
+            data['overall_eta'] = timedelta(seconds=data['overall_eta'])
+
+        # Parse file_progress
+        if 'file_progress' in data:
+            data['file_progress'] = {
+                k: RepoFileProgressEvent.from_dict(v)
+                for k, v in data['file_progress'].items()
+            }
+
+        return cls(**data)
+
+RepoFileProgressCallback = Callable[[RepoFileProgressEvent], Coroutine[Any, Any, None]]
+RepoProgressCallback = Callable[[RepoProgressEvent], Coroutine[Any, Any, None]]
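These progress events cross process boundaries as JSON (see the download_progress broadcast in main.py below), so to_dict/from_dict must round-trip exactly. A minimal sketch of that round-trip, using only the dataclasses defined above; the file name and byte counts are made up:

import json
from datetime import timedelta
from exo.download.download_progress import RepoFileProgressEvent, RepoProgressEvent

file_event = RepoFileProgressEvent(
    file_path="model.safetensors", downloaded=1024, downloaded_this_session=512,
    total=4096, speed=256, eta=timedelta(seconds=12), status="in_progress")
repo_event = RepoProgressEvent(
    completed_files=0, total_files=1, downloaded_bytes=1024,
    downloaded_bytes_this_session=512, total_bytes=4096, overall_speed=256,
    overall_eta=timedelta(seconds=12),
    file_progress={"model.safetensors": file_event}, status="in_progress")

wire = json.dumps(repo_event.to_dict())  # timedeltas become float seconds
assert RepoProgressEvent.from_dict(json.loads(wire)) == repo_event  # nested events restored too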

+ 52 - 89
exo/inference/hf_helpers.py → exo/download/hf/hf_helpers.py

@@ -1,5 +1,6 @@
 import asyncio
 import aiohttp
+import json
 import os
 from urllib.parse import urljoin
 from typing import Callable, Optional, Coroutine, Any, Dict, List, Union, Literal
@@ -7,9 +8,9 @@ from datetime import datetime, timedelta
 from fnmatch import fnmatch
 from pathlib import Path
 from typing import Generator, Iterable, TypeVar, TypedDict
-from dataclasses import dataclass
 from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
 from exo.helpers import DEBUG
+from exo.download.download_progress import RepoProgressEvent, RepoFileProgressEvent, RepoProgressCallback, RepoFileProgressCallback
 
 T = TypeVar("T")
 def filter_repo_objects(
@@ -21,10 +22,8 @@ def filter_repo_objects(
 ) -> Generator[T, None, None]:
     if isinstance(allow_patterns, str):
         allow_patterns = [allow_patterns]
-
     if isinstance(ignore_patterns, str):
         ignore_patterns = [ignore_patterns]
-
     if allow_patterns is not None:
         allow_patterns = [_add_wildcard_to_directories(p) for p in allow_patterns]
     if ignore_patterns is not None:
@@ -37,18 +36,14 @@ def filter_repo_objects(
             if isinstance(item, Path):
                 return str(item)
             raise ValueError(f"Please provide `key` argument in `filter_repo_objects`: `{item}` is not a string.")
-
         key = _identity
 
     for item in items:
         path = key(item)
-
         if allow_patterns is not None and not any(fnmatch(path, r) for r in allow_patterns):
             continue
-
         if ignore_patterns is not None and any(fnmatch(path, r) for r in ignore_patterns):
             continue
-
         yield item
 
 def _add_wildcard_to_directories(pattern: str) -> str:
@@ -99,84 +94,13 @@ async def fetch_file_list(session, repo_id, revision, path=""):
             raise Exception(f"Failed to fetch file list: {response.status}")
 
 
-@dataclass
-class HFRepoFileProgressEvent:
-    file_path: str
-    downloaded: int
-    downloaded_this_session: int
-    total: int
-    speed: int
-    eta: timedelta
-    status: Literal["not_started", "in_progress", "complete"]
-
-    def to_dict(self):
-        return {
-            "file_path": self.file_path,
-            "downloaded": self.downloaded,
-            "downloaded_this_session": self.downloaded_this_session,
-            "total": self.total,
-            "speed": self.speed,
-            "eta": self.eta.total_seconds(),
-            "status": self.status
-        }
-
-    @classmethod
-    def from_dict(cls, data):
-        # Convert eta from seconds back to timedelta
-        if 'eta' in data:
-            data['eta'] = timedelta(seconds=data['eta'])
-        return cls(**data)
-
-@dataclass
-class HFRepoProgressEvent:
-    completed_files: int
-    total_files: int
-    downloaded_bytes: int
-    downloaded_bytes_this_session: int
-    total_bytes: int
-    overall_speed: int
-    overall_eta: timedelta
-    file_progress: Dict[str, HFRepoFileProgressEvent]
-    status: Literal["not_started", "in_progress", "complete"]
-
-    def to_dict(self):
-        return {
-            "completed_files": self.completed_files,
-            "total_files": self.total_files,
-            "downloaded_bytes": self.downloaded_bytes,
-            "downloaded_bytes_this_session": self.downloaded_bytes_this_session,
-            "total_bytes": self.total_bytes,
-            "overall_speed": self.overall_speed,
-            "overall_eta": self.overall_eta.total_seconds(),
-            "file_progress": {k: v.to_dict() for k, v in self.file_progress.items()},
-            "status": self.status
-        }
-
-    @classmethod
-    def from_dict(cls, data):
-        # Convert overall_eta from seconds back to timedelta
-        if 'overall_eta' in data:
-            data['overall_eta'] = timedelta(seconds=data['overall_eta'])
-
-        # Parse file_progress
-        if 'file_progress' in data:
-            data['file_progress'] = {
-                k: HFRepoFileProgressEvent.from_dict(v)
-                for k, v in data['file_progress'].items()
-            }
-
-        return cls(**data)
-
-HFRepoFileProgressCallback = Callable[[HFRepoFileProgressEvent], Coroutine[Any, Any, None]]
-HFRepoProgressCallback = Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]
-
 @retry(
     stop=stop_after_attempt(5),
     wait=wait_exponential(multiplier=1, min=4, max=60),
     retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError, aiohttp.ClientResponseError)),
     reraise=True
 )
-async def download_file(session: aiohttp.ClientSession, repo_id: str, revision: str, file_path: str, save_directory: str, progress_callback: Optional[HFRepoFileProgressCallback] = None, use_range_request: bool = True):
+async def download_file(session: aiohttp.ClientSession, repo_id: str, revision: str, file_path: str, save_directory: str, progress_callback: Optional[RepoFileProgressCallback] = None, use_range_request: bool = True):
     base_url = f"https://huggingface.co/{repo_id}/resolve/{revision}/"
     url = urljoin(base_url, file_path)
     local_path = os.path.join(save_directory, file_path)
@@ -198,7 +122,7 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
         if downloaded_size == total_size:
             if DEBUG >= 2: print(f"File already downloaded: {file_path}")
             if progress_callback:
-                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
+                await progress_callback(RepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
             return
 
         if response.status == 200:
@@ -221,7 +145,7 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
                 if downloaded_size == total_size:
                     if DEBUG >= 2: print(f"File fully downloaded on first pass: {file_path}")
                     if progress_callback:
-                        await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
+                        await progress_callback(RepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
                     return
             except ValueError:
                 if DEBUG >= 1: print(f"Failed to parse Content-Range header: {content_range}. Starting download from scratch...")
@@ -232,7 +156,7 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
         if downloaded_size == total_size:
             print(f"File already downloaded: {file_path}")
             if progress_callback:
-                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
+                await progress_callback(RepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
             return
 
         DOWNLOAD_CHUNK_SIZE = 32768
@@ -249,10 +173,10 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
                     eta = timedelta(seconds=remaining_size / speed) if speed > 0 else timedelta(0)
                     status = "in_progress" if downloaded_size < total_size else "complete"
                     if DEBUG >= 8: print(f"HF repo file download progress: {file_path=} {elapsed_time=} {speed=} Downloaded={downloaded_size}/{total_size} {remaining_size=} {eta=} {status=}")
-                    await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, speed, eta, status))
+                    await progress_callback(RepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, speed, eta, status))
         if DEBUG >= 2: print(f"Downloaded: {file_path}")
 
-async def download_all_files(repo_id: str, revision: str = "main", progress_callback: Optional[HFRepoProgressCallback] = None, allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None):
+async def download_repo_files(repo_id: str, revision: str = "main", progress_callback: Optional[RepoProgressCallback] = None, allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None) -> Path:
     repo_root = get_repo_root(repo_id)
     refs_dir = repo_root / "refs"
     snapshots_dir = repo_root / "snapshots"
@@ -283,11 +207,11 @@ async def download_all_files(repo_id: str, revision: str = "main", progress_call
         filtered_file_list = list(filter_repo_objects(file_list, allow_patterns=allow_patterns, ignore_patterns=ignore_patterns, key=lambda x: x["path"]))
         total_files = len(filtered_file_list)
         total_bytes = sum(file["size"] for file in filtered_file_list)
-        file_progress: Dict[str, HFRepoFileProgressEvent] = {file["path"]: HFRepoFileProgressEvent(file["path"], 0, 0, file["size"], 0, timedelta(0), "not_started") for file in filtered_file_list}
+        file_progress: Dict[str, RepoFileProgressEvent] = {file["path"]: RepoFileProgressEvent(file["path"], 0, 0, file["size"], 0, timedelta(0), "not_started") for file in filtered_file_list}
         start_time = datetime.now()
 
         async def download_with_progress(file_info, progress_state):
-            async def file_progress_callback(event: HFRepoFileProgressEvent):
+            async def file_progress_callback(event: RepoFileProgressEvent):
                 progress_state['downloaded_bytes'] += event.downloaded - file_progress[event.file_path].downloaded
                 progress_state['downloaded_bytes_this_session'] += event.downloaded_this_session - file_progress[event.file_path].downloaded_this_session
                 file_progress[event.file_path] = event
@@ -297,21 +221,60 @@ async def download_all_files(repo_id: str, revision: str = "main", progress_call
                     remaining_bytes = total_bytes - progress_state['downloaded_bytes']
                     overall_eta = timedelta(seconds=remaining_bytes / overall_speed) if overall_speed > 0 else timedelta(seconds=0)
                     status = "in_progress" if progress_state['downloaded_bytes'] < total_bytes else "complete"
-                    await progress_callback(HFRepoProgressEvent(progress_state['completed_files'], total_files, progress_state['downloaded_bytes'], progress_state['downloaded_bytes_this_session'], total_bytes, overall_speed, overall_eta, file_progress, status))
+                    await progress_callback(RepoProgressEvent(progress_state['completed_files'], total_files, progress_state['downloaded_bytes'], progress_state['downloaded_bytes_this_session'], total_bytes, overall_speed, overall_eta, file_progress, status))
 
             await download_file(session, repo_id, revision, file_info["path"], snapshot_dir, file_progress_callback)
             progress_state['completed_files'] += 1
-            file_progress[file_info["path"]] = HFRepoFileProgressEvent(file_info["path"], file_info["size"], file_progress[file_info["path"]].downloaded_this_session, file_info["size"], 0, timedelta(0), "complete")
+            file_progress[file_info["path"]] = RepoFileProgressEvent(file_info["path"], file_info["size"], file_progress[file_info["path"]].downloaded_this_session, file_info["size"], 0, timedelta(0), "complete")
             if progress_callback:
                 elapsed_time = (datetime.now() - start_time).total_seconds()
                 overall_speed = int(progress_state['downloaded_bytes_this_session'] / elapsed_time) if elapsed_time > 0 else 0
                 remaining_bytes = total_bytes - progress_state['downloaded_bytes']
                 overall_eta = timedelta(seconds=remaining_bytes / overall_speed) if overall_speed > 0 else timedelta(seconds=0)
                 status = "in_progress" if progress_state['completed_files'] < total_files else "complete"
-                await progress_callback(HFRepoProgressEvent(progress_state['completed_files'], total_files, progress_state['downloaded_bytes'], progress_state['downloaded_bytes_this_session'], total_bytes, overall_speed, overall_eta, file_progress, status))
+                await progress_callback(RepoProgressEvent(progress_state['completed_files'], total_files, progress_state['downloaded_bytes'], progress_state['downloaded_bytes_this_session'], total_bytes, overall_speed, overall_eta, file_progress, status))
 
         progress_state = {'completed_files': 0, 'downloaded_bytes': 0, 'downloaded_bytes_this_session': 0}
         tasks = [download_with_progress(file_info, progress_state) for file_info in filtered_file_list]
         await asyncio.gather(*tasks)
 
     return snapshot_dir
+
+async def get_weight_map(repo_id: str, revision: str = "main") -> Optional[Dict[str, str]]:
+    """
+    Retrieve the weight map from the model.safetensors.index.json file.
+
+    Args:
+        repo_id (str): The Hugging Face repository ID.
+        revision (str): The revision of the repository to use.
+
+    Returns:
+        Optional[Dict[str, str]]: The weight map if it exists, otherwise None.
+    """
+
+    # Download the index file
+    await download_repo_files(
+        repo_id=repo_id,
+        revision=revision,
+        allow_patterns="model.safetensors.index.json"
+    )
+
+    # Check if the file exists
+    repo_root = get_repo_root(repo_id)
+    snapshot_dir = repo_root / "snapshots"
+    index_file = next(snapshot_dir.glob("*/model.safetensors.index.json"), None)
+
+    if index_file and index_file.exists():
+        with open(index_file, 'r') as f:
+            index_data = json.load(f)
+        return index_data.get("weight_map")
+
+    return None
+
+def extract_layer_num(tensor_name: str) -> Optional[int]:
+    # This is a simple example and might need to be adjusted based on the actual naming convention
+    parts = tensor_name.split('.')
+    for part in parts:
+        if part.isdigit():
+            return int(part)
+    return None
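Together, these two helpers let a caller map a layer range to the subset of safetensors files it actually needs. A sketch under the common Hugging Face naming convention (model.layers.N....); real repos may name tensors differently, which is why extract_layer_num stays deliberately loose:

from exo.download.hf.hf_helpers import extract_layer_num

# Hypothetical weight map, shaped like the output of get_weight_map
weight_map = {
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
}
# Files needed for layers 0..15; embeddings have no layer number and fall through
needed = {f for t, f in weight_map.items()
          if (n := extract_layer_num(t)) is not None and 0 <= n <= 15}
assert needed == {"model-00001-of-00002.safetensors"}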

+ 76 - 0
exo/download/hf/hf_shard_download.py

@@ -0,0 +1,76 @@
+import asyncio
+from pathlib import Path
+from typing import Dict, List, Tuple
+from exo.inference.shard import Shard
+from exo.download.shard_download import ShardDownloader
+from exo.download.download_progress import RepoProgressEvent
+from exo.download.hf.hf_helpers import download_repo_files, get_repo_root, get_weight_map, extract_layer_num
+from exo.helpers import AsyncCallbackSystem
+
+class HFShardDownloader(ShardDownloader):
+    def __init__(self):
+        self.active_downloads: List[Tuple[Shard, asyncio.Task]] = []
+        self._on_progress = AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]()
+
+    async def ensure_shard(self, shard: Shard) -> Path:
+        # Cancel any overlapping downloads
+        to_remove = []
+        for active_shard, task in self.active_downloads:
+            if shard.overlaps(active_shard):
+                task.cancel()
+                try:
+                    await task
+                except asyncio.CancelledError:
+                    pass  # This is expected when cancelling a task
+                to_remove.append((active_shard, task))
+
+        # Remove cancelled downloads from the list
+        for item in to_remove:
+            self.active_downloads.remove(item)
+
+        # Start new download
+        download_task = asyncio.create_task(self._download_shard(shard))
+        self.active_downloads.append((shard, download_task))
+
+        try:
+            return await download_task
+        finally:
+            # Ensure the task is removed even if an exception occurs
+            if (shard, download_task) in self.active_downloads:
+                self.active_downloads.remove((shard, download_task))
+
+    async def _download_shard(self, shard: Shard) -> Path:
+        async def wrapped_progress_callback(event: RepoProgressEvent):
+            self._on_progress.trigger_all(shard, event)
+
+        weight_map = await get_weight_map(shard.model_id)
+        allow_patterns = self._get_allow_patterns(weight_map, shard.start_layer, shard.end_layer)
+
+        return await download_repo_files(
+            repo_id=shard.model_id,
+            progress_callback=wrapped_progress_callback,
+            allow_patterns=allow_patterns
+        )
+
+    @staticmethod
+    def _get_allow_patterns(weight_map: Dict[str, str], start_layer: int, end_layer: int) -> List[str]:
+        default_patterns = [
+            "*.json",
+            "*.py",
+            "tokenizer.model",
+            "*.tiktoken",
+            "*.txt",
+        ]
+        shard_specific_patterns = []
+        if weight_map:
+            for tensor_name, filename in weight_map.items():
+                layer_num = extract_layer_num(tensor_name)
+                if layer_num is not None and start_layer <= layer_num <= end_layer:
+                    shard_specific_patterns.append(filename)
+        else:
+            shard_specific_patterns = ["*.safetensors"]
+        return list(set(default_patterns + shard_specific_patterns))  # Remove duplicates
+
+    @property
+    def on_progress(self) -> AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]:
+        return self._on_progress
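A usage sketch for the new downloader; the model_id matches one of the repos listed in the table removed from sharded_utils.py below, but any Hugging Face repo id would do:

import asyncio
from exo.inference.shard import Shard
from exo.download.hf.hf_shard_download import HFShardDownloader

async def main():
    downloader = HFShardDownloader()
    # Callbacks receive (shard, event), mirroring trigger_all above
    downloader.on_progress.register("log").on_next(
        lambda shard, event: print(f"{shard.model_id}: {event.downloaded_bytes}/{event.total_bytes} bytes"))
    shard = Shard(model_id="mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
                  start_layer=0, end_layer=15, n_layers=32)
    print(f"shard files under {await downloader.ensure_shard(shard)}")

asyncio.run(main())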

+ 25 - 0
exo/download/shard_download.py

@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple
+from pathlib import Path
+from exo.inference.shard import Shard
+from exo.download.download_progress import RepoProgressEvent
+from exo.helpers import AsyncCallbackSystem
+
+class ShardDownloader(ABC):
+    @abstractmethod
+    async def ensure_shard(self, shard: Shard) -> Path:
+        """
+        Ensures that the shard is downloaded.
+        Does not allow multiple overlapping downloads at once.
+        If you try to download a Shard which overlaps a Shard that is already being downloaded,
+        the download will be cancelled and a new download will start.
+
+        Args:
+            shard (Shard): The shard to download.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def on_progress(self) -> AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]:
+        pass
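This abstract class is the plug-in point the commit message refers to. A hypothetical alternative implementation that resolves shards from a pre-seeded local directory, to show how little an implementation needs to provide (LocalShardDownloader is illustrative, not part of this commit):

from pathlib import Path
from typing import Tuple
from exo.inference.shard import Shard
from exo.download.shard_download import ShardDownloader
from exo.download.download_progress import RepoProgressEvent
from exo.helpers import AsyncCallbackSystem

class LocalShardDownloader(ShardDownloader):
    def __init__(self, base_dir: Path):
        self.base_dir = base_dir
        self._on_progress = AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]()

    async def ensure_shard(self, shard: Shard) -> Path:
        # No network: just verify the files were staged ahead of time
        path = self.base_dir / shard.model_id.replace("/", "--")
        if not path.exists():
            raise FileNotFoundError(f"no local copy of {shard.model_id} under {self.base_dir}")
        return path

    @property
    def on_progress(self) -> AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]:
        return self._on_progress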

+ 3 - 3
exo/helpers.py

@@ -31,17 +31,17 @@ def get_system_info():
   return "Non-Mac, non-Linux system"
 
 
-def get_inference_engine(inference_engine_name):
+def get_inference_engine(inference_engine_name, shard_downloader: 'ShardDownloader'):
   if inference_engine_name == "mlx":
     from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
 
-    return MLXDynamicShardInferenceEngine()
+    return MLXDynamicShardInferenceEngine(shard_downloader)
   elif inference_engine_name == "tinygrad":
     from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
     import tinygrad.helpers
     tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))
 
-    return TinygradDynamicShardInferenceEngine()
+    return TinygradDynamicShardInferenceEngine(shard_downloader)
   else:
     raise ValueError(f"Inference engine {inference_engine_name} not supported")
 

+ 1 - 6
exo/inference/inference_engine.py

@@ -1,9 +1,8 @@
 import numpy as np
 
-from typing import Tuple, Optional, Callable, Coroutine, Any
+from typing import Tuple, Optional
 from abc import ABC, abstractmethod
 from .shard import Shard
-from exo.inference.hf_helpers import HFRepoProgressEvent
 
 class InferenceEngine(ABC):
   @abstractmethod
@@ -13,7 +12,3 @@ class InferenceEngine(ABC):
   @abstractmethod
   async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
     pass
-
-  @abstractmethod
-  def set_progress_callback(self, progress_callback: Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]):
-    pass

+ 6 - 8
exo/inference/mlx/sharded_inference_engine.py

@@ -4,14 +4,14 @@ from ..inference_engine import InferenceEngine
 from .sharded_model import StatefulShardedModel
 from .sharded_utils import load_shard, get_image_from_str
 from ..shard import Shard
-from typing import Optional, Callable
-from exo.inference.hf_helpers import HFRepoProgressCallback
+from typing import Optional
+from exo.download.shard_download import ShardDownloader
 
 
 class MLXDynamicShardInferenceEngine(InferenceEngine):
-  def __init__(self, progress_callback: Optional[HFRepoProgressCallback] = None):
+  def __init__(self, shard_downloader: ShardDownloader):
     self.shard = None
-    self.progress_callback = progress_callback
+    self.shard_downloader = shard_downloader
 
   async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
     await self.ensure_shard(shard)
@@ -34,9 +34,7 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
     if self.shard == shard:
       return
 
-    model_shard, self.tokenizer = await load_shard(shard.model_id, shard, progress_callback=self.progress_callback)
+    model_path = await self.shard_downloader.ensure_shard(shard)
+    model_shard, self.tokenizer = await load_shard(model_path, shard)
     self.stateful_sharded_model = StatefulShardedModel(shard, model_shard)
     self.shard = shard
-
-  def set_progress_callback(self, progress_callback: HFRepoProgressCallback):
-    self.progress_callback = progress_callback

+ 1 - 175
exo/inference/mlx/sharded_utils.py

@@ -15,18 +15,12 @@ import base64
 
 import mlx.core as mx
 import mlx.nn as nn
-from huggingface_hub import snapshot_download, list_repo_tree, get_paths_info
-from huggingface_hub.utils import filter_repo_objects
-from huggingface_hub.file_download import repo_folder_name
-from huggingface_hub.constants import HF_HUB_CACHE
-from huggingface_hub.utils._errors import RepositoryNotFoundError
 from transformers import AutoProcessor
 
 from mlx_lm.tokenizer_utils import load_tokenizer, TokenizerWrapper
 from mlx_lm.tuner.utils import apply_lora_layers
 
 from exo import DEBUG
-from exo.inference.hf_helpers import download_all_files, HFRepoProgressCallback
 from ..shard import Shard
 
 
@@ -162,182 +156,14 @@ def load_model_shard(
   model.eval()
   return model
 
-
-repo_id_safetensors_layers = {
-  "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit": {
-    "model.safetensors": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
-  },
-  "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit": {
-    "model-00001-of-00008.safetensors": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
-    "model-00002-of-00008.safetensors": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
-    "model-00003-of-00008.safetensors": [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
-    "model-00004-of-00008.safetensors": [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42],
-    "model-00005-of-00008.safetensors": [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53],
-    "model-00006-of-00008.safetensors": [53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64],
-    "model-00007-of-00008.safetensors": [64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75],
-    "model-00008-of-00008.safetensors": [75, 76, 77, 78, 79],
-  },
-  "mlx-community/Meta-Llama-3.1-405B-Instruct-4bit": {
-    "model-00001-of-00046.safetensors": [0, 1, 2],
-    "model-00002-of-00046.safetensors": [2, 3, 4, 5],
-    "model-00003-of-00046.safetensors": [5, 6, 7],
-    "model-00004-of-00046.safetensors": [8, 9, 10],
-    "model-00005-of-00046.safetensors": [10, 11, 12, 13],
-    "model-00006-of-00046.safetensors": [13, 14, 15, 16],
-    "model-00007-of-00046.safetensors": [16, 17, 18, 19],
-    "model-00008-of-00046.safetensors": [19, 20, 21],
-    "model-00009-of-00046.safetensors": [22, 23, 24],
-    "model-00010-of-00046.safetensors": [24, 25, 26, 27],
-    "model-00011-of-00046.safetensors": [27, 28, 29, 30],
-    "model-00012-of-00046.safetensors": [30, 31, 32, 33],
-    "model-00013-of-00046.safetensors": [33, 34, 35],
-    "model-00014-of-00046.safetensors": [36, 37, 38],
-    "model-00015-of-00046.safetensors": [38, 39, 40, 41],
-    "model-00016-of-00046.safetensors": [41, 42, 43, 44],
-    "model-00017-of-00046.safetensors": [44, 45, 46, 47],
-    "model-00018-of-00046.safetensors": [47, 48, 49],
-    "model-00019-of-00046.safetensors": [50, 51, 52],
-    "model-00020-of-00046.safetensors": [52, 53, 54, 55],
-    "model-00021-of-00046.safetensors": [55, 56, 57, 58],
-    "model-00022-of-00046.safetensors": [58, 59, 60, 61],
-    "model-00023-of-00046.safetensors": [61, 62, 63],
-    "model-00024-of-00046.safetensors": [64, 65, 66],
-    "model-00025-of-00046.safetensors": [66, 67, 68, 69],
-    "model-00026-of-00046.safetensors": [69, 70, 71, 72],
-    "model-00027-of-00046.safetensors": [72, 73, 74, 75],
-    "model-00028-of-00046.safetensors": [75, 76, 77],
-    "model-00029-of-00046.safetensors": [78, 79, 80],
-    "model-00030-of-00046.safetensors": [80, 81, 82, 83],
-    "model-00031-of-00046.safetensors": [83, 84, 85, 86],
-    "model-00032-of-00046.safetensors": [86, 87, 88, 89],
-    "model-00033-of-00046.safetensors": [89, 90, 91],
-    "model-00034-of-00046.safetensors": [92, 93, 94],
-    "model-00035-of-00046.safetensors": [94, 95, 96, 97],
-    "model-00036-of-00046.safetensors": [97, 98, 99, 100],
-    "model-00037-of-00046.safetensors": [100, 101, 102, 103],
-    "model-00038-of-00046.safetensors": [103, 104, 105],
-    "model-00039-of-00046.safetensors": [106, 107, 108],
-    "model-00040-of-00046.safetensors": [108, 109, 110, 111],
-    "model-00041-of-00046.safetensors": [111, 112, 113, 114],
-    "model-00042-of-00046.safetensors": [114, 115, 116, 117],
-    "model-00043-of-00046.safetensors": [117, 118, 119],
-    "model-00044-of-00046.safetensors": [120, 121, 122],
-    "model-00045-of-00046.safetensors": [122, 123, 124, 125],
-    "model-00046-of-00046.safetensors": [125]
-  },
-  "mlx-community/Mistral-Nemo-Instruct-2407-4bit": {
-    "model-00001-of-00002.safetensors": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],
-    "model-00002-of-00002.safetensors": [32, 33, 34, 35, 36, 37, 38, 39],
-  },
-  "mlx-community/Mistral-Large-Instruct-2407-4bit": {
-    "model-00001-of-00014.safetensors": [0, 1, 2, 3, 4, 5, 6],
-    "model-00002-of-00014.safetensors": [6, 7, 8, 9, 10, 11, 12, 13],
-    "model-00003-of-00014.safetensors": [13, 14, 15, 16, 17, 18, 19, 20],
-    "model-00004-of-00014.safetensors": [20, 21, 22, 23, 24, 25, 26],
-    "model-00005-of-00014.safetensors": [27, 28, 29, 30, 31, 32, 33],
-    "model-00006-of-00014.safetensors": [33, 34, 35, 36, 37, 38, 39, 40],
-    "model-00007-of-00014.safetensors": [40, 41, 42, 43, 44, 45, 46, 47],
-    "model-00008-of-00014.safetensors": [47, 48, 49, 50, 51, 52, 53, 54],
-    "model-00009-of-00014.safetensors": [54, 55, 56, 57, 58, 59, 60],
-    "model-00010-of-00014.safetensors": [61, 62, 63, 64, 65, 66, 67],
-    "model-00011-of-00014.safetensors": [67, 68, 69, 70, 71, 72, 73, 74],
-    "model-00012-of-00014.safetensors": [74, 75, 76, 77, 78, 79, 80, 81],
-    "model-00013-of-00014.safetensors": [81, 82, 83, 84, 85, 86, 87],
-    "model-00014-of-00014.safetensors": [87]
-  },
-  "llava-hf/llava-1.5-7b-hf": {
-    "model-00001-of-00003.safetensors": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
-    "model-00002-of-00003.safetensors": [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
-    "model-00003-of-00003.safetensors": [22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
-  }
-}
-
-def get_safetensors_allow_patterns(repo_id: str, shard: Optional[Shard] = None):
-    return ["*.safetensors"] # TODO: enable this
-    if not shard:
-      return ["*.safetensors"]
-
-    allow_patterns = []
-    for repo_id, safetensors_layers in repo_id_safetensors_layers.items():
-        if repo_id == shard.model_id:
-            for safetensor, layers in safetensors_layers.items():
-                if any(shard.start_layer <= layer <= shard.end_layer for layer in layers):
-                    allow_patterns.append(safetensor)
-
-    return allow_patterns if len(allow_patterns) > 0 else ["*.safetensors"]
-
-async def get_model_path(path_or_hf_repo: str, shard: Optional[Shard] = None, revision: str = "main", progress_callback: Optional[HFRepoProgressCallback] = None) -> Path:
-  """
-  Ensures the model is available locally. If the path does not exist locally,
-  it is downloaded from the Hugging Face Hub.
-
-  Args:
-   path_or_hf_repo (str): The local path or Hugging Face repository ID of the model.
-   revision (str, optional): A revision id which can be a branch name, a tag, or a commit hash.
-
-  Returns:
-   Path: The path to the model.
-  """
-  model_path = Path(path_or_hf_repo)
-  if not model_path.exists():
-    try:
-      model_path = Path(
-        await download_all_files(
-          repo_id=path_or_hf_repo,
-          revision=revision,
-          allow_patterns=[
-            "*.json",
-            "*.py",
-            "tokenizer.model",
-            "*.tiktoken",
-            "*.txt",
-          ] + get_safetensors_allow_patterns(path_or_hf_repo, shard),
-          progress_callback=progress_callback,
-        )
-      )
-    except RepositoryNotFoundError:
-      raise ModelNotFoundError(
-        f"Model not found for path or HF repo: {path_or_hf_repo}.\n"
-        "Please make sure you specified the local path or Hugging Face"
-        " repo id correctly.\nIf you are trying to access a private or"
-        " gated Hugging Face repo, make sure you are authenticated:\n"
-        "https://huggingface.co/docs/huggingface_hub/en/guides/cli#huggingface-cli-login"
-      ) from None
-  return model_path
-
-
 async def load_shard(
-  path_or_hf_repo: str,
+  model_path: str,
   shard: Shard,
   tokenizer_config={},
   model_config={},
   adapter_path: Optional[str] = None,
   lazy: bool = False,
-  progress_callback: Optional[HFRepoProgressCallback] = None,
 ) -> Tuple[nn.Module, TokenizerWrapper]:
-  """
-  Load the model and tokenizer from a given path or a huggingface repository.
-
-  Args:
-   path_or_hf_repo (Path): The path or the huggingface repository to load the model from.
-   tokenizer_config (dict, optional): Configuration parameters specifically for the tokenizer.
-    Defaults to an empty dictionary.
-   model_config(dict, optional): Configuration parameters specifically for the model.
-    Defaults to an empty dictionary.
-   adapter_path (str, optional): Path to the LoRA adapters. If provided, applies LoRA layers
-    to the model. Default: ``None``.
-   lazy (bool): If False eval the model parameters to make sure they are
-    loaded in memory before returning, otherwise they will be loaded
-    when needed. Default: ``False``
-  Returns:
-   Tuple[nn.Module, TokenizerWrapper]: A tuple containing the loaded model and tokenizer.
-
-  Raises:
-   FileNotFoundError: If config file or safetensors are not found.
-   ValueError: If model class or args class are not found.
-  """
-  model_path = await get_model_path(path_or_hf_repo, shard, progress_callback=progress_callback)
-
   model = load_model_shard(model_path, shard, lazy, model_config)
   if adapter_path is not None:
     model = apply_lora_layers(model, adapter_path)

+ 9 - 0
exo/inference/shard.py

@@ -24,3 +24,12 @@ class Shard:
       "end_layer": self.end_layer,
       "n_layers": self.n_layers,
     }
+
+  def overlaps(self, other: 'Shard') -> bool:
+    return shards_overlap(self, other)
+
+def shards_overlap(shard1: Shard, shard2: Shard) -> bool:
+  return (
+      shard1.model_id == shard2.model_id
+      and max(shard1.start_layer, shard2.start_layer) <= min(shard1.end_layer, shard2.end_layer)
+  )
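Note the comparison is inclusive on both ends, so two shards that merely share a boundary layer count as overlapping, while shards of different models never do. A quick check of those semantics (assuming Shard is constructed with the fields shown above):

from exo.inference.shard import Shard

a = Shard(model_id="m", start_layer=0, end_layer=15, n_layers=32)
b = Shard(model_id="m", start_layer=15, end_layer=31, n_layers=32)
c = Shard(model_id="m", start_layer=16, end_layer=31, n_layers=32)
assert a.overlaps(b)      # shares layer 15
assert not a.overlaps(c)  # disjoint ranges
assert not a.overlaps(Shard(model_id="other", start_layer=0, end_layer=15, n_layers=32))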

+ 5 - 8
exo/inference/tinygrad/inference.py

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import List, Optional, Union, Callable, Coroutine, Any
+from typing import List, Optional
 import json
 from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16
 from tinygrad.nn.state import safe_load, torch_load, load_state_dict
@@ -8,7 +8,7 @@ from tinygrad.helpers import tqdm
 from exo.inference.shard import Shard
 from exo.inference.inference_engine import InferenceEngine
 import numpy as np
-from exo.inference.hf_helpers import HFRepoProgressCallback, HFRepoProgressEvent, download_all_files
+from exo.download.shard_download import ShardDownloader
 
 MODEL_PARAMS = {
   "8B": {
@@ -147,9 +147,9 @@ def prefill(model, toks, start_pos=0):
 
 
 class TinygradDynamicShardInferenceEngine(InferenceEngine):
-  def __init__(self, progress_callback: Optional[HFRepoProgressCallback] = None):
+  def __init__(self, shard_downloader: ShardDownloader):
     self.shard = None
-    self.progress_callback = progress_callback
+    self.shard_downloader = shard_downloader
 
   async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
     # TODO: we need to refactor models/llamaa to handle per-request-kv-cache. right now it's shared between requests.
@@ -188,7 +188,7 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
     if self.shard == shard:
       return
 
-    model_path = await download_all_files(shard.model_id, progress_callback=self.progress_callback)
+    model_path = await self.shard_downloader.ensure_shard(shard)
     print(f"{model_path=}")
     model = build_transformer(model_path, shard=shard, model_size="8B" if "8b" in shard.model_id else "70B" if "70b" in shard.model_id else "8B")
     from transformers import AutoTokenizer
@@ -197,6 +197,3 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
     self.shard = shard
     self.model = model
     self.tokenizer = tokenizer
-
-  def set_progress_callback(self, progress_callback: Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]):
-    self.progress_callback = progress_callback

+ 8 - 8
exo/orchestration/standard_node.py

@@ -14,7 +14,7 @@ from exo.topology.partitioning_strategy import Partition, PartitioningStrategy,
 from exo import DEBUG
 from exo.helpers import AsyncCallbackSystem
 from exo.viz.topology_viz import TopologyViz
-from exo.inference.hf_helpers import HFRepoProgressEvent
+from exo.download.hf.hf_helpers import RepoProgressEvent
 
 
 class StandardNode(Node):
@@ -25,7 +25,7 @@ class StandardNode(Node):
     inference_engine: InferenceEngine,
     discovery: Discovery,
     partitioning_strategy: PartitioningStrategy = None,
-    max_generate_tokens: int = 256,
+    max_generate_tokens: int = 1024,
     chatgpt_api_endpoint: Optional[str] = None,
     web_chat_url: Optional[str] = None,
     disable_tui: Optional[bool] = False,
@@ -44,6 +44,7 @@ class StandardNode(Node):
     self._on_token = AsyncCallbackSystem[str, Tuple[str, List[int], bool]]()
     self._on_opaque_status = AsyncCallbackSystem[str, Tuple[str, str]]()
     self._on_opaque_status.register("node_status").on_next(self.on_node_status)
+    self.node_download_progress: Dict[str, RepoProgressEvent] = {}
 
   def on_node_status(self, request_id, opaque_status):
     try:
@@ -57,14 +58,13 @@ class StandardNode(Node):
       download_progress = None
       if status_data.get("type", "") == "download_progress":
         if DEBUG >= 5: print(f"Download progress from {status_data.get('node_id')}: {status_data.get('progress')}")
-        if status_data.get("node_id") == self.id:
-          download_progress = HFRepoProgressEvent.from_dict(status_data.get('progress'))
+        download_progress = RepoProgressEvent.from_dict(status_data.get('progress'))
+        self.node_download_progress[status_data.get('node_id')] = download_progress
       if self.topology_viz:
-        self.topology_viz.update_visualization(self.current_topology, self.partitioning_strategy.partition(self.current_topology), download_progress)
+        self.topology_viz.update_visualization(self.current_topology, self.partitioning_strategy.partition(self.current_topology), self.id, self.node_download_progress)
     except Exception as e:
       if DEBUG >= 1: print(f"Error updating visualization: {e}")
-      traceback.print_exc()
-      pass
+      if DEBUG >= 1: traceback.print_exc()
 
   async def start(self, wait_for_peers: int = 0) -> None:
     await self.server.start()
@@ -347,7 +347,7 @@ class StandardNode(Node):
     next_topology.active_node_id = self.topology.active_node_id  # this is not so clean.
     self.topology = next_topology
     if self.topology_viz:
-      self.topology_viz.update_visualization(self.current_topology, self.partitioning_strategy.partition(self.current_topology))
+      self.topology_viz.update_visualization(self.current_topology, self.partitioning_strategy.partition(self.current_topology), self.id)
     return next_topology
 
   @property

+ 61 - 1
exo/viz/test_topology_viz.py

@@ -1,9 +1,57 @@
 import asyncio
 import unittest
+from datetime import timedelta
 from exo.viz.topology_viz import TopologyViz
 from exo.topology.topology import Topology
 from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
 from exo.topology.partitioning_strategy import Partition
+from exo.download.hf.hf_helpers import RepoProgressEvent, RepoFileProgressEvent
+
+
+def create_hf_repo_progress_event(
+    completed_files: int = 5,
+    total_files: int = 10,
+    downloaded_bytes: int = 500000000,
+    downloaded_bytes_this_session: int = 250000000,
+    total_bytes: int = 1000000000,
+    overall_speed: int = 5000000,
+    overall_eta: timedelta = timedelta(seconds=100),
+    file_progress: dict = None,
+    status: str = "in_progress"
+) -> RepoProgressEvent:
+    if file_progress is None:
+        file_progress = {
+            "file1.bin": RepoFileProgressEvent(
+                file_path="file1.bin",
+                downloaded=100000000,
+                downloaded_this_session=50000000,
+                total=200000000,
+                speed=1000000,
+                eta=timedelta(seconds=100),
+                status="in_progress"
+            ),
+            "file2.bin": RepoFileProgressEvent(
+                file_path="file2.bin",
+                downloaded=200000000,
+                downloaded_this_session=100000000,
+                total=200000000,
+                speed=2000000,
+                eta=timedelta(seconds=0),
+                status="complete"
+            )
+        }
+
+    return RepoProgressEvent(
+        completed_files=completed_files,
+        total_files=total_files,
+        downloaded_bytes=downloaded_bytes,
+        downloaded_bytes_this_session=downloaded_bytes_this_session,
+        total_bytes=total_bytes,
+        overall_speed=overall_speed,
+        overall_eta=overall_eta,
+        file_progress=file_progress,
+        status=status
+    )
 
 
 class TestNodeViz(unittest.IsolatedAsyncioTestCase):
@@ -30,7 +78,7 @@ class TestNodeViz(unittest.IsolatedAsyncioTestCase):
     await asyncio.sleep(2)  # Simulate running for a short time
 
   async def test_layout_generation(self):
-    self.top_viz._generate_layout()
+    self.top_viz._generate_main_layout()
     self.top_viz.refresh()
     import time
 
@@ -43,6 +91,13 @@ class TestNodeViz(unittest.IsolatedAsyncioTestCase):
         Partition("node2", 0.4, 0.8),
         Partition("node3", 0.8, 0.9),
       ],
+      "node1",
+      {
+        "node1": create_hf_repo_progress_event(),
+        "node2": create_hf_repo_progress_event(),
+        "node3": create_hf_repo_progress_event(),
+        "node4": create_hf_repo_progress_event(),
+      },
     )
     time.sleep(2)
     self.topology.active_node_id = "node3"
@@ -54,6 +109,11 @@ class TestNodeViz(unittest.IsolatedAsyncioTestCase):
         Partition("node2", 0.5, 0.7),
         Partition("node4", 0.7, 0.9),
       ],
+      "node5",
+      {
+        "node1": create_hf_repo_progress_event(),
+        "node5": create_hf_repo_progress_event(),
+      },
     )
     time.sleep(2)
 

+ 88 - 76
exo/viz/topology_viz.py

@@ -1,17 +1,17 @@
 import math
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict
 from exo.helpers import exo_text, pretty_print_bytes, pretty_print_bytes_per_second
 from exo.topology.topology import Topology
 from exo.topology.partitioning_strategy import Partition
+from exo.download.hf.hf_helpers import RepoProgressEvent
 from rich.console import Console
 from rich.panel import Panel
 from rich.text import Text
 from rich.live import Live
 from rich.style import Style
-from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn
 from rich.table import Table
+from rich.layout import Layout
 from exo.topology.device_capabilities import UNKNOWN_DEVICE_CAPABILITIES
-from exo.inference.hf_helpers import HFRepoProgressEvent
 
 class TopologyViz:
   def __init__(self, chatgpt_api_endpoint: str = None, web_chat_url: str = None):
@@ -19,60 +19,54 @@ class TopologyViz:
     self.web_chat_url = web_chat_url
     self.topology = Topology()
     self.partitions: List[Partition] = []
-    self.download_progress = None
+    self.node_id = None
+    self.node_download_progress: Dict[str, RepoProgressEvent] = {}
 
     self.console = Console()
-    self.panel = Panel(self._generate_layout(), title="Exo Cluster (0 nodes)", border_style="bright_yellow")
-    self.live_panel = Live(self.panel, auto_refresh=False, console=self.console)
+    self.layout = Layout()
+    self.layout.split(
+      Layout(name="main"),
+      Layout(name="download", size=15)
+    )
+    self.main_panel = Panel(self._generate_main_layout(), title="Exo Cluster (0 nodes)", border_style="bright_yellow")
+    self.download_panel = Panel("", title="Download Progress", border_style="cyan")
+    self.layout["main"].update(self.main_panel)
+    self.layout["download"].update(self.download_panel)
+    self.live_panel = Live(self.layout, auto_refresh=False, console=self.console)
     self.live_panel.start()
 
-  def update_visualization(self, topology: Topology, partitions: List[Partition], download_progress: HFRepoProgressEvent = None):
+  def update_visualization(self, topology: Topology, partitions: List[Partition], node_id: Optional[str] = None, node_download_progress: Dict[str, RepoProgressEvent] = {}):
     self.topology = topology
     self.partitions = partitions
-    self.download_progress = download_progress
+    self.node_id = node_id
+    if node_download_progress:
+      self.node_download_progress = node_download_progress
     self.refresh()
 
   def refresh(self):
-    self.panel.renderable = self._generate_layout()
+    self.main_panel.renderable = self._generate_main_layout()
     # Update the panel title with the number of nodes and partitions
     node_count = len(self.topology.nodes)
-    self.panel.title = f"Exo Cluster ({node_count} node{'s' if node_count != 1 else ''})"
-    self.live_panel.update(self.panel, refresh=True)
+    self.main_panel.title = f"Exo Cluster ({node_count} node{'s' if node_count != 1 else ''})"
 
-  def _generate_layout(self) -> str:
+    # Only show download_panel if there are in-progress downloads
+    if any(progress.status == "in_progress" for progress in self.node_download_progress.values()):
+      self.download_panel.renderable = self._generate_download_layout()
+      self.layout["download"].visible = True
+    else:
+      self.layout["download"].visible = False
+
+    self.live_panel.update(self.layout, refresh=True)
+
+  def _generate_main_layout(self) -> str:
     # Calculate visualization parameters
     num_partitions = len(self.partitions)
-    radius_x = 30  # Increased horizontal radius
-    radius_y = 12  # Decreased vertical radius
-    center_x, center_y = 50, 28  # Centered horizontally and moved up slightly
+    radius_x = 30
+    radius_y = 12
+    center_x, center_y = 50, 24  # Increased center_y to add more space
 
     # Generate visualization
-    visualization = [[" " for _ in range(100)] for _ in range(55)]  # Decreased height
-
-    # Draw download first so everything else is drawn on top
-    # If a download is in progress, show the download info summary
-    if self.download_progress and self.download_progress.status != "complete":
-        download_summary = _generate_download_summary(self.download_progress)
-        download_panel = Panel(
-            download_summary,
-            title="Download Progress",
-            border_style="cyan",
-            expand=False,
-            width=96,  # Further reduced to ensure it fits within the visualization
-            height=None  # Allow the panel to adjust its height based on content
-        )
-        console = Console(width=98, height=55)  # Reduced console width
-        with console.capture() as capture:
-            console.print(download_panel)
-        download_lines = capture.get().split('\n')
-        download_start_y = 15
-        panel_width = len(max(download_lines, key=len))
-        start_x = max(1, (100 - panel_width) // 2)  # Ensure start_x is at least 1 to avoid left border cut-off
-        for i, line in enumerate(download_lines):
-            for j, char in enumerate(line):
-                if 1 <= start_x + j < 99 and download_start_y + i < 55:  # Ensure we don't write to the rightmost column
-                    visualization[download_start_y + i][start_x + j] = char
-
+    visualization = [[" " for _ in range(100)] for _ in range(48)]  # Increased height to 48
 
     # Add exo_text at the top in bright yellow
     exo_lines = exo_text.split("\n")
@@ -80,7 +74,7 @@ class TopologyViz:
     max_line_length = max(len(line) for line in exo_lines)
     for i, line in enumerate(exo_lines):
       centered_line = line.center(max_line_length)
-      start_x = (100 - max_line_length) // 2 + 15  # Center the text plus empirical adjustment of 15
+      start_x = (100 - max_line_length) // 2 + 15
       colored_line = Text(centered_line, style=yellow_style)
       for j, char in enumerate(str(colored_line)):
         if 0 <= start_x + j < 100 and i < len(visualization):
@@ -95,9 +89,9 @@ class TopologyViz:
 
     info_start_y = len(exo_lines) + 1
     for i, line in enumerate(info_lines):
-      start_x = (100 - len(line)) // 2 + 15  # Center the info lines plus empirical adjustment of 15
+      start_x = (100 - len(line)) // 2 + 15
       for j, char in enumerate(line):
-        if 0 <= start_x + j < 100 and info_start_y + i < 55:
+        if 0 <= start_x + j < 100 and info_start_y + i < 48:
           visualization[info_start_y + i][start_x + j] = char
 
     # Calculate total FLOPS and position on the bar
@@ -105,13 +99,13 @@ class TopologyViz:
     bar_pos = (math.tanh(total_flops / 20 - 2) + 1) / 2
 
     # Add GPU poor/rich bar
-    bar_width = 30  # Increased bar width
-    bar_start_x = (100 - bar_width) // 2  # Center the bar
-    bar_y = info_start_y + len(info_lines) + 1  # Position the bar below the info section with two cells of space
+    bar_width = 30
+    bar_start_x = (100 - bar_width) // 2
+    bar_y = info_start_y + len(info_lines) + 1
 
     # Create a gradient bar using emojis
     gradient_bar = Text()
-    emojis = ["🟥", "🟧", "🟨", "🟩"]  # Red, Orange, Yellow, Green
+    emojis = ["🟥", "🟧", "🟨", "🟩"]
     for i in range(bar_width):
       emoji_index = min(int(i / (bar_width / len(emojis))), len(emojis) - 1)
       gradient_bar.append(emojis[emoji_index])
@@ -133,6 +127,9 @@ class TopologyViz:
     visualization[bar_y + 1][pos_x - len(flops_str) // 2 : pos_x + len(flops_str) // 2 + len(flops_str) % 2] = flops_str
     visualization[bar_y + 2][pos_x] = "▲"
 
+    # Add an extra empty line for spacing
+    bar_y += 4
+
     for i, partition in enumerate(self.partitions):
       device_capabilities = self.topology.nodes.get(partition.node_id, UNKNOWN_DEVICE_CAPABILITIES)
 
@@ -140,11 +137,13 @@ class TopologyViz:
       x = int(center_x + radius_x * math.cos(angle))
       y = int(center_y + radius_y * math.sin(angle))
 
-      # Place node with different color for active node
+      # Place node with different color for active node and this node
       if partition.node_id == self.topology.active_node_id:
-        visualization[y][x] = "🔴"  # Red circle for active node
+        visualization[y][x] = "🔴"
+      elif partition.node_id == self.node_id:
+        visualization[y][x] = "🟢"
       else:
-        visualization[y][x] = "🔵"  # Blue circle for inactive nodes
+        visualization[y][x] = "🔵"
 
       # Place node info (model, memory, TFLOPS, partition) on three lines
       node_info = [
@@ -154,28 +153,27 @@ class TopologyViz:
       ]
 
       # Calculate info position based on angle
-      info_distance_x = radius_x + 6  # Increased horizontal distance
-      info_distance_y = radius_y + 3  # Decreased vertical distance
+      info_distance_x = radius_x + 6
+      info_distance_y = radius_y + 3
       info_x = int(center_x + info_distance_x * math.cos(angle))
       info_y = int(center_y + info_distance_y * math.sin(angle))
 
       # Adjust text position to avoid overwriting the node icon and prevent cutoff
-      if info_x < x:  # Text is to the left of the node
+      if info_x < x:
         info_x = max(0, x - len(max(node_info, key=len)) - 1)
-      elif info_x > x:  # Text is to the right of the node
+      elif info_x > x:
         info_x = min(99 - len(max(node_info, key=len)), info_x)
 
       # Adjust for top and bottom nodes
-      if 5 * math.pi / 4 < angle < 7 * math.pi / 4:  # Node is near the top
-        info_x += 4  # Shift text slightly to the right
-      elif math.pi / 4 < angle < 3 * math.pi / 4:  # Node is near the bottom
-        info_x += 3  # Shift text slightly to the right
-        info_y -= 2  # Move text up by two cells
+      if 5 * math.pi / 4 < angle < 7 * math.pi / 4:
+        info_x += 4
+      elif math.pi / 4 < angle < 3 * math.pi / 4:
+        info_x += 3
+        info_y -= 2
 
       for j, line in enumerate(node_info):
         for k, char in enumerate(line):
-          if 0 <= info_y + j < 55 and 0 <= info_x + k < 100:  # Updated height check
-            # Ensure we're not overwriting the node icon
+          if 0 <= info_y + j < 48 and 0 <= info_x + k < 100:
             if info_y + j != y or info_x + k != x:
               visualization[info_y + j][info_x + k] = char
 
@@ -190,33 +188,47 @@ class TopologyViz:
       for step in range(1, steps):
         line_x = int(x + (next_x - x) * step / steps)
         line_y = int(y + (next_y - y) * step / steps)
-        if 0 <= line_y < 55 and 0 <= line_x < 100:  # Updated height check
+        if 0 <= line_y < 48 and 0 <= line_x < 100:
           visualization[line_y][line_x] = "-"
 
     # Convert to string
     return "\n".join("".join(str(char) for char in row) for row in visualization)
 
-def _generate_download_summary(download_progress) -> Table:
+  def _generate_download_layout(self) -> Table:
     summary = Table(show_header=False, box=None, padding=(0, 1))
     summary.add_column("Info", style="cyan", no_wrap=True)
     summary.add_column("Progress", style="cyan", no_wrap=True)
     summary.add_column("Percentage", style="cyan", no_wrap=True)
 
-    title = f"Downloading model ({download_progress.completed_files}/{download_progress.total_files}):"
-    summary.add_row(Text(title, style="bold"))
-    progress_info = f"{pretty_print_bytes(download_progress.downloaded_bytes)} / {pretty_print_bytes(download_progress.total_bytes)} ({pretty_print_bytes_per_second(download_progress.overall_speed)})"
-    summary.add_row(progress_info)
+    # Current node download progress
+    if self.node_id in self.node_download_progress:
+        download_progress = self.node_download_progress[self.node_id]
+        title = f"Downloading model ({download_progress.completed_files}/{download_progress.total_files}):"
+        summary.add_row(Text(title, style="bold"))
+        progress_info = f"{pretty_print_bytes(download_progress.downloaded_bytes)} / {pretty_print_bytes(download_progress.total_bytes)} ({pretty_print_bytes_per_second(download_progress.overall_speed)})"
+        summary.add_row(progress_info)
+
+        eta_info = f"ETA: {download_progress.overall_eta}"
+        summary.add_row(eta_info)
+
+        summary.add_row("")  # Empty row for spacing
 
-    eta_info = f"ETA: {download_progress.overall_eta}"
-    summary.add_row(eta_info)
+        for file_path, file_progress in download_progress.file_progress.items():
+            if file_progress.status != "complete":
+                progress = int(file_progress.downloaded / file_progress.total * 20)
+                bar = f"[{'=' * progress}{' ' * (20 - progress)}]"
+                percentage = f"{file_progress.downloaded / file_progress.total * 100:.0f}%"
+                summary.add_row(Text(file_path[:20], style="cyan"), bar, percentage)
 
     summary.add_row("")  # Empty row for spacing
 
-    for file_path, file_progress in download_progress.file_progress.items():
-      if file_progress.status != "complete":
-        progress = int(file_progress.downloaded / file_progress.total * 20)  # Increased bar width
-        bar = f"[{'=' * progress}{' ' * (20 - progress)}]"
-        percentage = f"{file_progress.downloaded / file_progress.total * 100:.0f}%"
-        summary.add_row(Text(file_path[:20], style="cyan"), bar, percentage)  # Increased file path length
+    # Other nodes download progress summary
+    summary.add_row(Text("Other Nodes Download Progress:", style="bold"))
+    for node_id, progress in self.node_download_progress.items():
+        if node_id != self.node_id:
+            truncated_id = node_id[:8] + "..." if len(node_id) > 8 else node_id
+            percentage = progress.downloaded_bytes / progress.total_bytes * 100 if progress.total_bytes > 0 else 0
+            speed = pretty_print_bytes_per_second(progress.overall_speed)
+            summary.add_row(f"{truncated_id}: {percentage:.1f}% ({speed})")
 
     return summary

+ 2 - 2
extra/download_hf.py

@@ -1,6 +1,6 @@
 import argparse
 import asyncio
-from exo.inference.hf_helpers import download_all_files, HFRepoProgressEvent, HFRepoFileProgressEvent
+from exo.download.hf.hf_helpers import download_repo_files, RepoProgressEvent
 
 DEFAULT_ALLOW_PATTERNS = [
     "*.json",
@@ -23,7 +23,7 @@ DEFAULT_IGNORE_PATTERNS = [
 ]
 
 async def main(repo_id, revision="main", allow_patterns=None, ignore_patterns=None):
-    async def progress_callback(event: HFRepoProgressEvent):
+    async def progress_callback(event: RepoProgressEvent):
         print(f"Overall Progress: {event.completed_files}/{event.total_files} files, {event.downloaded_bytes}/{event.total_bytes} bytes")
         print(f"Estimated time remaining: {event.overall_eta}")
         print("File Progress:")

+ 16 - 5
main.py

@@ -2,12 +2,14 @@ import argparse
 import asyncio
 import signal
 import json
-import uuid
+import time
 from exo.orchestration.standard_node import StandardNode
 from exo.networking.grpc.grpc_server import GRPCServer
 from exo.networking.grpc.grpc_discovery import GRPCDiscovery
 from exo.topology.ring_memory_weighted_partitioning_strategy import RingMemoryWeightedPartitioningStrategy
 from exo.api import ChatGPTAPI
+from exo.download.shard_download import ShardDownloader
+from exo.download.hf.hf_shard_download import HFShardDownloader
 from exo.helpers import print_yellow_exo, find_available_port, DEBUG, get_inference_engine, get_system_info, get_or_create_node_id
 
 # parse args
@@ -22,7 +24,7 @@ parser.add_argument("--discovery-timeout", type=int, default=30, help="Discovery
 parser.add_argument("--wait-for-peers", type=int, default=0, help="Number of peers to wait to connect to before starting")
 parser.add_argument("--chatgpt-api-port", type=int, default=8000, help="ChatGPT API port")
 parser.add_argument("--chatgpt-api-response-timeout-secs", type=int, default=90, help="ChatGPT API response timeout in seconds")
-parser.add_argument("--max-generate-tokens", type=int, default=256, help="Max tokens to generate in each request")
+parser.add_argument("--max-generate-tokens", type=int, default=1024, help="Max tokens to generate in each request")
 parser.add_argument("--inference-engine", type=str, default=None, help="Inference engine to use")
 parser.add_argument("--disable-tui", action=argparse.BooleanOptionalAction, help="Disable TUI")
 args = parser.parse_args()
@@ -32,9 +34,10 @@ print_yellow_exo()
 system_info = get_system_info()
 print(f"Detected system: {system_info}")
 
+shard_downloader: ShardDownloader = HFShardDownloader()
 inference_engine_name = args.inference_engine or ("mlx" if system_info == "Apple Silicon Mac" else "tinygrad")
-inference_engine = get_inference_engine(inference_engine_name)
-print(f"Using inference engine: {inference_engine.__class__.__name__}")
+inference_engine = get_inference_engine(inference_engine_name, shard_downloader)
+print(f"Using inference engine: {inference_engine.__class__.__name__} with shard downloader: {shard_downloader.__class__.__name__}")
 
 if args.node_port is None:
     args.node_port = find_available_port(args.node_host)
@@ -60,7 +63,15 @@ node.on_token.register("main_log").on_next(lambda _, tokens, __: print(inference
 if args.prometheus_client_port:
     from exo.stats.metrics import start_metrics_server
     start_metrics_server(node, args.prometheus_client_port)
-inference_engine.set_progress_callback(lambda event: asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "progress": event.to_dict()}))))
+
+last_broadcast_time = 0
+def throttled_broadcast(shard, event):
+    global last_broadcast_time
+    current_time = time.time()
+    if current_time - last_broadcast_time >= 0.1:
+        last_broadcast_time = current_time
+        asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "progress": event.to_dict()})))
+shard_downloader.on_progress.register("broadcast").on_next(throttled_broadcast)
 
 async def shutdown(signal, loop):
     """Gracefully shutdown the server and close the asyncio loop."""