Browse Source

bring tinygrad to parity with mlx on llama models, show progress of each download file

Alex Cheema 9 months ago
parent
commit
d22ed12e7b

+ 13 - 16
exo/api/chatgpt_api.py

@@ -7,6 +7,7 @@ from transformers import AutoTokenizer, AutoProcessor
 from typing import List, Literal, Union, Dict
 from aiohttp import web
 import aiohttp_cors
+import traceback
 from exo import DEBUG, VERSION
 from exo.helpers import terminal_link, PrefixDict
 from exo.inference.shard import Shard
@@ -16,20 +17,22 @@ shard_mappings = {
   ### llama
   "llama-3.1-8b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
+    "TinygradDynamicShardInferenceEngine": Shard(model_id="mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", start_layer=0, end_layer=0, n_layers=32),
   },
   "llama-3.1-70b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
+    "TinygradDynamicShardInferenceEngine": Shard(model_id="NousResearch/Meta-Llama-3.1-70B", start_layer=0, end_layer=0, n_layers=80),
   },
   "llama-3.1-405b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-405B-4bit", start_layer=0, end_layer=0, n_layers=126),
   },
   "llama-3-8b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct", start_layer=0, end_layer=0, n_layers=32),
+    "TinygradDynamicShardInferenceEngine": Shard(model_id="TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R", start_layer=0, end_layer=0, n_layers=32),
   },
   "llama-3-70b": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
+    "TinygradDynamicShardInferenceEngine": Shard(model_id="TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", start_layer=0, end_layer=0, n_layers=80),
   },
   ### mistral
   "mistral-nemo": {
@@ -79,7 +82,7 @@ class ChatCompletionRequest:
 
 async def resolve_tokenizer(model_id: str):
   try:
-    if DEBUG >= 2: print(f"Trying AutoProcessor for {model_id}")
+    if DEBUG >= 4: print(f"Trying AutoProcessor for {model_id}")
     processor = AutoProcessor.from_pretrained(model_id, use_fast=False)
     if not hasattr(processor, 'eos_token_id'):
       processor.eos_token_id = getattr(processor, 'tokenizer', getattr(processor, '_tokenizer', processor)).eos_token_id
@@ -89,21 +92,18 @@ async def resolve_tokenizer(model_id: str):
       processor.decode = getattr(processor, 'tokenizer', getattr(processor, '_tokenizer', processor)).decode
     return processor
   except Exception as e:
-    if DEBUG >= 2: print(f"Failed to load processor for {model_id}. Error: {e}")
-    import traceback
+    if DEBUG >= 4: print(f"Failed to load processor for {model_id}. Error: {e}")
 
-    if DEBUG >= 2: print(traceback.format_exc())
+    if DEBUG >= 4: print(traceback.format_exc())
 
   try:
-    if DEBUG >= 2: print(f"Trying AutoTokenizer for {model_id}")
+    if DEBUG >= 4: print(f"Trying AutoTokenizer for {model_id}")
     return AutoTokenizer.from_pretrained(model_id)
   except Exception as e:
-    if DEBUG >= 2: print(f"Failed to load tokenizer for {model_id}. Falling back to tinygrad tokenizer. Error: {e}")
-    import traceback
+    if DEBUG >= 4: print(f"Failed to load tokenizer for {model_id}. Falling back to tinygrad tokenizer. Error: {e}")
+    if DEBUG >= 4: print(traceback.format_exc())
 
-    if DEBUG >= 2: print(traceback.format_exc())
-
-  if DEBUG >= 2: print(f"Trying mlx tokenizer for {model_id}")
+  if DEBUG >= 4: print(f"Trying mlx tokenizer for {model_id}")
   from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
 
   return load_tokenizer(await get_model_path(model_id))
@@ -308,10 +308,7 @@ class ChatGPTAPI:
     try:
       await self.node.process_prompt(shard, prompt, image_str, request_id=request_id)
     except Exception as e:
-      if DEBUG >= 2:
-        import traceback
-
-        traceback.print_exc()
+      if DEBUG >= 2: traceback.print_exc()
       return web.json_response({"detail": f"Error processing prompt (see logs with DEBUG>=2): {str(e)}"}, status=500)
 
     try:
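
Each llama entry in shard_mappings is now keyed by inference engine class name, so a tinygrad node resolves to a full-weight repo while an MLX node keeps its 4-bit mlx-community build. A minimal sketch of the lookup, assuming the API selects by model name and engine class name (the actual call site is outside this diff):

    # Hypothetical lookup; mirrors the shard_mappings structure above.
    shard = shard_mappings["llama-3-8b"]["TinygradDynamicShardInferenceEngine"]
    print(shard.model_id)  # TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R
    print(shard.n_layers)  # 32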

+ 24 - 0
exo/helpers.py

@@ -201,3 +201,27 @@ def get_or_create_node_id():
     except Exception as e:
         if DEBUG >= 2: print(f"Unexpected error creating node_id: {e}")
         return str(uuid.uuid4())
+
+def pretty_print_bytes(size_in_bytes: int) -> str:
+    if size_in_bytes < 1024:
+        return f"{size_in_bytes} B"
+    elif size_in_bytes < 1024 ** 2:
+        return f"{size_in_bytes / 1024:.2f} KB"
+    elif size_in_bytes < 1024 ** 3:
+        return f"{size_in_bytes / (1024 ** 2):.2f} MB"
+    elif size_in_bytes < 1024 ** 4:
+        return f"{size_in_bytes / (1024 ** 3):.2f} GB"
+    else:
+        return f"{size_in_bytes / (1024 ** 4):.2f} TB"
+
+def pretty_print_bytes_per_second(bytes_per_second: int) -> str:
+    if bytes_per_second < 1024:
+        return f"{bytes_per_second} B/s"
+    elif bytes_per_second < 1024 ** 2:
+        return f"{bytes_per_second / 1024:.2f} KB/s"
+    elif bytes_per_second < 1024 ** 3:
+        return f"{bytes_per_second / (1024 ** 2):.2f} MB/s"
+    elif bytes_per_second < 1024 ** 4:
+        return f"{bytes_per_second / (1024 ** 3):.2f} GB/s"
+    else:
+        return f"{bytes_per_second / (1024 ** 4):.2f} TB/s"
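
Both helpers walk the same power-of-1024 thresholds, one unit per branch, with raw bytes left unformatted below 1 KB. A few spot checks of the boundaries:

    from exo.helpers import pretty_print_bytes, pretty_print_bytes_per_second

    print(pretty_print_bytes(512))                   # 512 B
    print(pretty_print_bytes(1234))                  # 1.21 KB
    print(pretty_print_bytes(5 * 1024 ** 3))         # 5.00 GB
    print(pretty_print_bytes_per_second(2_500_000))  # 2.38 MB/s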

+ 78 - 24
exo/inference/hf_helpers.py

@@ -103,19 +103,69 @@ async def fetch_file_list(session, repo_id, revision, path=""):
 class HFRepoFileProgressEvent:
     file_path: str
     downloaded: int
+    downloaded_this_session: int
     total: int
-    speed: float
+    speed: int
     eta: timedelta
     status: Literal["not_started", "in_progress", "complete"]
 
+    def to_dict(self):
+        return {
+            "file_path": self.file_path,
+            "downloaded": self.downloaded,
+            "downloaded_this_session": self.downloaded_this_session,
+            "total": self.total,
+            "speed": self.speed,
+            "eta": self.eta.total_seconds(),
+            "status": self.status
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        # Convert eta from seconds back to timedelta
+        if 'eta' in data:
+            data['eta'] = timedelta(seconds=data['eta'])
+        return cls(**data)
+
 @dataclass
 class HFRepoProgressEvent:
     completed_files: int
     total_files: int
     downloaded_bytes: int
+    downloaded_bytes_this_session: int
     total_bytes: int
+    overall_speed: int
     overall_eta: timedelta
     file_progress: Dict[str, HFRepoFileProgressEvent]
+    status: Literal["not_started", "in_progress", "complete"]
+
+    def to_dict(self):
+        return {
+            "completed_files": self.completed_files,
+            "total_files": self.total_files,
+            "downloaded_bytes": self.downloaded_bytes,
+            "downloaded_bytes_this_session": self.downloaded_bytes_this_session,
+            "total_bytes": self.total_bytes,
+            "overall_speed": self.overall_speed,
+            "overall_eta": self.overall_eta.total_seconds(),
+            "file_progress": {k: v.to_dict() for k, v in self.file_progress.items()},
+            "status": self.status
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        # Convert overall_eta from seconds back to timedelta
+        if 'overall_eta' in data:
+            data['overall_eta'] = timedelta(seconds=data['overall_eta'])
+
+        # Parse file_progress
+        if 'file_progress' in data:
+            data['file_progress'] = {
+                k: HFRepoFileProgressEvent.from_dict(v)
+                for k, v in data['file_progress'].items()
+            }
+
+        return cls(**data)
 
 HFRepoFileProgressCallback = Callable[[HFRepoFileProgressEvent], Coroutine[Any, Any, None]]
 HFRepoProgressCallback = Callable[[HFRepoProgressEvent], Coroutine[Any, Any, None]]
@@ -143,11 +193,12 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
     async with session.get(url, headers=headers) as response:
         total_size = int(response.headers.get('Content-Length', 0))
         downloaded_size = local_file_size
+        downloaded_this_session = 0
         mode = 'ab' if use_range_request else 'wb'
         if downloaded_size == total_size:
             if DEBUG >= 2: print(f"File already downloaded: {file_path}")
             if progress_callback:
-                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, 0, timedelta(0), "complete"))
+                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
             return
 
         if response.status == 200:
@@ -170,7 +221,7 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
                 if downloaded_size == total_size:
                     if DEBUG >= 2: print(f"File fully downloaded on first pass: {file_path}")
                     if progress_callback:
-                        await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, 0, timedelta(0), "complete"))
+                        await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
                     return
             except ValueError:
                 if DEBUG >= 1: print(f"Failed to parse Content-Range header: {content_range}. Starting download from scratch...")
@@ -181,7 +232,7 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
         if downloaded_size == total_size:
             print(f"File already downloaded: {file_path}")
             if progress_callback:
-                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, 0, timedelta(0), "complete"))
+                await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, 0, timedelta(0), "complete"))
             return
 
         DOWNLOAD_CHUNK_SIZE = 32768
@@ -190,13 +241,15 @@ async def download_file(session: aiohttp.ClientSession, repo_id: str, revision:
             async for chunk in response.content.iter_chunked(DOWNLOAD_CHUNK_SIZE):
                 f.write(chunk)
                 downloaded_size += len(chunk)
+                downloaded_this_session += len(chunk)
                 if progress_callback and total_size:
                     elapsed_time = (datetime.now() - start_time).total_seconds()
-                    speed = downloaded_size / elapsed_time if elapsed_time > 0 else 0
+                    speed = int(downloaded_this_session / elapsed_time) if elapsed_time > 0 else 0
                     remaining_size = total_size - downloaded_size
                     eta = timedelta(seconds=remaining_size / speed) if speed > 0 else timedelta(0)
                     status = "in_progress" if downloaded_size < total_size else "complete"
-                    await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, total_size, speed, eta, status))
+                    if DEBUG >= 8: print(f"HF repo file download progress: {file_path=} {elapsed_time=} {speed=} Downloaded={downloaded_size}/{total_size} {remaining_size=} {eta=} {status=}")
+                    await progress_callback(HFRepoFileProgressEvent(file_path, downloaded_size, downloaded_this_session, total_size, speed, eta, status))
         if DEBUG >= 2: print(f"Downloaded: {file_path}")
 
 async def download_all_files(repo_id: str, revision: str = "main", progress_callback: Optional[HFRepoProgressCallback] = None, allow_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None):
@@ -229,35 +282,36 @@ async def download_all_files(repo_id: str, revision: str = "main", progress_call
         file_list = await fetch_file_list(session, repo_id, revision)
         filtered_file_list = list(filter_repo_objects(file_list, allow_patterns=allow_patterns, ignore_patterns=ignore_patterns, key=lambda x: x["path"]))
         total_files = len(filtered_file_list)
-        completed_files = 0
         total_bytes = sum(file["size"] for file in filtered_file_list)
-        downloaded_bytes = 0
-        file_progress: Dict[str, HFRepoFileProgressEvent] = {file["path"]: HFRepoFileProgressEvent(file["path"], 0, file["size"], 0, timedelta(0), "not_started") for file in filtered_file_list}
+        file_progress: Dict[str, HFRepoFileProgressEvent] = {file["path"]: HFRepoFileProgressEvent(file["path"], 0, 0, file["size"], 0, timedelta(0), "not_started") for file in filtered_file_list}
         start_time = datetime.now()
 
-        async def download_with_progress(file_info):
-            nonlocal completed_files, downloaded_bytes, file_progress
-
+        async def download_with_progress(file_info, progress_state):
             async def file_progress_callback(event: HFRepoFileProgressEvent):
-                nonlocal downloaded_bytes, file_progress
-                downloaded_bytes += event.downloaded - file_progress[event.file_path].downloaded
+                progress_state['downloaded_bytes'] += event.downloaded - file_progress[event.file_path].downloaded
+                progress_state['downloaded_bytes_this_session'] += event.downloaded_this_session - file_progress[event.file_path].downloaded_this_session
                 file_progress[event.file_path] = event
                 if progress_callback:
                     elapsed_time = (datetime.now() - start_time).total_seconds()
-                    overall_speed = downloaded_bytes / elapsed_time if elapsed_time > 0 else 0
-                    overall_eta = timedelta(seconds=(total_bytes - downloaded_bytes) / overall_speed) if overall_speed > 0 else timedelta(0)
-                    await progress_callback(HFRepoProgressEvent(completed_files, total_files, downloaded_bytes, total_bytes, overall_eta, file_progress))
+                    overall_speed = int(progress_state['downloaded_bytes_this_session'] / elapsed_time) if elapsed_time > 0 else 0
+                    remaining_bytes = total_bytes - progress_state['downloaded_bytes']
+                    overall_eta = timedelta(seconds=remaining_bytes / overall_speed) if overall_speed > 0 else timedelta(seconds=0)
+                    status = "in_progress" if progress_state['downloaded_bytes'] < total_bytes else "complete"
+                    await progress_callback(HFRepoProgressEvent(progress_state['completed_files'], total_files, progress_state['downloaded_bytes'], progress_state['downloaded_bytes_this_session'], total_bytes, overall_speed, overall_eta, file_progress, status))
 
             await download_file(session, repo_id, revision, file_info["path"], snapshot_dir, file_progress_callback)
-            completed_files += 1
-            file_progress[file_info["path"]] = HFRepoFileProgressEvent(file_info["path"], file_info["size"], file_info["size"], 0, timedelta(0), "complete")
+            progress_state['completed_files'] += 1
+            file_progress[file_info["path"]] = HFRepoFileProgressEvent(file_info["path"], file_info["size"], file_progress[file_info["path"]].downloaded_this_session, file_info["size"], 0, timedelta(0), "complete")
             if progress_callback:
                 elapsed_time = (datetime.now() - start_time).total_seconds()
-                overall_speed = downloaded_bytes / elapsed_time if elapsed_time > 0 else 0
-                overall_eta = timedelta(seconds=(total_bytes - downloaded_bytes) / overall_speed) if overall_speed > 0 else timedelta(0)
-                await progress_callback(HFRepoProgressEvent(completed_files, total_files, downloaded_bytes, total_bytes, overall_eta, file_progress))
-
-        tasks = [download_with_progress(file_info) for file_info in filtered_file_list]
+                overall_speed = int(progress_state['downloaded_bytes_this_session'] / elapsed_time) if elapsed_time > 0 else 0
+                remaining_bytes = total_bytes - progress_state['downloaded_bytes']
+                overall_eta = timedelta(seconds=remaining_bytes / overall_speed) if overall_speed > 0 else timedelta(seconds=0)
+                status = "in_progress" if progress_state['completed_files'] < total_files else "complete"
+                await progress_callback(HFRepoProgressEvent(progress_state['completed_files'], total_files, progress_state['downloaded_bytes'], progress_state['downloaded_bytes_this_session'], total_bytes, overall_speed, overall_eta, file_progress, status))
+
+        progress_state = {'completed_files': 0, 'downloaded_bytes': 0, 'downloaded_bytes_this_session': 0}
+        tasks = [download_with_progress(file_info, progress_state) for file_info in filtered_file_list]
         await asyncio.gather(*tasks)
 
     return snapshot_dir
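
The nonlocal counters in download_with_progress are replaced by a mutable progress_state dict, so every task spawned by asyncio.gather updates one shared tally. Since broadcast_opaque_status carries JSON strings between nodes, the new to_dict/from_dict pair is what lets a progress event survive that trip. A round-trip sketch, assuming HFRepoFileProgressEvent is a dataclass like HFRepoProgressEvent (its decorator sits just above this hunk), so the keyword construction in from_dict works:

    import json
    from datetime import timedelta
    from exo.inference.hf_helpers import HFRepoFileProgressEvent, HFRepoProgressEvent

    # Field values are illustrative only.
    file_event = HFRepoFileProgressEvent(
        "model.safetensors", 1024, 1024, 4096, 512, timedelta(seconds=6), "in_progress")
    event = HFRepoProgressEvent(
        0, 1, 1024, 1024, 4096, 512, timedelta(seconds=6),
        {"model.safetensors": file_event}, "in_progress")

    wire = json.dumps(event.to_dict())  # eta fields serialize as plain seconds
    restored = HFRepoProgressEvent.from_dict(json.loads(wire))
    assert restored.overall_eta == timedelta(seconds=6)
    assert restored.file_progress["model.safetensors"].status == "in_progress"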

+ 3 - 4
exo/inference/tinygrad/inference.py

@@ -1,16 +1,14 @@
-from functools import partial
 from pathlib import Path
 from typing import List, Optional, Union, Callable, Coroutine, Any
 import json
-from tiktoken.load import load_tiktoken_bpe
 from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16
 from tinygrad.nn.state import safe_load, torch_load, load_state_dict
 from tinygrad import Tensor, nn, Context, GlobalCounters
-from tinygrad.helpers import DEBUG, tqdm, _cache_dir, fetch
+from tinygrad.helpers import tqdm
 from exo.inference.shard import Shard
 from exo.inference.inference_engine import InferenceEngine
 import numpy as np
-from exo.inference.hf_helpers import HFRepoProgressCallback, HFRepoProgressEvent, download_all_files, get_repo_root
+from exo.inference.hf_helpers import HFRepoProgressCallback, HFRepoProgressEvent, download_all_files
 
 MODEL_PARAMS = {
   "8B": {
@@ -88,6 +86,7 @@ def build_transformer(model_path: Path, shard: Shard, model_size="8B", quantize=
       )
   else:
     weights = load(str(model_path))
+
   if "model.embed_tokens.weight" in weights:
     weights = convert_from_huggingface(
       weights,

+ 7 - 9
exo/orchestration/standard_node.py

@@ -3,6 +3,7 @@ import json
 import asyncio
 import uuid
 import time
+import traceback
 from typing import List, Dict, Optional, Tuple, Union
 from exo.networking import Discovery, PeerHandle, Server
 from exo.inference.inference_engine import InferenceEngine, Shard
@@ -13,6 +14,7 @@ from exo.topology.partitioning_strategy import Partition, PartitioningStrategy,
 from exo import DEBUG
 from exo.helpers import AsyncCallbackSystem
 from exo.viz.topology_viz import TopologyViz
+from exo.inference.hf_helpers import HFRepoProgressEvent
 
 
 class StandardNode(Node):
@@ -54,12 +56,14 @@ class StandardNode(Node):
             self.current_topology.active_node_id = None
       download_progress = None
       if status_data.get("type", "") == "download_progress":
-        if DEBUG >= 5: print(f"Download progress from {status_data.get('node_id')}: {status_data.get('current')}/{status_data.get('total')} ({round(status_data.get('current') / status_data.get('total') * 100, 2)}%)")
+        if DEBUG >= 5: print(f"Download progress from {status_data.get('node_id')}: {status_data.get('progress')}")
         if status_data.get("node_id") == self.id:
-          download_progress = (status_data.get('current'), status_data.get('total'))
+          download_progress = HFRepoProgressEvent.from_dict(status_data.get('progress'))
       if self.topology_viz:
         self.topology_viz.update_visualization(self.current_topology, self.partitioning_strategy.partition(self.current_topology), download_progress)
-    except json.JSONDecodeError:
+    except Exception as e:
+      if DEBUG >= 1: print(f"Error updating visualization: {e}")
+      traceback.print_exc()
       pass
 
   async def start(self, wait_for_peers: int = 0) -> None:
@@ -231,8 +235,6 @@ class StandardNode(Node):
       return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
     except Exception as e:
       print(f"Error processing tensor for shard {shard}: {e}")
-      import traceback
-
       traceback.print_exc()
       return None
 
@@ -368,8 +370,6 @@ class StandardNode(Node):
         print(f"Timeout broadcasting result to {peer.id()}")
       except Exception as e:
         print(f"Error broadcasting result to {peer.id()}: {e}")
-        import traceback
-
         traceback.print_exc()
 
     await asyncio.gather(*[send_result_to_peer(peer) for peer in self.peers], return_exceptions=True)
@@ -383,8 +383,6 @@ class StandardNode(Node):
         print(f"Timeout sending opaque status to {peer.id()}")
       except Exception as e:
         print(f"Error sending opaque status to {peer.id()}: {e}")
-        import traceback
-
         traceback.print_exc()
 
     await asyncio.gather(*[send_status_to_peer(peer) for peer in self.peers], return_exceptions=True)
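
For reference, the download_progress status message this handler now parses (produced by the callback wired up in main.py below) has roughly this shape; node_id and all counts are illustrative:

    status_data = {
        "type": "download_progress",
        "node_id": "node-a",
        "progress": {  # HFRepoProgressEvent.to_dict() output
            "completed_files": 1,
            "total_files": 4,
            "downloaded_bytes": 2147483648,             # 2 GB
            "downloaded_bytes_this_session": 2147483648,
            "total_bytes": 34359738368,                 # 32 GB
            "overall_speed": 52428800,                  # 50 MB/s
            "overall_eta": 614.4,                       # seconds
            "file_progress": {},                        # per-file event dicts, omitted here
            "status": "in_progress",
        },
    }

Broadening the except clause from json.JSONDecodeError to Exception also means a malformed payload now logs a traceback instead of failing silently.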

+ 56 - 4
exo/viz/topology_viz.py

@@ -1,6 +1,6 @@
 import math
 from typing import List, Optional, Tuple
-from exo.helpers import exo_text
+from exo.helpers import exo_text, pretty_print_bytes, pretty_print_bytes_per_second
 from exo.topology.topology import Topology
 from exo.topology.partitioning_strategy import Partition
 from rich.console import Console
@@ -8,8 +8,10 @@ from rich.panel import Panel
 from rich.text import Text
 from rich.live import Live
 from rich.style import Style
+from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn
+from rich.table import Table
 from exo.topology.device_capabilities import UNKNOWN_DEVICE_CAPABILITIES
-
+from exo.inference.hf_helpers import HFRepoProgressEvent
 
 class TopologyViz:
   def __init__(self, chatgpt_api_endpoint: str = None, web_chat_url: str = None):
@@ -24,7 +26,7 @@ class TopologyViz:
     self.live_panel = Live(self.panel, auto_refresh=False, console=self.console)
     self.live_panel.start()
 
-  def update_visualization(self, topology: Topology, partitions: List[Partition], download_progress: Optional[Tuple[int, int]] = None):
+  def update_visualization(self, topology: Topology, partitions: List[Partition], download_progress: HFRepoProgressEvent = None):
     self.topology = topology
     self.partitions = partitions
     self.download_progress = download_progress
@@ -34,7 +36,7 @@ class TopologyViz:
     self.panel.renderable = self._generate_layout()
     # Update the panel title with the number of nodes and partitions
     node_count = len(self.topology.nodes)
-    self.panel.title = f"Exo Cluster ({node_count} node{'s' if node_count != 1 else ''}){f' {self.download_progress[0]/self.download_progress[1]:.2%} Downloaded' if self.download_progress else ''}"
+    self.panel.title = f"Exo Cluster ({node_count} node{'s' if node_count != 1 else ''})"
     self.live_panel.update(self.panel, refresh=True)
 
   def _generate_layout(self) -> str:
@@ -47,6 +49,31 @@ class TopologyViz:
     # Generate visualization
     visualization = [[" " for _ in range(100)] for _ in range(55)]  # Decreased height
 
+    # Draw download first so everything else is drawn on top
+    # If a download is in progress, show the download info summary
+    if self.download_progress and self.download_progress.status != "complete":
+        download_summary = _generate_download_summary(self.download_progress)
+        download_panel = Panel(
+            download_summary,
+            title="Download Progress",
+            border_style="cyan",
+            expand=False,
+            width=96,  # Further reduced to ensure it fits within the visualization
+            height=None  # Allow the panel to adjust its height based on content
+        )
+        console = Console(width=98, height=55)  # Reduced console width
+        with console.capture() as capture:
+            console.print(download_panel)
+        download_lines = capture.get().split('\n')
+        download_start_y = 15
+        panel_width = len(max(download_lines, key=len))
+        start_x = max(1, (100 - panel_width) // 2)  # Ensure start_x is at least 1 to avoid left border cut-off
+        for i, line in enumerate(download_lines):
+            for j, char in enumerate(line):
+                if 1 <= start_x + j < 99 and download_start_y + i < 55:  # Ensure we don't write to the rightmost column
+                    visualization[download_start_y + i][start_x + j] = char
+
+
     # Add exo_text at the top in bright yellow
     exo_lines = exo_text.split("\n")
     yellow_style = Style(color="bright_yellow")
@@ -168,3 +195,28 @@ class TopologyViz:
 
     # Convert to string
     return "\n".join("".join(str(char) for char in row) for row in visualization)
+
+def _generate_download_summary(download_progress) -> Table:
+    summary = Table(show_header=False, box=None, padding=(0, 1))
+    summary.add_column("Info", style="cyan", no_wrap=True)
+    summary.add_column("Progress", style="cyan", no_wrap=True)
+    summary.add_column("Percentage", style="cyan", no_wrap=True)
+
+    title = f"Downloading model ({download_progress.completed_files}/{download_progress.total_files}):"
+    summary.add_row(Text(title, style="bold"))
+    progress_info = f"{pretty_print_bytes(download_progress.downloaded_bytes)} / {pretty_print_bytes(download_progress.total_bytes)} ({pretty_print_bytes_per_second(download_progress.overall_speed)})"
+    summary.add_row(progress_info)
+
+    eta_info = f"ETA: {download_progress.overall_eta}"
+    summary.add_row(eta_info)
+
+    summary.add_row("")  # Empty row for spacing
+
+    for file_path, file_progress in download_progress.file_progress.items():
+      if file_progress.status != "complete":
+        progress = int(file_progress.downloaded / file_progress.total * 20)  # Increased bar width
+        bar = f"[{'=' * progress}{' ' * (20 - progress)}]"
+        percentage = f"{file_progress.downloaded / file_progress.total * 100:.0f}%"
+        summary.add_row(Text(file_path[:20], style="cyan"), bar, percentage)  # Increased file path length
+
+    return summary
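
_generate_download_summary builds a plain rich Table, so it can be rendered on its own to eyeball the layout. A sketch with a hand-built event, constructed positionally as in hf_helpers (all values illustrative):

    from datetime import timedelta
    from rich.console import Console
    from exo.inference.hf_helpers import HFRepoFileProgressEvent, HFRepoProgressEvent
    from exo.viz.topology_viz import _generate_download_summary

    f = HFRepoFileProgressEvent(
        "model-00001-of-00004.safetensors",
        2 * 1024 ** 3, 2 * 1024 ** 3, 8 * 1024 ** 3,  # downloaded, this session, total
        50 * 1024 ** 2, timedelta(seconds=120), "in_progress")
    event = HFRepoProgressEvent(
        1, 4, 2 * 1024 ** 3, 2 * 1024 ** 3, 32 * 1024 ** 3,
        50 * 1024 ** 2, timedelta(seconds=600), {f.file_path: f}, "in_progress")

    Console().print(_generate_download_summary(event))  # open file shows a 25% bar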

+ 1 - 1
main.py

@@ -60,7 +60,7 @@ node.on_token.register("main_log").on_next(lambda _, tokens, __: print(inference
 if args.prometheus_client_port:
     from exo.stats.metrics import start_metrics_server
     start_metrics_server(node, args.prometheus_client_port)
-inference_engine.set_progress_callback(lambda event: asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "current": event.downloaded_bytes, "total": event.total_bytes}))))
+inference_engine.set_progress_callback(lambda event: asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "progress": event.to_dict()}))))
 
 async def shutdown(signal, loop):
     """Gracefully shutdown the server and close the asyncio loop."""