
Async model downloading with download progress. Fixes #102. Related: #16, #104.

Alex Cheema, 9 months ago
commit d6a7e46324

+ 1 - 2
exo/helpers.py

@@ -1,7 +1,6 @@
 import os
 import asyncio
-from typing import Any, Callable, TypeVar, Optional, Dict, Generic, Tuple, List
-from collections import defaultdict
+from typing import Callable, TypeVar, Optional, Dict, Generic, Tuple, List
 import socket
 import random
 import platform

+ 7 - 3
exo/inference/mlx/sharded_inference_engine.py

@@ -4,12 +4,13 @@ from ..inference_engine import InferenceEngine
 from .sharded_model import StatefulShardedModel
 from .sharded_utils import load_shard, get_image_from_str
 from ..shard import Shard
-from typing import Optional
+from typing import Optional, Callable
 
 
 class MLXDynamicShardInferenceEngine(InferenceEngine):
-  def __init__(self):
+  def __init__(self, on_download_progress: Callable[[int, int], None] = None):
     self.shard = None
+    self.on_download_progress = on_download_progress
 
   async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
     await self.ensure_shard(shard)
@@ -32,6 +33,9 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
     if self.shard == shard:
       return
 
-    model_shard, self.tokenizer = await load_shard(shard.model_id, shard)
+    model_shard, self.tokenizer = await load_shard(shard.model_id, shard, on_download_progress=self.on_download_progress)
     self.stateful_sharded_model = StatefulShardedModel(shard, model_shard)
     self.shard = shard
+
+  def set_on_download_progress(self, on_download_progress: Callable[[int, int], None]):
+    self.on_download_progress = on_download_progress
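
A minimal usage sketch of the new hook, assuming only what the diff above shows (the `print_progress` helper is hypothetical): the callback takes `(current, total)` byte counts and can be supplied at construction time or attached later with `set_on_download_progress`.

```python
from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine

# Hypothetical progress printer; only the (current, total) signature
# comes from the diff above.
def print_progress(current: int, total: int) -> None:
  pct = current / total * 100 if total else 0.0
  print(f"\rDownloaded {current}/{total} bytes ({pct:.1f}%)", end="", flush=True)

engine = MLXDynamicShardInferenceEngine(on_download_progress=print_progress)
# ...or attach (or swap) the callback after construction:
engine.set_on_download_progress(print_progress)
```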

+ 54 - 11
exo/inference/mlx/sharded_utils.py

@@ -8,16 +8,19 @@ import asyncio
 import aiohttp
 from functools import partial
 from pathlib import Path
-from typing import Optional, Tuple
-import requests
+from typing import Optional, Tuple, Union, List, Callable
 from PIL import Image
 from io import BytesIO
 import base64
+import os
 
 from exo import DEBUG
 import mlx.core as mx
 import mlx.nn as nn
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download, list_repo_tree, get_paths_info
+from huggingface_hub.utils import filter_repo_objects
+from huggingface_hub.file_download import repo_folder_name
+from huggingface_hub.constants import HF_HUB_CACHE
 from huggingface_hub.utils._errors import RepositoryNotFoundError
 from transformers import AutoProcessor
 
@@ -144,12 +147,50 @@ def load_model_shard(
   return model
 
 
-async def snapshot_download_async(*args, **kwargs):
-  func = partial(snapshot_download, *args, **kwargs)
-  return await asyncio.get_event_loop().run_in_executor(None, func)
-
-
-async def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path:
+async def get_repo_size(repo_id: str, revision: Optional[str] = None, allow_patterns: Optional[Union[List[str], str]] = None, repo_type: Optional[str] = None):
+  it = await asyncio.to_thread(list_repo_tree, repo_id, revision=revision, repo_type=repo_type)
+  files = list(filter_repo_objects(it, allow_patterns=allow_patterns, key=lambda f: f.path))
+  return sum(file.size for file in files if file.size is not None)
+
+async def monitor_progress(dir, total_size, print_progress=False, on_progress: Callable[[int, int], None] = None):
+    while True:
+        await asyncio.sleep(0.1)
+        current_size = sum(os.path.getsize(os.path.join(root, file))
+                           for root, _, files in os.walk(dir)
+                           for file in files)
+        progress = min(current_size / total_size * 100, 100)
+        if print_progress:
+          print(f"\rProgress: {progress:.2f}% ({current_size}/{total_size} bytes)", end="", flush=True)
+        if on_progress:
+          on_progress(current_size, total_size)
+        if progress >= 100:
+          if print_progress:
+            print("\nDownload complete!")
+          break
+
+async def download_repo(repo_id: str, revision: Optional[str] = None, allow_patterns: Optional[Union[List[str], str]] = None, repo_type: Optional[str] = None):
+  # Use snapshot_download in a separate thread to not block the event loop
+  return await asyncio.to_thread(snapshot_download, repo_id=repo_id, revision=revision, allow_patterns=allow_patterns, repo_type=repo_type)
+
+async def download_async_with_progress(repo_id: str, revision: Optional[str] = None, allow_patterns: Optional[Union[List[str], str]] = None, repo_type: Optional[str] = None, on_progress: Callable[[int, int], None] = None):
+  storage_folder = os.path.join(HF_HUB_CACHE, repo_folder_name(repo_id=repo_id, repo_type="model"))
+  # os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
+  # os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'
+
+  print(f"Estimating size of repository: {repo_id}")
+  total_size = await get_repo_size(repo_id)
+  print(f"Estimated total size: {total_size} bytes")
+
+  # Create tasks for download and progress checking
+  download_task = asyncio.create_task(download_repo(repo_id, revision=revision, allow_patterns=allow_patterns, repo_type=repo_type))
+  progress_task = asyncio.create_task(monitor_progress(storage_folder, total_size, on_progress=on_progress))
+
+  # Wait for both tasks to complete
+  result = await asyncio.gather(download_task, progress_task)
+  return result[0]  # Return the result from download_task
+
+
+async def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None, on_download_progress: Callable[[int, int], None] = None) -> Path:
   """
   """
   Ensures the model is available locally. If the path does not exist locally,
   Ensures the model is available locally. If the path does not exist locally,
   it is downloaded from the Hugging Face Hub.
   it is downloaded from the Hugging Face Hub.
@@ -165,7 +206,7 @@ async def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -
   if not model_path.exists():
     try:
       model_path = Path(
-        await snapshot_download_async(
+        await download_async_with_progress(
           repo_id=path_or_hf_repo,
           revision=revision,
           allow_patterns=[
@@ -176,6 +217,7 @@ async def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -
             "*.tiktoken",
             "*.tiktoken",
             "*.txt",
             "*.txt",
           ],
           ],
+          on_progress=on_download_progress,
         )
       )
     except RepositoryNotFoundError:
@@ -196,6 +238,7 @@ async def load_shard(
   model_config={},
   adapter_path: Optional[str] = None,
   lazy: bool = False,
+  on_download_progress: Callable[[int, int], None] = None,
 ) -> Tuple[nn.Module, TokenizerWrapper]:
   """
   Load the model and tokenizer from a given path or a huggingface repository.
@@ -218,7 +261,7 @@ async def load_shard(
    FileNotFoundError: If config file or safetensors are not found.
    ValueError: If model class or args class are not found.
   """
-  model_path = await get_model_path(path_or_hf_repo)
+  model_path = await get_model_path(path_or_hf_repo, on_download_progress=on_download_progress)
 
   model = load_model_shard(model_path, shard, lazy, model_config)
   if adapter_path is not None:
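
The heart of `download_async_with_progress` above is a two-task pattern: run the blocking `snapshot_download` in a worker thread via `asyncio.to_thread`, while a sibling coroutine polls the byte count on disk against the total estimated from `list_repo_tree`. A stripped-down, self-contained sketch of that pattern, assuming `total > 0` (the function names here are illustrative, not from the codebase):

```python
import asyncio
import os

async def poll_dir_size(path: str, total: int, interval: float = 0.1) -> None:
  # Poll the bytes written under `path` until the expected total arrives.
  while True:
    await asyncio.sleep(interval)
    current = sum(
      os.path.getsize(os.path.join(root, f))
      for root, _, files in os.walk(path)
      for f in files
    )
    print(f"\rProgress: {min(current / total, 1.0):.2%} ({current}/{total} bytes)", end="", flush=True)
    if current >= total:
      break

async def download_with_progress(blocking_download, dest: str, total: int):
  # Run the blocking download in a worker thread so the event loop stays
  # free to run the poller; gather both and return the download result.
  result, _ = await asyncio.gather(
    asyncio.to_thread(blocking_download),
    poll_dir_size(dest, total),
  )
  return result
```

As in the diff, progress is only as accurate as the size estimate, and it counts every file already on disk under the cache folder, including partially downloaded ones, which is what lets this work without hooking huggingface_hub internals.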

+ 7 - 1
exo/orchestration/standard_node.py

@@ -52,8 +52,13 @@ class StandardNode(Node):
         elif status_data.get("status", "").startswith("end_"):
           if status_data.get("node_id") == self.current_topology.active_node_id:
             self.current_topology.active_node_id = None
+      download_progress = None
+      if status_data.get("type", "") == "download_progress":
+        if DEBUG >= 5: print(f"Download progress from {status_data.get('node_id')}: {status_data.get('current')}/{status_data.get('total')} ({round(status_data.get('current') / status_data.get('total') * 100, 2)}%)")
+        if status_data.get("node_id") == self.id:
+          download_progress = (status_data.get('current'), status_data.get('total'))
       if self.topology_viz:
-        self.topology_viz.update_visualization(self.current_topology, self.partitioning_strategy.partition(self.current_topology))
+        self.topology_viz.update_visualization(self.current_topology, self.partitioning_strategy.partition(self.current_topology), download_progress)
     except json.JSONDecodeError:
       pass
 
@@ -370,6 +375,7 @@ class StandardNode(Node):
     await asyncio.gather(*[send_result_to_peer(peer) for peer in self.peers], return_exceptions=True)
 
   async def broadcast_opaque_status(self, request_id: str, status: str) -> None:
+    if DEBUG >= 5: print(f"Broadcasting opaque status: {request_id=} {status=}")
     async def send_status_to_peer(peer):
       try:
         await asyncio.wait_for(peer.send_opaque_status(request_id, status), timeout=15.0)
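
Because the progress travels as an opaque JSON status string, the payload shape shown in this diff is the implicit contract between sender and receiver. A small round-trip sketch with made-up values:

```python
import json

# Payload shape introduced by this commit; the concrete values are made up.
status = json.dumps({
  "type": "download_progress",
  "node_id": "node-a",
  "current": 1_500_000,
  "total": 3_000_000,
})

# Receiving side, mirroring the handling in standard_node.py above:
status_data = json.loads(status)
if status_data.get("type", "") == "download_progress":
  current, total = status_data["current"], status_data["total"]
  print(f"{current}/{total} ({current / total:.2%})")  # 1500000/3000000 (50.00%)
```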

+ 5 - 3
exo/viz/topology_viz.py

@@ -1,5 +1,5 @@
 import math
-from typing import List
+from typing import List, Optional, Tuple
 from exo.helpers import exo_text
 from exo.topology.topology import Topology
 from exo.topology.partitioning_strategy import Partition
@@ -17,22 +17,24 @@ class TopologyViz:
     self.web_chat_url = web_chat_url
     self.topology = Topology()
     self.partitions: List[Partition] = []
+    self.download_progress = None
 
     self.console = Console()
     self.panel = Panel(self._generate_layout(), title="Exo Cluster (0 nodes)", border_style="bright_yellow")
     self.live_panel = Live(self.panel, auto_refresh=False, console=self.console)
     self.live_panel.start()
 
-  def update_visualization(self, topology: Topology, partitions: List[Partition]):
+  def update_visualization(self, topology: Topology, partitions: List[Partition], download_progress: Optional[Tuple[int, int]] = None):
     self.topology = topology
     self.partitions = partitions
+    self.download_progress = download_progress
     self.refresh()
 
   def refresh(self):
     self.panel.renderable = self._generate_layout()
     # Update the panel title with the number of nodes and partitions
     node_count = len(self.topology.nodes)
-    self.panel.title = f"Exo Cluster ({node_count} node{'s' if node_count != 1 else ''})"
+    self.panel.title = f"Exo Cluster ({node_count} node{'s' if node_count != 1 else ''}){f' {self.download_progress[0]/self.download_progress[1]:.2%} Downloaded' if self.download_progress else ''}"
     self.live_panel.update(self.panel, refresh=True)
 
   def _generate_layout(self) -> str:
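
The title line leans on two small pieces: Python's `%` format spec (`{x:.2%}` multiplies by 100 and appends a percent sign) and Rich's `Live.update(..., refresh=True)` to redraw the panel in place. A standalone sketch of that Rich pattern, independent of TopologyViz:

```python
import time
from rich.console import Console
from rich.live import Live
from rich.panel import Panel

# Keep a single Panel inside a Live region and rewrite its title as
# progress arrives, as TopologyViz.refresh does above.
console = Console()
panel = Panel("cluster layout goes here", title="Exo Cluster (1 node)")
with Live(panel, console=console, auto_refresh=False) as live:
  for current, total in [(1, 4), (2, 4), (4, 4)]:
    panel.title = f"Exo Cluster (1 node) {current / total:.2%} Downloaded"
    live.update(panel, refresh=True)
    time.sleep(0.5)  # stand-in for real progress events
```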

+ 2 - 0
main.py

@@ -1,6 +1,7 @@
 import argparse
 import asyncio
 import signal
+import json
 import uuid
 from exo.orchestration.standard_node import StandardNode
 from exo.networking.grpc.grpc_server import GRPCServer
@@ -58,6 +59,7 @@ node.on_token.register("main_log").on_next(lambda _, tokens, __: print(inference
 if args.prometheus_client_port:
     from exo.stats.metrics import start_metrics_server
     start_metrics_server(node, args.prometheus_client_port)
+inference_engine.set_on_download_progress(lambda current, total: asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "current": current, "total": total}))))
 
 async def shutdown(signal, loop):
     """Gracefully shutdown the server and close the asyncio loop."""

+ 2 - 1
setup.py

@@ -9,7 +9,8 @@ install_requires = [
     "blobfile==2.1.1",
     "blobfile==2.1.1",
     "grpcio==1.64.1",
     "grpcio==1.64.1",
     "grpcio-tools==1.64.1",
     "grpcio-tools==1.64.1",
-    "huggingface-hub==0.23.4",
+    "hf-transfer==0.1.8",
+    "huggingface-hub==0.24.5",
     "Jinja2==3.1.4",
     "Jinja2==3.1.4",
     "numpy==2.0.0",
     "numpy==2.0.0",
     "pillow==10.4.0",
     "pillow==10.4.0",