Переглянути джерело

Preemptively start downloads when any node starts processing a prompt. This fixes #104.

Alex Cheema · 9 місяців тому · батьківський коміт: f29963f41e
3 змінених файлів з 36 додано та 14 видалено
  1. + 19 − 14
      exo/download/hf/hf_shard_download.py
  2. + 3 − 0
      exo/inference/shard.py
  3. + 14 − 0
      main.py

+ 19 - 14
exo/download/hf/hf_shard_download.py

@@ -1,11 +1,12 @@
 import asyncio
+import traceback
 from pathlib import Path
 from typing import Dict, List, Tuple
 from exo.inference.shard import Shard
 from exo.download.shard_download import ShardDownloader
 from exo.download.download_progress import RepoProgressEvent
 from exo.download.hf.hf_helpers import download_repo_files, RepoProgressEvent, get_repo_root, get_weight_map, extract_layer_num
-from exo.helpers import AsyncCallbackSystem
+from exo.helpers import AsyncCallbackSystem, DEBUG
 
 class HFShardDownloader(ShardDownloader):
     def __init__(self):
@@ -13,25 +14,29 @@ class HFShardDownloader(ShardDownloader):
         self._on_progress = AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]()
 
     async def ensure_shard(self, shard: Shard) -> Path:
-        # Cancel any overlapping downloads
-        to_remove = []
+        # If a download on this shard is already in progress, keep that one
         for active_shard, task in self.active_downloads:
-            if shard.overlaps(active_shard):
-                task.cancel()
-                try:
-                    await task
-                except asyncio.CancelledError:
-                    pass  # This is expected when cancelling a task
-                to_remove.append((active_shard, task))
+            if active_shard == shard:
+                return await task
 
-        # Remove cancelled downloads from the list
-        for item in to_remove:
-            self.active_downloads.remove(item)
+        # Cancel any downloads for this model_id on a different shard
+        to_remove = [(active_shard, task) for active_shard, task in self.active_downloads if active_shard.model_id == shard.model_id]
+        for active_shard, task in to_remove:
+            if DEBUG >= 2: print(f"Cancelling download for {active_shard} (replacing with {shard})")
+            task.cancel()
+            try:
+                await task
+            except asyncio.CancelledError:
+                pass  # This is expected when cancelling a task
+            except Exception as e:
+                if DEBUG >= 2: print(f"Error in cancelling download {active_shard}: {e}")
+                traceback.print_exc()
+        if DEBUG >= 2: print(f"Removing cancelled downloads: {to_remove}")
+        self.active_downloads = [(active_shard, task) for active_shard, task in self.active_downloads if active_shard.model_id != shard.model_id]
 
         # Start new download
         download_task = asyncio.create_task(self._download_shard(shard))
         self.active_downloads.append((shard, download_task))
-
         try:
             return await download_task
         finally:

+ 3 - 0
exo/inference/shard.py

@@ -25,6 +25,9 @@ class Shard:
       "n_layers": self.n_layers,
     }
 
+  def from_dict(data: dict) -> 'Shard':
+    return Shard(**data)
+
   def overlaps(self, other: 'Shard') -> bool:
     return shards_overlap(self, other)
 

+ 14 - 0
main.py

@@ -3,6 +3,7 @@ import asyncio
 import signal
 import json
 import time
+import traceback
 from exo.orchestration.standard_node import StandardNode
 from exo.networking.grpc.grpc_server import GRPCServer
 from exo.networking.grpc.grpc_discovery import GRPCDiscovery
@@ -11,6 +12,7 @@ from exo.api import ChatGPTAPI
 from exo.download.shard_download import ShardDownloader
 from exo.download.hf.hf_shard_download import HFShardDownloader
 from exo.helpers import print_yellow_exo, find_available_port, DEBUG, get_inference_engine, get_system_info, get_or_create_node_id
+from exo.inference.shard import Shard
 
 # parse args
 parser = argparse.ArgumentParser(description="Initialize GRPC Discovery")
@@ -60,6 +62,18 @@ server = GRPCServer(node, args.node_host, args.node_port)
 node.server = server
 api = ChatGPTAPI(node, inference_engine.__class__.__name__, response_timeout_secs=args.chatgpt_api_response_timeout_secs)
 node.on_token.register("main_log").on_next(lambda _, tokens, __: print(inference_engine.tokenizer.decode(tokens) if hasattr(inference_engine, "tokenizer") else tokens))
+def preemptively_start_download(request_id: str, opaque_status: str):
+    try:
+        status = json.loads(opaque_status)
+        if status.get("type") == "node_status" and status.get("status") == "start_process_prompt":
+            current_shard = node.get_current_shard(Shard.from_dict(status.get("shard")))
+            if DEBUG >= 2: print(f"Preemptively starting download for {current_shard}")
+            asyncio.create_task(shard_downloader.ensure_shard(current_shard))
+    except Exception as e:
+        if DEBUG >= 2:
+            print(f"Failed to preemptively start download: {e}")
+            traceback.print_exc()
+node.on_opaque_status.register("start_download").on_next(preemptively_start_download)
 if args.prometheus_client_port:
     from exo.stats.metrics import start_metrics_server
     start_metrics_server(node, args.prometheus_client_port)