Browse Source

Merge remote-tracking branch 'upstream/main'

DevEmilio96 7 months ago
parent
commit
425bd214eb
39 changed files with 51 additions and 45 deletions
  1. +4 -4    .circleci/config.yml
  2. +26 -23  README.md
  3. +1 -1    exo/api/chatgpt_api.py
  4. +1 -1    exo/helpers.py
  5. +0 -0    exo/inference/tinygrad/__init__.py
  6. +2 -5    exo/inference/tinygrad/inference.py
  7. +0 -0    exo/inference/tinygrad/models/__init__.py
  8. +13 -3   exo/main.py
  9. +0 -0    exo/networking/tailscale/__init__.py
 10. +0 -0    exo/networking/udp/__init__.py
 11. +0 -0    exo/tinychat/common.css
 12. +0 -0    exo/tinychat/favicon.svg
 13. +0 -0    exo/tinychat/index.css
 14. +1 -1    exo/tinychat/index.html
 15. +0 -0    exo/tinychat/index.js
 16. +0 -0    exo/tinychat/static/cdn.jsdelivr.net/npm/@alpine-collective/toolkit@1.0.2/dist/cdn.min.js
 17. +0 -0    exo/tinychat/static/cdn.jsdelivr.net/npm/@alpinejs/focus@3.x.x/dist/cdn.min.js
 18. +0 -0    exo/tinychat/static/cdn.jsdelivr.net/npm/@alpinejs/intersect@3.x.x/dist/cdn.min.js
 19. +0 -0    exo/tinychat/static/cdn.jsdelivr.net/npm/purecss@3.0.0/build/base-min.css
 20. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/css/all.min.css
 21. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-brands-400.ttf
 22. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-brands-400.woff2
 23. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-regular-400.ttf
 24. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-regular-400.woff2
 25. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-solid-900.ttf
 26. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-solid-900.woff2
 27. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-v4compatibility.ttf
 28. +0 -0    exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-v4compatibility.woff2
 29. +0 -0    exo/tinychat/static/fonts.googleapis.com/css2
 30. +0 -0    exo/tinychat/static/unpkg.com/@highlightjs/cdn-assets@11.9.0/highlight.min.js
 31. +0 -0    exo/tinychat/static/unpkg.com/@highlightjs/cdn-assets@11.9.0/styles/vs2015.min.css
 32. +0 -0    exo/tinychat/static/unpkg.com/@marcreichel/alpine-autosize@1.3.x/dist/alpine-autosize.min.js
 33. +0 -0    exo/tinychat/static/unpkg.com/alpinejs@3.x.x/dist/cdn.min.js
 34. +0 -0    exo/tinychat/static/unpkg.com/dompurify@3.1.5/dist/purify.min.js
 35. +0 -0    exo/tinychat/static/unpkg.com/marked-highlight@2.1.2/lib/index.umd.js
 36. +0 -0    exo/tinychat/static/unpkg.com/marked@13.0.0/marked.min.js
 37. +0 -0    exo/tinychat/update_deps.py
 38. +1 -1    install.sh
 39. +2 -6    setup.py

+ 4 - 4
.circleci/config.yml

@@ -17,11 +17,11 @@ commands:
             source env/bin/activate
 
             # Start first instance
-            HF_HOME="$(pwd)/.hf_cache_node1" DEBUG_DISCOVERY=7 DEBUG=7 python3 main.py --inference-engine <<parameters.inference_engine>> --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout 900 2>&1 | tee output1.log &
+            HF_HOME="$(pwd)/.hf_cache_node1" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <<parameters.inference_engine>> --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout 900 2>&1 | tee output1.log &
             PID1=$!
 
             # Start second instance
-            HF_HOME="$(pwd)/.hf_cache_node2" DEBUG_DISCOVERY=7 DEBUG=7 python3 main.py --inference-engine <<parameters.inference_engine>> --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout 900 2>&1 | tee output2.log &
+            HF_HOME="$(pwd)/.hf_cache_node2" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <<parameters.inference_engine>> --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout 900 2>&1 | tee output2.log &
             PID2=$!
 
             # Wait for discovery
@@ -138,9 +138,9 @@ jobs:
           name: Run discovery integration test
           command: |
             source env/bin/activate
-            DEBUG_DISCOVERY=7 DEBUG=7 python3 main.py --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
+            DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
             PID1=$!
-            DEBUG_DISCOVERY=7 DEBUG=7 python3 main.py --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
+            DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
             PID2=$!
             sleep 10
             kill $PID1 $PID2

+ 26 - 23
README.md

@@ -79,19 +79,11 @@ The current recommended way to install exo is from source.
 
 ### Hardware Requirements
 
-| Component          | MLX Requirements                                              | TinyGrad Requirements (for Llama-3.1-8B or similar)                    |
-|--------------------|---------------------------------------------------------------|------------------------------------------------------------------------|
-| **CPU**            | Apple Silicon (M1, M2, or later) only                         | Minimum: Intel Core i7-12700 or AMD Ryzen 7 5800X <br>Recommended: Intel Core i9-12900K or AMD Ryzen 9 5900X |
-| **GPU**            | Apple Silicon Integrated GPU                                  | Minimum: NVIDIA RTX 4070 (12 GB VRAM) <br>Recommended: NVIDIA RTX 4080 (16 GB VRAM)  |
-| **RAM**            | Minimum: 16 GB <br>Recommended: 32 GB                         | Minimum: 32 GB <br>Recommended: 64 GB                                  |
-| **Storage**        | Minimum: 256 GB SSD <br>Recommended: 512 GB SSD               | Minimum: 512 GB SSD <br>Recommended: 1 TB SSD                          |
-| **Operating System**| macOS (Big Sur)                               | Ubuntu                                              |
-
-**Note**:  
-- For **MLX**, you can currently run **smaller models** such as **Llama-3.2-1B**, which are optimized for Apple Silicon hardware.
-- For **TinyGrad**, the **smallest model** currently supported is **Llama-3.1-8B**, which requires more robust hardware to run effectively.
-- **Hardware requirements are indicative**: The overall load is distributed across the **CPU, RAM**, and **GPU/VRAM**, not solely on the GPU. Therefore, your system's performance depends on its ability to handle this distribution effectively.
-- It is also **possible to run models in a cluster mode**, utilizing multiple devices to distribute the computation load across multiple machines or GPUs, enhancing performance.
+- The only requirement to run exo is to have enough memory across all your devices to fit the entire model into memory. For example, if you are running llama 3.1 8B (fp16), you need 16GB of memory across all devices. Any of the following configurations would work since they each have more than 16GB of memory in total:
+  - 2 x 8GB M3 MacBook Airs
+  - 1 x 16GB NVIDIA RTX 4070 Ti Laptop
+  - 2 x Raspberry Pi 400 with 4GB of RAM each (running on CPU) + 1 x 8GB Mac Mini
+- exo is designed to run on devices with heterogeneous capabilities. For example, you can have some devices with powerful GPUs and others with integrated GPUs or even CPUs. Adding less capable devices will slow down individual inference latency but will increase the overall throughput of the cluster.
 
 ### From source
 
@@ -99,7 +91,7 @@ The current recommended way to install exo is from source.
 ```sh
 git clone https://github.com/exo-explore/exo.git
 cd exo
-pip install .
+pip install -e .
 # alternatively, with venv
 source install.sh
 ```
@@ -124,12 +116,12 @@ source install.sh
 #### Device 1:
 
 ```sh
-python3 main.py
+exo
 ```
 
 #### Device 2:
 ```sh
-python3 main.py
+exo
 ```
 
 That's it! No configuration required - exo will automatically discover the other device(s).
@@ -138,13 +130,13 @@ exo starts a ChatGPT-like WebUI (powered by [tinygrad tinychat](https://github.c
 
 For developers, exo also starts a ChatGPT-compatible API endpoint on http://localhost:8000/v1/chat/completions. Examples with curl:
 
-#### Llama 3.1 8B:
+#### Llama 3.2 3B:
 
 ```sh
 curl http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-     "model": "llama-3.1-8b",
+     "model": "llama-3.2-3b",
      "messages": [{"role": "user", "content": "What is the meaning of exo?"}],
      "temperature": 0.7
    }'
@@ -195,38 +187,49 @@ curl http://localhost:8000/v1/chat/completions \
 #### Device 1 (MacOS):
 
 ```sh
-python3 main.py --inference-engine tinygrad
+exo --inference-engine tinygrad
 ```
 
 Here we explicitly tell exo to use the **tinygrad** inference engine.
 
 #### Device 2 (Linux):
 ```sh
-python3 main.py
+exo
 ```
 
 Linux devices will automatically default to using the **tinygrad** inference engine.
 
 You can read about tinygrad-specific env vars [here](https://docs.tinygrad.org/env_vars/). For example, you can configure tinygrad to use the cpu by specifying `CLANG=1`.
 
+### Example Usage on a single device with "exo run" command
+
+```sh
+exo run llama-3.2-3b
+```
+
+With a custom prompt:
+
+```sh
+exo run llama-3.2-3b --prompt "What is the meaning of exo?"
+```
 
 ## Debugging
 
 Enable debug logs with the DEBUG environment variable (0-9).
 
 ```sh
-DEBUG=9 python3 main.py
+DEBUG=9 exo
 ```
 
 For the **tinygrad** inference engine specifically, there is a separate DEBUG flag `TINYGRAD_DEBUG` that can be used to enable debug logs (1-6).
 
 ```sh
-TINYGRAD_DEBUG=2 python3 main.py
+TINYGRAD_DEBUG=2 exo
 ```
 
 ## Known Issues
 
-- On some versions of MacOS/Python, certificates are not installed properly which can lead to SSL errors (e.g. SSL error with huggingface.co). To fix this, run the Install Certificates command, usually: 
+- On some versions of MacOS/Python, certificates are not installed properly which can lead to SSL errors (e.g. SSL error with huggingface.co). To fix this, run the Install Certificates command, usually:
 
 ```sh
/Applications/Python 3.x/Install Certificates.command
```
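
The memory rule added to the README's Hardware Requirements section (enough combined memory across devices to fit the model) is simple arithmetic: parameter count times bytes per parameter. A minimal sketch of that check, with illustrative helper names:

```python
# fp16 stores 2 bytes per parameter, so llama 3.1 8B needs ~8e9 * 2 B = 16 GB.
def model_memory_gb(params_billion: float, bytes_per_param: int = 2) -> float:
    return params_billion * bytes_per_param

def cluster_fits(device_memory_gb: list[float], params_billion: float) -> bool:
    # exo only needs the *total* memory across all devices to cover the model.
    return sum(device_memory_gb) >= model_memory_gb(params_billion)

print(cluster_fits([8, 8], 8))     # two 8GB M3 MacBook Airs -> True
print(cluster_fits([4, 4, 8], 8))  # two 4GB Pi 400s + 8GB Mac Mini -> True
```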

+ 1 - 1
exo/api/chatgpt_api.py

@@ -179,7 +179,7 @@ class ChatGPTAPI:
     # Endpoint for download progress tracking
     cors.add(self.app.router.add_get("/v1/download/progress", self.handle_get_download_progress), {"*": cors_options})
 
-    self.static_dir = Path(__file__).parent.parent.parent/"tinychat/examples/tinychat"
+    self.static_dir = Path(__file__).parent.parent/"tinychat"
     self.app.router.add_get("/", self.handle_root)
     self.app.router.add_static("/", self.static_dir, name="static")
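
The relocated static_dir resolves the tinychat assets relative to the installed exo package itself, rather than a sibling checkout, so they travel with the wheel (see the package_data entry added in setup.py below). A minimal sketch of the path arithmetic, with an illustrative install prefix:

```python
from pathlib import Path

# If chatgpt_api.py is installed at .../site-packages/exo/api/chatgpt_api.py,
# two .parent hops climb to the exo package root, where tinychat/ now lives.
module_file = Path("/site-packages/exo/api/chatgpt_api.py")  # illustrative path
static_dir = module_file.parent.parent / "tinychat"
print(static_dir)  # /site-packages/exo/tinychat
```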
 

+ 1 - 1
exo/helpers.py

@@ -170,7 +170,7 @@ def is_valid_uuid(val):
 
 
 def get_or_create_node_id():
-  NODE_ID_FILE = Path(os.path.dirname(os.path.abspath(__file__)))/".exo_node_id"
+  NODE_ID_FILE = Path(tempfile.gettempdir()) / ".exo_node_id"
   try:
     if NODE_ID_FILE.is_file():
       with open(NODE_ID_FILE, "r") as f:
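
The node ID file moves from the installed package directory to the system temp directory, which stays writable even when exo sits in a read-only site-packages tree. A sketch of the full read-or-create pattern; the body beyond the visible hunk is reconstructed as an assumption and may differ from the actual function:

```python
import tempfile
import uuid
from pathlib import Path

def get_or_create_node_id() -> str:
    # tempfile.gettempdir() is writable on every platform, unlike the old
    # location inside the installed package tree.
    node_id_file = Path(tempfile.gettempdir()) / ".exo_node_id"
    if node_id_file.is_file():
        return node_id_file.read_text().strip()
    node_id = str(uuid.uuid4())
    node_id_file.write_text(node_id)
    return node_id
```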

+ 0 - 0
exo/inference/tinygrad/__init__.py


+ 2 - 5
exo/inference/tinygrad/inference.py

@@ -4,9 +4,8 @@ import os
 from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16
 from exo.inference.shard import Shard
 from exo.inference.tokenizers import resolve_tokenizer
-from tinygrad.nn.state import safe_load, torch_load, load_state_dict
-from tinygrad import Tensor, dtypes, nn, Context
-from transformers import AutoTokenizer
+from tinygrad.nn.state import load_state_dict
+from tinygrad import Tensor, nn, Context
 from exo.inference.inference_engine import InferenceEngine
 from typing import Optional, Tuple
 import numpy as np
@@ -14,8 +13,6 @@ from exo.inference.tinygrad.tinygrad_helpers import concat_weights, load
 from exo.download.shard_download import ShardDownloader
 from concurrent.futures import ThreadPoolExecutor
 import asyncio
-import threading
-from functools import partial
 
 Tensor.no_grad = True
 # default settings

+ 0 - 0
exo/inference/tinygrad/models/__init__.py


+ 13 - 3
main.py → exo/main.py

@@ -5,6 +5,7 @@ import json
 import time
 import traceback
 import uuid
+import sys
 from exo.orchestration.standard_node import StandardNode
 from exo.networking.grpc.grpc_server import GRPCServer
 from exo.networking.udp.udp_discovery import UDPDiscovery
@@ -24,6 +25,8 @@ from exo.viz.topology_viz import TopologyViz
 
 # parse args
 parser = argparse.ArgumentParser(description="Initialize GRPC Discovery")
+parser.add_argument("command", nargs="?", choices=["run"], help="Command to run")
+parser.add_argument("model_name", nargs="?", help="Model name to run")
 parser.add_argument("--node-id", type=str, default=None, help="Node ID")
 parser.add_argument("--node-host", type=str, default="0.0.0.0", help="Node host")
 parser.add_argument("--node-port", type=int, default=None, help="Node port")
@@ -186,14 +189,18 @@ async def main():
 
   await node.start(wait_for_peers=args.wait_for_peers)
 
-  if args.run_model:
-    await run_model_cli(node, inference_engine, args.run_model, args.prompt)
+  if args.command == "run" or args.run_model:
+    model_name = args.model_name or args.run_model
+    if not model_name:
+      print("Error: Model name is required when using 'run' command or --run-model")
+      return
+    await run_model_cli(node, inference_engine, model_name, args.prompt)
   else:
     asyncio.create_task(api.run(port=args.chatgpt_api_port))  # Start the API server as a non-blocking task
     await asyncio.Event().wait()
 
 
-if __name__ == "__main__":
+def run():
   loop = asyncio.new_event_loop()
   asyncio.set_event_loop(loop)
   try:
@@ -203,3 +210,6 @@ if __name__ == "__main__":
   finally:
     loop.run_until_complete(shutdown(signal.SIGTERM, loop))
     loop.close()
+
+if __name__ == "__main__":
+  run()
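
Both new positional arguments use nargs="?", which is what lets one parser serve the bare `exo`, the new `exo run <model>`, and the legacy --run-model flag. A quick self-contained check of the parsing behavior:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("command", nargs="?", choices=["run"], help="Command to run")
parser.add_argument("model_name", nargs="?", help="Model name to run")

# "exo run llama-3.2-3b" style invocation:
args = parser.parse_args(["run", "llama-3.2-3b"])
assert args.command == "run" and args.model_name == "llama-3.2-3b"

# Bare "exo": both positionals fall back to None and the API server starts.
args = parser.parse_args([])
assert args.command is None and args.model_name is None
```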

+ 0 - 0
exo/networking/tailscale/__init__.py


+ 0 - 0
exo/networking/udp/__init__.py


+ 0 - 0
tinychat/examples/tinychat/common.css → exo/tinychat/common.css


+ 0 - 0
tinychat/examples/tinychat/favicon.svg → exo/tinychat/favicon.svg


+ 0 - 0
tinychat/examples/tinychat/index.css → exo/tinychat/index.css


+ 1 - 1
tinychat/examples/tinychat/index.html → exo/tinychat/index.html

@@ -22,7 +22,7 @@
 <link href="/static/unpkg.com/@highlightjs/cdn-assets@11.9.0/styles/vs2015.min.css" rel="stylesheet"/>
 <link href="/index.css" rel="stylesheet"/>
 <link href="/common.css" rel="stylesheet"/>
-</link></head>
+</head>
 <body>
 <main x-data="state" x-init="console.log(endpoint)">
      <!-- Error Toast -->

+ 0 - 0
tinychat/examples/tinychat/index.js → exo/tinychat/index.js


+ 0 - 0
tinychat/examples/tinychat/static/cdn.jsdelivr.net/npm/@alpine-collective/toolkit@1.0.2/dist/cdn.min.js → exo/tinychat/static/cdn.jsdelivr.net/npm/@alpine-collective/toolkit@1.0.2/dist/cdn.min.js


+ 0 - 0
tinychat/examples/tinychat/static/cdn.jsdelivr.net/npm/@alpinejs/focus@3.x.x/dist/cdn.min.js → exo/tinychat/static/cdn.jsdelivr.net/npm/@alpinejs/focus@3.x.x/dist/cdn.min.js


+ 0 - 0
tinychat/examples/tinychat/static/cdn.jsdelivr.net/npm/@alpinejs/intersect@3.x.x/dist/cdn.min.js → exo/tinychat/static/cdn.jsdelivr.net/npm/@alpinejs/intersect@3.x.x/dist/cdn.min.js


+ 0 - 0
tinychat/examples/tinychat/static/cdn.jsdelivr.net/npm/purecss@3.0.0/build/base-min.css → exo/tinychat/static/cdn.jsdelivr.net/npm/purecss@3.0.0/build/base-min.css


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/css/all.min.css → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/css/all.min.css


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-brands-400.ttf → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-brands-400.ttf


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-brands-400.woff2 → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-brands-400.woff2


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-regular-400.ttf → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-regular-400.ttf


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-regular-400.woff2 → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-regular-400.woff2


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-solid-900.ttf → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-solid-900.ttf


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-solid-900.woff2 → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-solid-900.woff2


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-v4compatibility.ttf → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-v4compatibility.ttf


+ 0 - 0
tinychat/examples/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-v4compatibility.woff2 → exo/tinychat/static/cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.2/webfonts/fa-v4compatibility.woff2


+ 0 - 0
tinychat/examples/tinychat/static/fonts.googleapis.com/css2 → exo/tinychat/static/fonts.googleapis.com/css2


+ 0 - 0
tinychat/examples/tinychat/static/unpkg.com/@highlightjs/cdn-assets@11.9.0/highlight.min.js → exo/tinychat/static/unpkg.com/@highlightjs/cdn-assets@11.9.0/highlight.min.js


+ 0 - 0
tinychat/examples/tinychat/static/unpkg.com/@highlightjs/cdn-assets@11.9.0/styles/vs2015.min.css → exo/tinychat/static/unpkg.com/@highlightjs/cdn-assets@11.9.0/styles/vs2015.min.css


+ 0 - 0
tinychat/examples/tinychat/static/unpkg.com/@marcreichel/alpine-autosize@1.3.x/dist/alpine-autosize.min.js → exo/tinychat/static/unpkg.com/@marcreichel/alpine-autosize@1.3.x/dist/alpine-autosize.min.js


+ 0 - 0
tinychat/examples/tinychat/static/unpkg.com/alpinejs@3.x.x/dist/cdn.min.js → exo/tinychat/static/unpkg.com/alpinejs@3.x.x/dist/cdn.min.js


+ 0 - 0
tinychat/examples/tinychat/static/unpkg.com/dompurify@3.1.5/dist/purify.min.js → exo/tinychat/static/unpkg.com/dompurify@3.1.5/dist/purify.min.js


+ 0 - 0
tinychat/examples/tinychat/static/unpkg.com/marked-highlight@2.1.2/lib/index.umd.js → exo/tinychat/static/unpkg.com/marked-highlight@2.1.2/lib/index.umd.js


+ 0 - 0
tinychat/examples/tinychat/static/unpkg.com/marked@13.0.0/marked.min.js → exo/tinychat/static/unpkg.com/marked@13.0.0/marked.min.js


+ 0 - 0
tinychat/examples/tinychat/update_deps.py → exo/tinychat/update_deps.py


+ 1 - 1
install.sh

@@ -2,4 +2,4 @@
 
 python3 -m venv .venv
 source .venv/bin/activate
-pip install .
+pip install -e .

+ 2 - 6
setup.py

@@ -7,12 +7,8 @@ install_requires = [
   "aiohttp==3.10.2",
   "aiohttp_cors==0.7.0",
   "aiofiles==24.1.0",
-  "blobfile==2.1.1",
   "grpcio==1.64.1",
   "grpcio-tools==1.64.1",
-  "hf-transfer==0.1.8",
-  "huggingface-hub==0.24.5",
-  "Jinja2==3.1.4",
   "netifaces==0.11.0",
   "numpy==2.0.0",
   "pillow==10.4.0",
@@ -25,8 +21,6 @@ install_requires = [
   "safetensors==0.4.3",
   "tailscale==0.6.1",
   "tenacity==9.0.0",
-  "tiktoken==0.7.0",
-  "tokenizers==0.19.1",
   "tqdm==4.66.4",
   "transformers==4.43.3",
   "uuid==1.30",
@@ -55,4 +49,6 @@ setup(
   packages=find_packages(),
   install_requires=install_requires,
   extras_require=extras_require,
+  package_data={"exo": ["tinychat/**/*"]},
+  entry_points={"console_scripts": ["exo = exo.main:run"]},
 )
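
The console_scripts entry point is what backs the bare `exo` command used throughout the updated README and CircleCI config: on install, pip generates a small launcher that imports exo.main and calls run(), while package_data bundles the tinychat assets alongside the code. Roughly what that generated launcher does (a simplified sketch, not the literal file pip writes):

```python
# Approximation of the launcher generated for "exo = exo.main:run"
import sys
from exo.main import run

if __name__ == "__main__":
    sys.exit(run())
```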