
chatgpt api integration

Varshith · 9 months ago
Parent commit: acc94b50c7

+ 35 - 8
exo/api/chatgpt_api.py

@@ -3,7 +3,7 @@ import time
 import asyncio
 import json
 from pathlib import Path
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoProcessor
 from typing import List, Literal, Union, Dict
 from aiohttp import web
 import aiohttp_cors
@@ -42,11 +42,15 @@ shard_mappings = {
   "deepseek-coder-v2-lite": {
     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", start_layer=0, end_layer=0, n_layers=27),
   },
+  ### llava
+  "llava-1.5-7b-hf": {
+    "MLXDynamicShardInferenceEngine": Shard(model_id="llava-hf/llava-1.5-7b-hf", start_layer=0, end_layer=0, n_layers=32),
+  },
 }
 
 
 class Message:
-  def __init__(self, role: str, content: str):
+  def __init__(self, role: str, content: Union[str, list]):
     self.role = role
     self.content = content
 
@@ -68,6 +72,18 @@ def resolve_tinygrad_tokenizer(model_id: str):
 
 
 async def resolve_tokenizer(model_id: str):
+  try:
+    if DEBUG >= 2: print(f"Trying AutoProcessor for {model_id}")
+    processor = AutoProcessor.from_pretrained(model_id)
+    processor.eos_token_id = processor.tokenizer.eos_token_id
+    processor.encode = processor.tokenizer.encode
+    return processor
+  except Exception as e:
+    if DEBUG >= 2: print(f"Failed to load processor for {model_id}. Error: {e}")
+    import traceback
+
+    if DEBUG >= 2: print(traceback.format_exc())
+
   try:
     if DEBUG >= 2: print(f"Trying AutoTokenizer for {model_id}")
     return AutoTokenizer.from_pretrained(model_id)
@@ -138,7 +154,18 @@ def generate_completion(
 
 
 def build_prompt(tokenizer, messages: List[Message]):
-  return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+  image_str = None
+  for message in messages:
+    if not isinstance(message.content, list):
+      continue
+
+    for content in message.content:
+      if content.get("type", None) == "image":
+        image_str = content.get("image", None)
+        break
+
+  return prompt, image_str
 
 
 def parse_message(data: dict):
@@ -195,7 +222,7 @@ class ChatGPTAPI:
     shard = shard_mappings.get(data.get("model", "llama-3.1-8b"), {}).get(self.inference_engine_classname)
     messages = [parse_message(msg) for msg in data.get("messages", [])]
     tokenizer = await resolve_tokenizer(shard.model_id)
-    return web.json_response({"length": len(build_prompt(tokenizer, messages))})
+    return web.json_response({"length": len(build_prompt(tokenizer, messages)[0])})
 
   async def handle_post_chat_completions(self, request):
     data = await request.json()
@@ -219,13 +246,13 @@ class ChatGPTAPI:
     tokenizer = await resolve_tokenizer(shard.model_id)
     if DEBUG >= 4: print(f"Resolved tokenizer: {tokenizer}")
 
-    prompt = build_prompt(tokenizer, chat_request.messages)
+    prompt, image_str = build_prompt(tokenizer, chat_request.messages)
     callback_id = f"chatgpt-api-wait-response-{request_id}"
     callback = self.node.on_token.register(callback_id)
 
-    if DEBUG >= 2: print(f"Sending prompt from ChatGPT api {request_id=} {shard=} {prompt=}")
+    if DEBUG >= 2: print(f"Sending prompt from ChatGPT api {request_id=} {shard=} {prompt=} {image_str=}")
     try:
-      await self.node.process_prompt(shard, prompt, request_id=request_id)
+      await self.node.process_prompt(shard, prompt, image_str, request_id=request_id)
     except Exception as e:
       if DEBUG >= 2:
         import traceback
@@ -294,7 +321,7 @@ class ChatGPTAPI:
         )
 
         finish_reason = "length"
-        eos_token_id = tokenizer.special_tokens_map.get("eos_token_id") if isinstance(tokenizer._tokenizer, AutoTokenizer) else tokenizer.eos_token_id
+        eos_token_id = tokenizer.special_tokens_map.get("eos_token_id") if isinstance(getattr(tokenizer, "_tokenizer", None), AutoTokenizer) else tokenizer.eos_token_id
         if DEBUG >= 2: print(f"Checking if end of tokens result {tokens[-1]=} is {eos_token_id=}")
         if tokens[-1] == eos_token_id:
           tokens = tokens[:-1]
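
For reference, a minimal client-side sketch of a request that exercises the new image path. The host, port, and image URL are assumptions (the serving endpoint itself is not shown in this diff); the only hard requirement build_prompt() introduces is a message whose content is a list containing an item with "type": "image" and an "image" value, which may be an http(s) URL or a base64-encoded image per get_image_from_str further down.

# Hypothetical client call against the ChatGPT-compatible API with an image message.
import requests

payload = {
  "model": "llava-1.5-7b-hf",
  "messages": [{
    "role": "user",
    "content": [
      {"type": "text", "text": "What is shown in this image?"},
      {"type": "image", "image": "https://example.com/cat.png"},  # URL or base64 string
    ],
  }],
}
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
print(resp.json())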

+ 10 - 3
exo/inference/mlx/sharded_inference_engine.py

@@ -2,7 +2,7 @@ import numpy as np
 import mlx.core as mx
 from ..inference_engine import InferenceEngine
 from .sharded_model import StatefulShardedModel
-from .sharded_utils import load_shard
+from .sharded_utils import load_shard, get_image_from_str
 from ..shard import Shard
 from typing import Optional
 
@@ -11,9 +11,16 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
   def __init__(self):
     self.shard = None
 
-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
+  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
     await self.ensure_shard(shard)
-    output_data: np.ndarray = np.array(self.stateful_sharded_model.step(request_id, mx.array(self.tokenizer.encode(prompt))))
+    if image_str:
+      image = get_image_from_str(image_str)
+      inputs = self.tokenizer(prompt, image, return_tensors="np")
+      pixel_values = mx.array(inputs["pixel_values"])
+      input_ids = mx.array(inputs["input_ids"])
+      output_data: np.ndarray = np.array(self.stateful_sharded_model.step(request_id, input_ids, pixel_values))
+    else:
+      output_data: np.ndarray = np.array(self.stateful_sharded_model.step(request_id, mx.array(self.tokenizer.encode(prompt))))
     return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id
 
   async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
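
A standalone sketch of what the image branch above does with the processor (the prompt text and image file are illustrative). For llava shards, load_shard() returns an AutoProcessor that wraps both a tokenizer and an image processor, so calling it with (prompt, image) and return_tensors="np" yields the "input_ids" and "pixel_values" arrays that infer_prompt converts to mx.array and passes to step().

# Hedged sketch mirroring the processor call in infer_prompt(); not a drop-in test.
from transformers import AutoProcessor
from PIL import Image

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
image = Image.open("example.png")  # in the engine this comes from get_image_from_str(image_str)
inputs = processor("USER: <image>\nWhat is in this picture? ASSISTANT:", image, return_tensors="np")
print(inputs["input_ids"].shape, inputs["pixel_values"].shape)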

+ 15 - 0
exo/inference/mlx/sharded_utils.py

@@ -8,6 +8,10 @@ import asyncio
 from functools import partial
 from pathlib import Path
 from typing import Optional, Tuple
+import base64
+import requests
+from PIL import Image
+from io import BytesIO
 
 import mlx.core as mx
 import mlx.nn as nn
@@ -222,7 +225,18 @@ async def load_shard(
   # TODO: figure out a generic solution
   if model.model_type == "llava":
     processor = AutoProcessor.from_pretrained(model_path)
+    processor.eos_token_id = processor.tokenizer.eos_token_id
+    processor.encode = processor.tokenizer.encode
     return model, processor
   else:
     tokenizer = load_tokenizer(model_path, tokenizer_config)
     return model, tokenizer
+
+def get_image_from_str(image_str: str):
+  if image_str.startswith("http"):
+    response = requests.get(image_str, timeout=10)
+    image = Image.open(BytesIO(response.content)).convert("RGB")
+  else:
+    imgdata = base64.b64decode(image_str)
+    image = Image.open(BytesIO(imgdata))
+  return image
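
A quick usage sketch for get_image_from_str (file names are illustrative): the helper accepts either an http(s) URL, which is fetched with requests, or a raw base64 string, which is decoded in memory.

# Hypothetical usage of get_image_from_str with both supported input forms.
import base64

img_from_url = get_image_from_str("https://example.com/photo.jpg")

with open("photo.jpg", "rb") as f:
  b64 = base64.b64encode(f.read()).decode("utf-8")
img_from_b64 = get_image_from_str(b64)

print(img_from_url.size, img_from_b64.size)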

+ 5 - 3
exo/inference/mlx/test_sharded_llava.py

@@ -15,9 +15,11 @@ shard_full = Shard("llava", 0, 31, 32)
 shard1 = Shard("llava", 0, 12, 32)
 shard2 = Shard("llava", 13, 31, 32)
 
-full_model_shard, full_processor = asyncio.run(load_shard("llava-hf/llava-1.5-7b-hf", shard=shard_full))
-model_shard1, processor1 = asyncio.run(load_shard("llava-hf/llava-1.5-7b-hf", shard=shard1))
-model_shard2, processor2 = asyncio.run(load_shard("llava-hf/llava-1.5-7b-hf", shard=shard2))
+model_path = "llava-hf/llava-1.5-7b-hf"
+
+full_model_shard, full_processor = asyncio.run(load_shard(model_path, shard=shard_full))
+model_shard1, processor1 = asyncio.run(load_shard(model_path, shard=shard1))
+model_shard2, processor2 = asyncio.run(load_shard(model_path, shard=shard2))
 
 full = StatefulShardedModel(shard_full, full_model_shard)
 m1 = StatefulShardedModel(shard1, model_shard1)

+ 2 - 1
exo/networking/grpc/grpc_peer_handle.py

@@ -39,9 +39,10 @@ class GRPCPeerHandle(PeerHandle):
     self.channel = None
     self.stub = None
 
-  async def send_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
+  async def send_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
     request = node_service_pb2.PromptRequest(
       prompt=prompt,
+      image_str=image_str,
       shard=node_service_pb2.Shard(
         model_id=shard.model_id,
         start_layer=shard.start_layer,

+ 3 - 2
exo/networking/grpc/grpc_server.py

@@ -45,9 +45,10 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
       n_layers=request.shard.n_layers,
     )
     prompt = request.prompt
+    image_str = request.image_str
     request_id = request.request_id
-    result = await self.node.process_prompt(shard, prompt, request_id)
-    if DEBUG >= 2: print(f"SendPrompt {shard=} {prompt=} {request_id=} result: {result}")
+    result = await self.node.process_prompt(shard, prompt, image_str, request_id)
+    if DEBUG >= 2: print(f"SendPrompt {shard=} {prompt=} {image_str=} {request_id=} result: {result}")
     tensor_data = result.tobytes() if result is not None else None
     return node_service_pb2.Tensor(tensor_data=tensor_data, shape=result.shape, dtype=str(result.dtype)) if result is not None else node_service_pb2.Tensor()
 

+ 3 - 2
exo/networking/grpc/node_service.proto

@@ -21,8 +21,9 @@ message Shard {
 message PromptRequest {
   Shard shard = 1;
   string prompt = 2;
-  optional string request_id = 3;
-  optional string inference_state = 4;
+  optional string image_str = 3;
+  optional string request_id = 4;
+  optional string inference_state = 5;
 }
 
 message TensorRequest {
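
With the renumbered fields, a peer can attach an image to a prompt over gRPC. Below is a hedged sketch of building the extended PromptRequest by hand; the channel address and field values are illustrative, and the generated NodeServiceStub class name is the conventional one for this service rather than something shown in this diff.

# Hypothetical direct call to SendPrompt with an image_str attached.
import grpc
from exo.networking.grpc import node_service_pb2, node_service_pb2_grpc

async def send_image_prompt(address: str):
  async with grpc.aio.insecure_channel(address) as channel:
    stub = node_service_pb2_grpc.NodeServiceStub(channel)
    request = node_service_pb2.PromptRequest(
      prompt="USER: <image>\nDescribe the image. ASSISTANT:",
      image_str="https://example.com/cat.png",
      shard=node_service_pb2.Shard(model_id="llava-hf/llava-1.5-7b-hf", start_layer=0, end_layer=0, n_layers=32),
      request_id="demo-request",
    )
    return await stub.SendPrompt(request)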

+ 0 - 0
exo/networking/grpc/node_service_pb2.py
(File diff not shown because this file is too large.)


+ 1 - 1
exo/networking/peer_handle.py

@@ -28,7 +28,7 @@ class PeerHandle(ABC):
     pass
 
   @abstractmethod
-  async def send_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
+  async def send_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
     pass
 
   @abstractmethod

+ 1 - 1
exo/orchestration/node.py

@@ -16,7 +16,7 @@ class Node(ABC):
     pass
 
   @abstractmethod
-  async def process_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
+  async def process_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
     pass
 
   @abstractmethod

+ 12 - 9
exo/orchestration/standard_node.py

@@ -69,7 +69,7 @@ class StandardNode(Node):
     await self.discovery.stop()
     await self.server.stop()
 
-  async def process_prompt(self, base_shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
+  async def process_prompt(self, base_shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
     shard = self.get_current_shard(base_shard)
     asyncio.create_task(
       self.broadcast_opaque_status(
@@ -82,6 +82,7 @@ class StandardNode(Node):
             "base_shard": base_shard.to_dict(),
             "shard": shard.to_dict(),
             "prompt": prompt,
+            "image_str": image_str,
             "inference_state": inference_state,
             "request_id": request_id,
           }
@@ -89,7 +90,7 @@ class StandardNode(Node):
       )
     )
     start_time = time.perf_counter_ns()
-    resp = await self._process_prompt(base_shard, prompt, request_id, inference_state)
+    resp = await self._process_prompt(base_shard, prompt, image_str, request_id, inference_state)
     end_time = time.perf_counter_ns()
     elapsed_time_ns = end_time - start_time
     asyncio.create_task(
@@ -103,6 +104,7 @@ class StandardNode(Node):
             "base_shard": base_shard.to_dict(),
             "shard": shard.to_dict(),
             "prompt": prompt,
+            "image_str": image_str,
             "inference_state": inference_state,
             "request_id": request_id,
             "elapsed_time_ns": elapsed_time_ns,
@@ -113,20 +115,20 @@ class StandardNode(Node):
     )
     return resp
 
-  async def _process_prompt(self, base_shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
+  async def _process_prompt(self, base_shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
     if request_id is None:
       request_id = str(uuid.uuid4())
     if request_id not in self.buffered_token_output:
       self.buffered_token_output[request_id] = ([], False)
     shard = self.get_current_shard(base_shard)
 
-    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=}")
+    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=} {image_str=}")
     if shard.start_layer != 0:
-      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=}")
-      await self.forward_to_next_shard(shard, prompt, request_id)
+      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=} {image_str=}")
+      await self.forward_to_next_shard(shard, prompt, request_id, image_str)
       return
 
-    result, inference_state, is_finished = await self.inference_engine.infer_prompt(request_id, shard, prompt, inference_state=inference_state)
+    result, inference_state, is_finished = await self.inference_engine.infer_prompt(request_id, shard, prompt, image_str, inference_state=inference_state)
     is_finished = is_finished or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
     if is_finished:
       self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
@@ -234,6 +236,7 @@ class StandardNode(Node):
     base_shard: Shard,
     tensor_or_prompt: Union[np.ndarray, str],
     request_id: str,
+    image_str: Optional[str] = None,
     inference_state: Optional[str] = None,
   ) -> None:
     if not self.partitioning_strategy:
@@ -255,7 +258,7 @@ class StandardNode(Node):
         if isinstance(tensor_or_prompt, np.ndarray):
           await self.process_tensor(shard, tensor_or_prompt, request_id, inference_state=inference_state)
         else:
-          await self.process_prompt(shard, tensor_or_prompt, request_id, inference_state=inference_state)
+          await self.process_prompt(shard, tensor_or_prompt, image_str, request_id, inference_state=inference_state)
         return
 
       target_peer = next((p for p in self.peers if p.id() == next_partition.node_id), None)
@@ -267,7 +270,7 @@ class StandardNode(Node):
       if isinstance(tensor_or_prompt, np.ndarray):
         await target_peer.send_tensor(next_shard, tensor_or_prompt, request_id=request_id, inference_state=inference_state)
       else:
-        await target_peer.send_prompt(next_shard, tensor_or_prompt, request_id=request_id, inference_state=inference_state)
+        await target_peer.send_prompt(next_shard, tensor_or_prompt, image_str=image_str, request_id=request_id, inference_state=inference_state)
 
   def get_current_shard(self, base_shard: Shard) -> Shard:
     partitions = self.partitioning_strategy.partition(self.topology)

+ 1 - 1
setup.py

@@ -22,7 +22,7 @@ install_requires = [
     "tiktoken==0.7.0",
     "tokenizers==0.19.1",
     "tqdm==4.66.4",
-    "transformers==4.41.2",
+    "transformers==4.43.3",
     "uuid==1.30",
     "tinygrad @ git+https://github.com/tinygrad/tinygrad.git@a9f5a764dc640a5e5cbaaeeee21df7c8ca37da38",
 ]

Some files were not shown because too many files have changed in this diff.