8 months ago · b400a442ee
--- a/exo/api/chatgpt_api.py
+++ b/exo/api/chatgpt_api.py
@@ -117,19 +117,11 @@ def remap_messages(messages: List[Message]) -> List[Message]:
 
															 def build_prompt(tokenizer, _messages: List[Message]):
														
 
															   messages = remap_messages(_messages)
														
 
															   prompt = tokenizer.apply_chat_template([m.to_dict() for m in messages], tokenize=False, add_generation_prompt=True)
														
 
															-  image_str = None
														
 
															   for message in messages:
														
 
															     if not isinstance(message.content, list):
														
 
															       continue
														
 
															-    for content in message.content:
														
 
															-      # note: we only support one image at a time right now. Multiple is possible. See: https://github.com/huggingface/transformers/blob/e68ec18ce224af879f22d904c7505a765fb77de3/docs/source/en/model_doc/llava.md?plain=1#L41
														
 
															-      # follows the convention in https://platform.openai.com/docs/guides/vision
														
 
															-      if isinstance(content, dict) and content.get("type", None) == "image":
														
 
															-        image_str = content.get("image", None)
														
 
															-        break
														
 
															-
														
 
															-  return prompt, image_str
														
 
															+  return prompt
														
 
															 def parse_message(data: dict):
														
@@ -246,7 +238,7 @@ class ChatGPTAPI:
 
															     tokenizer = await resolve_tokenizer(shard.model_id)
														
 
															     if DEBUG >= 4: print(f"Resolved tokenizer: {tokenizer}")
														
 
															-    prompt, image_str = build_prompt(tokenizer, chat_request.messages)
														
 
															+    prompt = build_prompt(tokenizer, chat_request.messages)
														
 
															     request_id = str(uuid.uuid4())
														
 
															     if self.on_chat_completion_request:
														
 
															       try:
														
@@ -269,10 +261,10 @@ class ChatGPTAPI:
 
															     callback_id = f"chatgpt-api-wait-response-{request_id}"
														
 
															     callback = self.node.on_token.register(callback_id)
														
 
															-    if DEBUG >= 2: print(f"Sending prompt from ChatGPT api {request_id=} {shard=} {prompt=} {image_str=}")
														
 
															+    if DEBUG >= 2: print(f"Sending prompt from ChatGPT api {request_id=} {shard=} {prompt=}")
														
 
															     try:
														
 
															-      await asyncio.wait_for(asyncio.shield(asyncio.create_task(self.node.process_prompt(shard, prompt, image_str, request_id=request_id))), timeout=self.response_timeout)
														
 
															+      await asyncio.wait_for(asyncio.shield(asyncio.create_task(self.node.process_prompt(shard, prompt, request_id=request_id))), timeout=self.response_timeout)
														
 
															       if DEBUG >= 2: print(f"Waiting for response to finish. timeout={self.response_timeout}s")
														
--- a/exo/inference/dummy_inference_engine.py
+++ b/exo/inference/dummy_inference_engine.py
@@ -1,60 +1,38 @@
 
															 from typing import Optional, Tuple, TYPE_CHECKING
														
 
															 import numpy as np
														
 
															+import random
														
 
															+import string
														
 
															 import asyncio
														
 
															 import json
														
 
															 from exo.inference.inference_engine import InferenceEngine
														
 
															 from exo.inference.shard import Shard
														
 
															-
														
 
															+def random_string(length: int):
														
 
															+  return ''.join([random.choice(string.ascii_lowercase) for i in range(length)])
														
 
															+  
														
 
															 class DummyInferenceEngine(InferenceEngine):
														
 
															   def __init__(self):
														
 
															     self.shard = None
														
 
															     self.vocab_size = 1000
														
 
															+    self.hidden_size = 256
														
 
															     self.eos_token_id = 0
														
 
															     self.latency_mean = 0.1
														
 
															     self.latency_stddev = 0.02
														
 
															-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
														
 
															-    try:
														
 
															-      await self.ensure_shard(shard)
														
 
															-
														
 
															-      # Generate random tokens
														
 
															-      output_length = np.random.randint(1, 10)
														
 
															-      output = np.random.randint(1, self.vocab_size, size=(1, output_length))
														
 
															-
														
 
															-      # Simulate latency
														
 
															-      await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
														
 
															-
														
 
															-      # Randomly decide if finished
														
 
															-      is_finished = np.random.random() < 0.2
														
 
															-      if is_finished:
														
 
															-        output = np.array([[self.eos_token_id]])
														
 
															-
														
 
															-      new_state = json.dumps({"dummy_state": "some_value"})
														
 
															+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
														
 
															+    return np.random.randint(1, self.vocab_size, size=(1, len(prompt.split())))
														
 
															+  
														
 
															+  async def sample(self, x: np.ndarray) -> np.ndarray:
														
 
															+    return np.random.randint(1, self.vocab_size)
														
 
															-      return output, new_state, is_finished
														
 
															-    except Exception as e:
														
 
															-      print(f"Error in DummyInferenceEngine.infer_prompt: {str(e)}")
														
 
															-      return np.array([[self.eos_token_id]]), json.dumps({"error": str(e)}), True
														
 
															+  async def decode(self, shard: Shard, tokens: np.ndarray) -> str:
														
 
															+    return ' '.join([random_string(np.random.randint(1, 34)) for token in tokens])
														
 
															-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
														
 
															+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> np.ndarray:
														
 
															     await self.ensure_shard(shard)
														
 
															-    state = json.loads(inference_state or "{}")
														
 
															-    start_pos = state.get("start_pos", 0)
														
 
															-
														
 
															-    output_length = np.random.randint(1, 10)
														
 
															-    output = np.random.randint(1, self.vocab_size, size=(1, output_length))
														
 
															-
														
 
															-    await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
														
 
															-
														
 
															-    is_finished = np.random.random() < 0.2
														
 
															-    if is_finished:
														
 
															-      output = np.array([[self.eos_token_id]])
														
 
															-
														
 
															-    start_pos += input_data.shape[1] + output_length
														
 
															-    new_state = json.dumps({"start_pos": start_pos})
														
 
															-
														
 
															-    return output, new_state, is_finished
														
 
															+    sequence_length = input_data.shape[0 if self.shard.is_first_layer() else 1]
														
 
															+    output = np.random.random(size=(1, sequence_length, self.vocab_size if self.shard.is_last_layer() else self.hidden_size))
														
 
															+    return output
														
 
															   async def ensure_shard(self, shard: Shard):
														
 
															     if self.shard == shard:
														
--- a/exo/inference/inference_engine.py
+++ b/exo/inference/inference_engine.py
@@ -9,12 +9,25 @@ from .shard import Shard
 
															 class InferenceEngine(ABC):
														
 
															   @abstractmethod
														
 
															-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
														
 
															+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
														
 
															+    pass
														
 
															+  
														
 
															+  @abstractmethod
														
 
															+  async def sample(self, x: np.ndarray) -> np.ndarray:
														
 
															+    pass
														
 
															+
														
 
															+  @abstractmethod
														
 
															+  async def decode(self, shard: Shard, tokens: np.ndarray) -> str:
														
 
															     pass
														
 
															   @abstractmethod
														
 
															-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
														
 
															+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> np.ndarray:
														
 
															     pass
														
 
															+  
														
 
															+  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, inference_state: Optional[str] = None) -> np.ndarray:
														
 
															+    tokens = await self.encode(shard, prompt)
														
 
															+    output_data = await self.infer_tensor(request_id, shard, tokens, inference_state)
														
 
															+    return output_data 
														
 
															 def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDownloader'):
														
@@ -33,4 +46,4 @@ def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDow
 
															   elif inference_engine_name == "dummy":
														
 
															     from exo.inference.dummy_inference_engine import DummyInferenceEngine
														
 
															     return DummyInferenceEngine()
														
 
															-  raise ValueError(f"Unsupported inference engine: {inference_engine_name}")
														
 
															+  raise ValueError(f"Unsupported inference engine: {inference_engine_name}")
														
--- a/exo/inference/mlx/sharded_inference_engine.py
+++ b/exo/inference/mlx/sharded_inference_engine.py
@@ -1,15 +1,35 @@
 
															 import numpy as np
														
 
															 import mlx.core as mx
														
 
															+import mlx.nn as nn
														
 
															 from ..inference_engine import InferenceEngine
														
 
															-from .sharded_model import StatefulShardedModel
														
 
															+from .stateful_model import StatefulModel
														
 
															 from .sharded_utils import load_shard, get_image_from_str
														
 
															 from ..shard import Shard
														
 
															-from typing import Optional
														
 
															+from typing import Dict, Optional, Tuple
														
 
															 from exo.download.shard_download import ShardDownloader
														
 
															 import asyncio
														
 
															 from concurrent.futures import ThreadPoolExecutor
														
 
															 from functools import partial
														
 
															+def sample_logits(
														
 
															+  logits: mx.array,
														
 
															+  temp: float = 0.0,
														
 
															+  top_p: float = 1.0,
														
 
															+  logit_bias: Optional[Dict[int, float]] = None
														
 
															+) -> Tuple[mx.array, float]:
														
 
															+  if logit_bias:
														
 
															+    indices = mx.array(list(logit_bias.keys()))
														
 
															+    values = mx.array(list(logit_bias.values()))
														
 
															+    logits[:, indices] += values
														
 
															+  if temp == 0:
														
 
															+    token = mx.argmax(logits, axis=-1)
														
 
															+  else:
														
 
															+    if top_p > 0 and top_p < 1.0:
														
 
															+      token = top_p_sampling(logits, top_p, temp)
														
 
															+    else:
														
 
															+      token = mx.random.categorical(logits*(1/temp))
														
 
															+
														
 
															+  return token
														
 
															 class MLXDynamicShardInferenceEngine(InferenceEngine):
														
 
															   def __init__(self, shard_downloader: ShardDownloader):
														
@@ -17,25 +37,26 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
 
															     self.shard_downloader = shard_downloader
														
 
															     self.executor = ThreadPoolExecutor(max_workers=1)
														
 
															-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
														
 
															+  async def sample(self, x, temp: float = 0.0, top_p: float = 1.0) -> np.ndarray:
														
 
															+    y = mx.array(x)
														
 
															+    logits = y[:, -1, :]
														
 
															+    out = np.array(sample_logits(logits, temp=temp, top_p=top_p))
														
 
															+    return out
														
 
															+
														
 
															+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
														
 
															     await self.ensure_shard(shard)
														
 
															-    loop = asyncio.get_running_loop()
														
 
															-    if image_str:
														
 
															-      image = await get_image_from_str(image_str)
														
 
															-      tokenize = partial(self.tokenizer, prompt, image, return_tensors="np")
														
 
															-      inputs = await loop.run_in_executor(self.executor, tokenize)
														
 
															-      pixel_values = mx.array(inputs["pixel_values"])
														
 
															-      input_ids = mx.array(inputs["input_ids"])
														
 
															-      output_data: np.ndarray = np.array(await loop.run_in_executor(self.executor, self.stateful_sharded_model.step, request_id, input_ids, pixel_values))
														
 
															-    else:
														
 
															-      input_ids = mx.array(await loop.run_in_executor(self.executor, self.tokenizer.encode, prompt))
														
 
															-      output_data: np.ndarray = np.array(await loop.run_in_executor(self.executor, self.stateful_sharded_model.step, request_id, input_ids))
														
 
															-    return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id
														
 
															+    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
														
 
															+    return np.array(tokens)
														
 
															-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
														
 
															+  async def decode(self, shard: Shard, tokens) -> str:
														
 
															+    await self.ensure_shard(shard)
														
 
															+    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
														
 
															+    return tokens
														
 
															+    
														
 
															+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> np.ndarray:
														
 
															     await self.ensure_shard(shard)
														
 
															-    output_data: np.ndarray = np.array(await asyncio.get_running_loop().run_in_executor(self.executor, self.stateful_sharded_model.step, request_id, mx.array(input_data)))
														
 
															-    return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id
														
 
															+    output_data: np.ndarray = np.array(await asyncio.get_running_loop().run_in_executor(self.executor, self.model, mx.array(input_data), request_id))
														
 
															+    return output_data
														
 
															   async def ensure_shard(self, shard: Shard):
														
 
															     if self.shard == shard:
														
@@ -50,5 +71,5 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
 
															         return asyncio.run(load_shard(model_path, shard))
														
 
															       model_shard, self.tokenizer = await loop.run_in_executor(self.executor, load_shard_wrapper)
														
 
															-      self.stateful_sharded_model = await loop.run_in_executor(self.executor, StatefulShardedModel, shard, model_shard)
														
 
															       self.shard = shard
														
 
															+      self.model = await loop.run_in_executor(self.executor, StatefulModel, model_shard) 
														
--- a/exo/inference/mlx/sharded_model.py
+++ b/exo/inference/mlx/sharded_model.py
@@ -1,89 +0,0 @@
 
															-from typing import Dict, Generator, Optional, Tuple
														
 
															-from collections import OrderedDict
														
 
															-
														
 
															-import mlx.core as mx
														
 
															-import mlx.nn as nn
														
 
															-from mlx_lm.models.cache import make_prompt_cache
														
 
															-from mlx_lm.sample_utils import top_p_sampling
														
 
															-
														
 
															-from ..shard import Shard
														
 
															-
														
 
															-
														
 
															-# TODO: support a speculative model so we can parallelise compute across devices
														
 
															-class StatefulShardedModel:
														
 
															-  def __init__(self, shard: Shard, model: nn.Module, max_kv_size: int = 1024, max_caches: int = 2):
														
 
															-    self.shard = shard
														
 
															-    self.model = model
														
 
															-    self.max_kv_size = max_kv_size
														
 
															-    self.max_caches = max_caches
														
 
															-    self.caches = OrderedDict()
														
 
															-
														
 
															-  def step(
														
 
															-    self,
														
 
															-    request_id: str,
														
 
															-    x,
														
 
															-    pixel_values=None,
														
 
															-    temp: float = 0.0,
														
 
															-    top_p: float = 1.0,
														
 
															-    logit_bias: Optional[Dict[int, float]] = None,
														
 
															-  ) -> Generator[Tuple[mx.array, mx.array], None, None]:
														
 
															-    def sample(logits: mx.array) -> Tuple[mx.array, float]:
														
 
															-      if logit_bias:
														
 
															-        indices = mx.array(list(logit_bias.keys()))
														
 
															-        values = mx.array(list(logit_bias.values()))
														
 
															-        logits[:, indices] += values
														
 
															-
														
 
															-      if temp == 0:
														
 
															-        token = mx.argmax(logits, axis=-1)
														
 
															-      else:
														
 
															-        if top_p > 0 and top_p < 1.0:
														
 
															-          token = top_p_sampling(logits, top_p, temp)
														
 
															-        else:
														
 
															-          token = mx.random.categorical(logits*(1/temp))
														
 
															-
														
 
															-      return token
														
 
															-
														
 
															-    y = x
														
 
															-
														
 
															-    if request_id not in self.caches:
														
 
															-      self.init_cache(request_id)
														
 
															-    else:
														
 
															-      self.caches.move_to_end(request_id)
														
 
															-
														
 
															-    cache = self.caches[request_id]
														
 
															-
														
 
															-    if pixel_values is None:
														
 
															-      output = self.model(y[None] if self.shard.is_first_layer() else y, cache=cache)
														
 
															-    else:
														
 
															-      output = self.model(y, pixel_values=pixel_values, cache=cache)
														
 
															-
														
 
															-    if self.shard.is_last_layer():
														
 
															-      logits = output[:, -1, :]
														
 
															-      y = sample(logits)
														
 
															-      return y
														
 
															-    else:
														
 
															-      return output
														
 
															-
														
 
															-  def __call__(
														
 
															-    self,
														
 
															-    request_id: str,
														
 
															-    x,
														
 
															-    temp: float = 0.0,
														
 
															-    top_p: float = 1.0,
														
 
															-    logit_bias: Optional[Dict[int, float]] = None,
														
 
															-  ) -> Generator[Tuple[mx.array, mx.array], None, None]:
														
 
															-    return self.step(request_id, x, temp=temp, top_p=top_p, logit_bias=logit_bias)
														
 
															-
														
 
															-  def init_cache(self, request_id: str):
														
 
															-    kv_heads = ([self.model.n_kv_heads]*len(self.model.layers) if isinstance(self.model.n_kv_heads, int) else self.model.n_kv_heads)
														
 
															-    # if self.max_kv_size is not None:
														
 
															-      # cache = [RotatingKVCache(self.model.head_dim, n, max_size=self.max_kv_size, keep=4) for n in kv_heads]
														
 
															-      # cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
														
 
															-    # else:
														
 
															-      # cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
														
 
															-    cache = make_prompt_cache(self.model)
														
 
															-
														
 
															-    if len(self.caches) >= self.max_caches:
														
 
															-      self.caches.popitem(last=False)
														
 
															-
														
 
															-    self.caches[request_id] = cache
														
--- a/exo/inference/mlx/sharded_utils.py
+++ b/exo/inference/mlx/sharded_utils.py
@@ -68,7 +68,6 @@ def load_config(model_path: Path) -> dict:
 
															     raise
														
 
															   return config
														
 
															-
														
 
															 def load_model_shard(
														
 
															   model_path: Path,
														
 
															   shard: Shard,
														
@@ -131,8 +130,17 @@ def load_model_shard(
 
															   model_class, model_args_class = _get_classes(config=config)
														
 
															+  class ShardedModel(model_class):
														
 
															+    def __init__(self, args):
														
 
															+      super().__init__(args)
														
 
															+      self.shard = Shard(args.shard.model_id, args.shard.start_layer, args.shard.end_layer, args.shard.n_layers)
														
 
															+
														
 
															+    def __call__(self, x, *args, **kwargs):
														
 
															+      y = super().__call__(x[None] if self.shard.is_first_layer() else x, *args, **kwargs)
														
 
															+      return y
														
 
															+
														
 
															   model_args = model_args_class.from_dict(config)
														
 
															-  model = model_class(model_args)
														
 
															+  model = ShardedModel(model_args)
														
 
															   if hasattr(model, "sanitize"):
														
 
															     weights = model.sanitize(weights)
														
@@ -158,7 +166,6 @@ def load_model_shard(
 
															   model.eval()
														
 
															   return model
														
 
															-
														
 
															 async def load_shard(
														
 
															   model_path: str,
														
 
															   shard: Shard,
														
--- a/exo/inference/mlx/stateful_model.py
+++ b/exo/inference/mlx/stateful_model.py
@@ -0,0 +1,42 @@
 
															+from typing import Dict, Tuple
														
 
															+from collections import OrderedDict
														
 
															+
														
 
															+import mlx.core as mx
														
 
															+import mlx.nn as nn
														
 
															+from mlx_lm.models.cache import make_prompt_cache
														
 
															+
														
 
															+from ..shard import Shard
														
 
															+
														
 
															+class StatefulModel(nn.Module):
														
 
															+  def __init__(self, model, max_kv_size: int = 1024, max_caches: int = 2):
														
 
															+    super().__init__()
														
 
															+    self.model = model
														
 
															+    self.max_kv_size = max_kv_size
														
 
															+    self.max_caches = max_caches
														
 
															+    self.caches = OrderedDict()
														
 
															+  
														
 
															+  def init_cache(self, request_id: str):
														
 
															+    kv_heads = ([self.model.n_kv_heads]*len(self.model.layers) if isinstance(self.model.n_kv_heads, int) else self.model.n_kv_heads)
														
 
															+    # if self.max_kv_size is not None:
														
 
															+      # cache = [RotatingKVCache(self.model.head_dim, n, max_size=self.max_kv_size, keep=4) for n in kv_heads]
														
 
															+      # cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
														
 
															+    # else:
														
 
															+      # cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
														
 
															+    cache = make_prompt_cache(self.model)
														
 
															+
														
 
															+    if len(self.caches) >= self.max_caches:
														
 
															+      self.caches.popitem(last=False)
														
 
															+
														
 
															+    self.caches[request_id] = cache
														
 
															+
														
 
															+  def __call__(self, x, request_id: str):
														
 
															+    if request_id not in self.caches:
														
 
															+      self.init_cache(request_id)
														
 
															+    else:
														
 
															+      self.caches.move_to_end(request_id)
														
 
															+
														
 
															+    cache = self.caches[request_id]
														
 
															+
														
 
															+    y = self.model(x, cache=cache)
														
 
															+    return y
														
 
															+    
														
--- a/exo/inference/mlx/test_sharded_llama.py
+++ b/exo/inference/mlx/test_sharded_llama.py
@@ -1,5 +1,5 @@
 
															 import mlx.core as mx
														
 
															-from exo.inference.mlx.sharded_model import StatefulShardedModel
														
 
															+from exo.inference.mlx.stateful_model import StatefulModel
														
 
															 from exo.inference.mlx.sharded_utils import load_shard
														
 
															 from exo.inference.shard import Shard
														
@@ -12,9 +12,9 @@ full_model_shard, full_tokenizer = load_shard("mlx-community/Meta-Llama-3-8B-Ins
 
															 model_shard1, tokenizer1 = load_shard("mlx-community/Meta-Llama-3-8B-Instruct-4bit", shard=shard1)
														
 
															 model_shard2, tokenizer2 = load_shard("mlx-community/Meta-Llama-3-8B-Instruct-4bit", shard=shard2)
														
 
															-full = StatefulShardedModel(shard_full, full_model_shard)
														
 
															-m1 = StatefulShardedModel(shard1, model_shard1)
														
 
															-m2 = StatefulShardedModel(shard2, model_shard2)
														
 
															+full = StatefulModel(shard_full, full_model_shard)
														
 
															+m1 = StatefulModel(shard1, model_shard1)
														
 
															+m2 = StatefulModel(shard2, model_shard2)
														
 
															 prompt = "write a beautiful haiku about a utopia where people own their AI with edge intelligence:"
														
 
															 prompt_tokens = mx.array(full_tokenizer.encode(prompt))
														
--- a/exo/inference/mlx/test_sharded_llava.py
+++ b/exo/inference/mlx/test_sharded_llava.py
@@ -7,7 +7,7 @@ from io import BytesIO
 
															 import mlx.core as mx
														
 
															 from mlx_lm.models.cache import KVCache
														
 
															-from exo.inference.mlx.sharded_model import StatefulShardedModel
														
 
															+from exo.inference.mlx.stateful_model import StatefulModel
														
 
															 from exo.inference.mlx.sharded_utils import load_shard
														
 
															 from exo.inference.shard import Shard
														
--- a/exo/inference/tinygrad/inference.py
+++ b/exo/inference/tinygrad/inference.py
@@ -1,7 +1,7 @@
 
															 from pathlib import Path
														
 
															 import json
														
 
															 import os
														
 
															-from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16
														
 
															+from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16, sample_logits
														
 
															 from exo.inference.shard import Shard
														
 
															 from exo.inference.tokenizers import resolve_tokenizer
														
 
															 from tinygrad.nn.state import load_state_dict
														
@@ -12,6 +12,7 @@ import numpy as np
 
															 from exo.inference.tinygrad.tinygrad_helpers import concat_weights, load
														
 
															 from exo.download.shard_download import ShardDownloader
														
 
															 from concurrent.futures import ThreadPoolExecutor
														
 
															+from .stateful_model import StatefulModel
														
 
															 import asyncio
														
 
															 Tensor.no_grad = True
														
@@ -58,44 +59,34 @@ def build_transformer(model_path: Path, shard: Shard, model_size="8B", device=No
 
															     load_state_dict(model, weights, strict=False, consume=False)  # consume=True
														
 
															   return model
														
 
															-
														
 
															 class TinygradDynamicShardInferenceEngine(InferenceEngine):
														
 
															   def __init__(self, shard_downloader: ShardDownloader):
														
 
															     self.shard = None
														
 
															     self.shard_downloader = shard_downloader
														
 
															     self.executor = ThreadPoolExecutor(max_workers=1)
														
 
															-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> tuple[np.ndarray, str, bool]:
														
 
															-    await self.ensure_shard(shard)
														
 
															-    start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
														
 
															-    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
														
 
															-
														
 
															-    toks = await asyncio.get_event_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
														
 
															-    h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor([toks]), start_pos, TEMPERATURE).realize())
														
 
															+  async def sample(self, x: np.ndarray, temp=TEMPERATURE, top_p: float = 0.0) -> np.ndarray:
														
 
															+    """Sample from the logits found at the last index along axis 1 of `x`.
														
 
															+
														
 
															+    The sampling call runs on this engine's executor and its realized result is
														
 
															+    returned as a numpy array.
														
 
															+    """
														
 
															+    logits = x[:, -1, :]
														
 
															+    def sample_wrapper():
														
 
															+      # NOTE(review): sample_logits is declared as (logits, temp, k, p, af, ap), so this call
														
 
															+      # passes k=0, p=0.8, af=top_p, ap=0.0 -- `top_p` lands in the `af` slot. Confirm this is intended.
														
 
															+      return sample_logits(Tensor(logits).flatten(), temp, 0, 0.8, top_p, 0.0).realize()
														
 
															+    out = await asyncio.get_running_loop().run_in_executor(self.executor, sample_wrapper)
														
 
															+    return out.numpy()
														
 
															-    if h.shape == (1,):
														
 
															-      start_pos += len(toks)
														
 
															-      start_pos += 1
														
 
															-      n_captured_toks = 0
														
 
															-      return np.array([[h.item()]]), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), h.item() == self.tokenizer.eos_token_id
														
 
															-    else:
														
 
															-      n_captured_toks = len(toks)
														
 
															-      return h.numpy(), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), False
														
 
															+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
														
 
															+    """Encode `prompt` with the tokenizer loaded for `shard` and return the result as a numpy array."""
														
 
															+    await self.ensure_shard(shard)
														
 
															+    loop = asyncio.get_running_loop()
														
 
															+    # The tokenizer call is dispatched to the engine's executor instead of running on the event loop.
														
 
															+    token_ids = await loop.run_in_executor(self.executor, self.tokenizer.encode, prompt)
														
 
															+    return np.array(token_ids)
														
 
															+  
														
 
															+  async def decode(self, shard: Shard, tokens) -> str:
														
 
															+    """Decode `tokens` with the tokenizer loaded for `shard` and return the tokenizer's output."""
														
 
															+    await self.ensure_shard(shard)
														
 
															+    loop = asyncio.get_running_loop()
														
 
															+    # The tokenizer call is dispatched to the engine's executor instead of running on the event loop.
														
 
															+    text = await loop.run_in_executor(self.executor, self.tokenizer.decode, tokens)
														
 
															+    return text
														
 
															-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> tuple[np.ndarray, str, bool]:
														
 
															+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> np.ndarray:
														
 
															+    """Run the model loaded for `shard` on `input_data` and return its output as a numpy array.
														
 
															+
														
 
															+    `inference_state` is an optional JSON string; its "start_pos" entry (default 0)
														
 
															+    is forwarded to the model. `request_id` is passed to the model as its third argument.
														
 
															+    """
														
 
															     await self.ensure_shard(shard)
														
 
															     start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
														
 
															-    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
														
 
															-
														
 
															-    h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), start_pos, TEMPERATURE).realize())
														
 
															-
														
 
															-    if h.shape == (1,):
														
 
															-      start_pos += n_captured_toks
														
 
															-      start_pos += 1
														
 
															-      n_captured_toks = 0
														
 
															-      return np.array([[h.item()]]), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), h.item() == self.tokenizer.eos_token_id
														
 
															-    else:
														
 
															-      return h.numpy(), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), False
														
 
															+    # The forward pass and its realize() run on the engine's executor.
														
 
															+    output_data = await asyncio.get_running_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), start_pos, request_id).realize())
														
 
															+    return output_data.numpy()
														
 
															   async def ensure_shard(self, shard: Shard):
														
 
															     if self.shard == shard:
														
@@ -104,9 +95,11 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
 
															     model_path = await self.shard_downloader.ensure_shard(shard)
														
 
															     if self.shard != shard:
														
 
															+      loop = asyncio.get_running_loop()
														
 
															       parameters = "1B" if "1b" in shard.model_id.lower() else "3B" if "3b" in shard.model_id.lower() else "8B" if "8b" in shard.model_id.lower() else "70B"
														
 
															-      self.model = await asyncio.get_event_loop().run_in_executor(self.executor, build_transformer, model_path, shard, parameters)
														
 
															+      model_shard = await loop.run_in_executor(self.executor, build_transformer, model_path, shard, parameters)
														
 
															       tokenizer_path = str((model_path if model_path.is_dir() else model_path.parent))
														
 
															       self.tokenizer = await resolve_tokenizer(tokenizer_path)
														
 
															       self.shard = shard
														
 
															+      self.model = await loop.run_in_executor(self.executor, StatefulModel, model_shard) 
														
--- a/exo/inference/tinygrad/models/llama.py
+++ b/exo/inference/tinygrad/models/llama.py
@@ -1,6 +1,7 @@
 
															-from typing import Tuple, Union, Optional, Dict, Any
														
 
															+from typing import Tuple, Union, Optional, Dict, Any, List
														
 
															 from tinygrad import Tensor, Variable, TinyJit, dtypes, nn, Device
														
 
															 from tinygrad.helpers import getenv
														
 
															+from collections import OrderedDict
														
 
															 # https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/llama/model.py#L47
														
@@ -47,7 +48,6 @@ def repeat_kv(x: Tensor, n_rep: int) -> Tensor:
 
															   # NOTE: this is different from x.repeat((1, 1, n_rep, 1))
														
 
															   return x.repeat((1, 1, 1, n_rep)).reshape(bs, seqlen, n_kv_heads*n_rep, head_dim)
														
 
															-
														
 
															 class Attention:
														
 
															   def __init__(self, dim, n_heads, n_kv_heads, max_context, linear=nn.Linear):
														
 
															     self.n_heads = n_heads
														
@@ -61,7 +61,7 @@ class Attention:
 
															     self.wv = linear(dim, self.n_kv_heads*self.head_dim, bias=False)
														
 
															     self.wo = linear(self.n_heads*self.head_dim, dim, bias=False)
														
 
															-  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor]) -> Tensor:
														
 
															+  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor], cache: Optional[Tensor]=None) -> Tensor:
														
 
															     if getenv("WQKV"):
														
 
															       if not hasattr(self, 'wqkv'): self.wqkv = Tensor.cat(self.wq.weight, self.wk.weight, self.wv.weight)
														
 
															       xqkv = x @ self.wqkv.T
														
@@ -76,19 +76,16 @@ class Attention:
 
															     xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
														
 
															     bsz, seqlen, _, _ = xq.shape
														
 
															-    # create kv cache
														
 
															-    if not hasattr(self, "cache_kv"):
														
 
															-      self.cache_kv = Tensor.zeros(2, bsz, self.max_context, self.n_kv_heads, self.head_dim, dtype=x.dtype).contiguous().realize()
														
 
															-      if isinstance(x.device, tuple):
														
 
															-        # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
														
 
															-        self.cache_kv.shard_((x.device), axis=3 if getenv("SHARD_KVCACHE") else None).realize()
														
 
															-
														
 
															-    # update the cache
														
 
															-    assert xk.dtype == xv.dtype == self.cache_kv.dtype, f"{xk.dtype=}, {xv.dtype=}, {self.cache_kv.dtype=}"
														
 
															-    self.cache_kv.shrink((None, None, (start_pos, start_pos + seqlen), None, None)).assign(Tensor.stack(xk, xv)).realize()
														
 
															+    if cache is not None:
														
 
															+      # update the cache
														
 
															+      assert xk.dtype == xv.dtype == cache.dtype, f"{xk.dtype=}, {xv.dtype=}, {cache.dtype=}"
														
 
															+      cache.shrink((None, None, (start_pos, start_pos + seqlen), None, None)).assign(Tensor.stack(xk, xv)).realize()
														
 
															-    keys = self.cache_kv[0].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xk
														
 
															-    values = self.cache_kv[1].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xv
														
 
															+      keys = cache[0].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xk
														
 
															+      values = cache[1].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xv
														
 
															+    else:
														
 
															+      keys = xk
														
 
															+      values = xv
														
 
															     keys, values = repeat_kv(keys, self.n_rep), repeat_kv(values, self.n_rep)
														
 
															     xq, keys, values = xq.transpose(1, 2), keys.transpose(1, 2), values.transpose(1, 2)
														
@@ -114,13 +111,13 @@ class TransformerBlock:
 
															     self.attention_norm = nn.RMSNorm(dim, norm_eps)
														
 
															     self.ffn_norm = nn.RMSNorm(dim, norm_eps)
														
 
															-  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor]):
														
 
															-    h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
														
 
															+  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor], cache: Optional[Tensor]=None):
														
 
															+    """Apply normalized attention and then the normalized feed-forward, each added back to its input.
														
 
															+
														
 
															+    `cache` is forwarded unchanged to the attention layer; the result is returned contiguous.
														
 
															+    """
														
 
															+    h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask, cache=cache)
														
 
															     return (h + self.feed_forward(self.ffn_norm(h))).contiguous()
														
 
															 # standard openai sampling
														
 
															-def sample(logits: Tensor, temp: float, k: int, p: float, af: float, ap: float):
														
 
															+def sample_logits(logits: Tensor, temp: float, k: int, p: float, af: float, ap: float):
														
 
															   assert logits.ndim == 1, "only works on 1d tensors"
														
 
															   assert 0 <= p <= 1, "p must be between 0 and 1"
														
 
															   assert 0 <= k <= logits.numel(), "k must be between 0 and numel"
														
@@ -189,7 +186,7 @@ class Transformer:
 
															     jit=True,
														
 
															     feed_forward=FeedForward,
														
 
															     rope_scaling: Optional[Dict[str, float]] = None,
														
 
															-    tie_word_embeddings=False
														
 
															+    tie_word_embeddings=False,
														
 
															   ):
														
 
															     self.layers = [TransformerBlock(dim, hidden_dim, n_heads, n_kv_heads, norm_eps, max_context, linear, feed_forward=feed_forward) for _ in range(n_layers)]
														
 
															     self.norm = nn.RMSNorm(dim, norm_eps)
														
@@ -202,31 +199,38 @@ class Transformer:
 
															     self.forward_jit = TinyJit(self.forward) if jit else None
														
 
															     self.shard = shard
														
 
															-  def forward(self, x: Tensor, start_pos: Union[Variable, int], temperature: float, top_k: int, top_p: float, alpha_f: float, alpha_p: float):
														
 
															+  def forward(self, x: Tensor, start_pos: Union[Variable, int], cache: Optional[List[Tensor]] = None):
														
 
															+    """Run the layers of this shard over `x`, which is used directly as the hidden state.
														
 
															+
														
 
															+    Returns the realized float output of `self.output(self.norm(h))` when the shard
														
 
															+    holds the last layer, otherwise the hidden state after the shard's layers.
														
 
															+    `cache` is an optional list paired element-wise with the shard's layers; each
														
 
															+    entry is handed to the matching layer. When omitted, every layer receives None.
														
 
															+    """
														
 
															     seqlen = x.shape[1]
														
 
															     freqs_cis = self.freqs_cis.shrink((None, (start_pos, start_pos + seqlen), None, None, None))
														
 
															     mask = Tensor.full((1, 1, seqlen, start_pos + seqlen), float("-100000000"), dtype=x.dtype, device=x.device).triu(start_pos + 1).realize() if seqlen > 1 else None
														
 
															-    if self.shard.is_first_layer():
														
 
															-      h = self.tok_embeddings(x)
														
 
															-    else:
														
 
															-      h = x
														
 
															+    h = x
														
 
															-    for i in range(self.shard.start_layer, self.shard.end_layer + 1):
														
 
															+    if cache is None:
														
 
															+      cache = [None for _ in range(self.shard.start_layer, self.shard.end_layer + 1)]  
														
 
															+    # NOTE(review): zip stops at the shorter iterable, so a `cache` list shorter than the
														
 
															+    # shard's layer range would silently skip the remaining layers.
														
 
															+    for i, c in zip(range(self.shard.start_layer, self.shard.end_layer + 1), cache):
														
 
															       layer = self.layers[i]
														
 
															-      h = layer(h, start_pos, freqs_cis, mask)
														
 
															+      h = layer(h, start_pos, freqs_cis, mask, cache=c)
														
 
															     if self.shard.is_last_layer():
														
 
															-      logits = self.output(self.norm(h)).float()[:, -1, :]
														
 
															-      return sample(logits.flatten(), temperature, top_k, top_p, alpha_f, alpha_p).realize()
														
 
															+      logits = self.output(self.norm(h)).float().realize()
														
 
															+      return logits
														
 
															     else:
														
 
															       return h
														
 
															-  def __call__(self, tokens: Tensor, start_pos: Variable, temperature: float = 0.0, top_k: int = 0, top_p: float = 0.8, alpha_f: float = 0.0, alpha_p: float = 0.0):
														
 
															+  def embed(self, inputs: Tensor):
														
 
															+    """Apply `tok_embeddings` when this shard holds the first layer; otherwise return `inputs` unchanged."""
														
 
															+    if not self.shard.is_first_layer():
														
 
															+      return inputs
														
 
															+    return self.tok_embeddings(inputs)
														
 
															+
														
 
															+  def __call__(self, tokens: Tensor, start_pos: Variable, cache: Optional[List[Tensor]] = None):
														
 
															+    """Embed `tokens` and run the shard's forward pass, optionally with per-layer caches.
														
 
															+
														
 
															+    Inputs whose first two dimensions are (1, 1) go through the jitted forward when
														
 
															+    one was built, with `start_pos` bound to a symbolic Variable; everything else
														
 
															+    goes through the plain forward.
														
 
															+    """
														
 
															     # TODO: better way to handle the first call v.s. the rest?
														
 
															+    # The parameter is named `tokens`; there is no `x` in this scope.
														
 
															+    h = self.embed(tokens)
														
 
															     if tokens.shape[0:2] == (1, 1) and self.forward_jit is not None:
														
 
															-      return self.forward_jit(tokens, Variable("start_pos", 0, self.max_context).bind(start_pos), temperature, top_k, top_p, alpha_f, alpha_p)
														
 
															-    return self.forward(tokens, start_pos, temperature, top_k, top_p, alpha_f, alpha_p)
														
 
															+      return self.forward_jit(h, Variable("start_pos", 0, self.max_context).bind(start_pos), cache=cache)
														
 
															+    return self.forward(h, start_pos, cache=cache)
														
 
															 # *** helpers ***
														
@@ -260,7 +264,10 @@ def convert_from_huggingface(weights: Dict[str, Tensor], model: Transformer, n_h
 
															         v = permute(v, n_heads)
														
 
															       elif "k_proj" in k:
														
 
															         v = permute(v, n_kv_heads)
														
 
															-    sd[keymap[k]] = v
														
 
															+    if k in keymap:
														
 
															+      sd[keymap[k]] = v
														
 
															+    else:
														
 
															+      sd[k] = v
														
 
															   return sd
														
--- a/exo/inference/tinygrad/stateful_model.py
+++ b/exo/inference/tinygrad/stateful_model.py
@@ -0,0 +1,34 @@
 
															+from tinygrad import Tensor, Variable 
														
 
															+from collections import OrderedDict
														
 
															+
														
 
															+def create_kv_cache(x: Tensor, max_context: int, n_kv_heads: int, head_dim: int):
														
 
															+  """Allocate a zero-filled, realized KV cache matching `x`.
														
 
															+
														
 
															+  The cache has shape (2, x.shape[0], max_context, n_kv_heads, head_dim) and x's dtype.
														
 
															+  When `x.device` is a tuple the cache is sharded over those devices, along axis 3
														
 
															+  if the SHARD_KVCACHE environment variable is set and with axis=None otherwise.
														
 
															+  """
														
 
															+  # Function-scope import: this module does not import getenv at the top level,
														
 
															+  # so the multi-device branch below would otherwise raise NameError.
														
 
															+  from tinygrad.helpers import getenv
														
 
															+  cache_kv = Tensor.zeros(2, x.shape[0], max_context, n_kv_heads, head_dim, dtype=x.dtype).contiguous().realize()
														
 
															+  if isinstance(x.device, tuple):
														
 
															+    # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
														
 
															+    cache_kv.shard_((x.device), axis=3 if getenv("SHARD_KVCACHE") else None).realize()
														
 
															+  return cache_kv.realize()
														
 
															+
														
 
															+class StatefulModel:
														
 
															+  """Pairs a model shard with per-request KV caches.
														
 
															+
														
 
															+  Caches live in an OrderedDict keyed by request id. A request that is seen again is
														
 
															+  moved to the end, and once `max_caches` entries exist the entry at the front
														
 
															+  (the least recently used one) is dropped before a new one is stored.
														
 
															+  """
														
 
															+
														
 
															+  def __init__(self, model, max_caches: int = 2):
														
 
															+    super().__init__()
														
 
															+    self.model = model
														
 
															+    self.max_caches = max_caches
														
 
															+    # request_id -> list of KV cache tensors, one per layer of the shard
														
 
															+    self.caches = OrderedDict()
														
 
															+
														
 
															+  def init_cache(self, x: Tensor, request_id: str):
														
 
															+    """Create one KV cache per layer of the shard and store the list under `request_id`."""
														
 
															+    shard = self.model.shard
														
 
															+    layer_caches = []
														
 
															+    for layer_idx in range(shard.start_layer, shard.end_layer + 1):
														
 
															+      attn = self.model.layers[layer_idx].attention
														
 
															+      layer_caches.append(create_kv_cache(x, attn.max_context, attn.n_kv_heads, attn.head_dim))
														
 
															+    # Evict the front entry once the limit has been reached.
														
 
															+    if len(self.caches) >= self.max_caches:
														
 
															+      self.caches.popitem(last=False)
														
 
															+
														
 
															+    self.caches[request_id] = layer_caches
														
 
															+
														
 
															+  def __call__(self, x: Tensor, start_pos: Variable, request_id: str):
														
 
															+    """Embed `x`, make sure `request_id` has a cache, and run the model's forward pass with it."""
														
 
															+    h = self.model.embed(x)
														
 
															+    if request_id in self.caches:
														
 
															+      self.caches.move_to_end(request_id)
														
 
															+    else:
														
 
															+      self.init_cache(h, request_id)
														
 
															+    request_cache = self.caches[request_id]
														
 
															+    # Inputs whose first two dimensions are (1, 1) use the jitted forward when one exists.
														
 
															+    if h.shape[0:2] == (1, 1) and self.model.forward_jit is not None:
														
 
															+      bound_pos = Variable("start_pos", 0, self.model.max_context).bind(start_pos)
														
 
															+      return self.model.forward_jit(h, bound_pos, cache=request_cache)
														
 
															+    return self.model.forward(h, start_pos, cache=request_cache)
														
 
															+
														
--- a/exo/main.py
+++ b/exo/main.py
@@ -189,7 +189,7 @@ async def run_model_cli(node: Node, inference_engine: InferenceEngine, model_nam
 
															   try:
														
 
															     print(f"Processing prompt: {prompt}")
														
 
															-    await node.process_prompt(shard, prompt, None, request_id=request_id)
														
 
															+    await node.process_prompt(shard, prompt, request_id=request_id)
														
 
															     _, tokens, _ = await callback.wait(lambda _request_id, tokens, is_finished: _request_id == request_id and is_finished, timeout=300)
														
@@ -238,4 +238,4 @@ def run():
 
															 if __name__ == "__main__":
														
 
															-  run()
														
 
															+  run()
														
--- a/exo/networking/grpc/grpc_peer_handle.py
+++ b/exo/networking/grpc/grpc_peer_handle.py
@@ -63,10 +63,9 @@ class GRPCPeerHandle(PeerHandle):
 
															         traceback.print_exc()
														
 
															       return False
														
 
															-  async def send_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
														
 
															+  async def send_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
														
 
															     request = node_service_pb2.PromptRequest(
														
 
															       prompt=prompt,
														
 
															-      image_str=image_str,
														
 
															       shard=node_service_pb2.Shard(
														
 
															         model_id=shard.model_id,
														
 
															         start_layer=shard.start_layer,
														
--- a/exo/networking/grpc/grpc_server.py
+++ b/exo/networking/grpc/grpc_server.py
@@ -49,10 +49,9 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
 
															       n_layers=request.shard.n_layers,
														
 
															     )
														
 
															     prompt = request.prompt
														
 
															-    image_str = request.image_str
														
 
															     request_id = request.request_id
														
 
															-    result = await self.node.process_prompt(shard, prompt, image_str, request_id)
														
 
															-    if DEBUG >= 5: print(f"SendPrompt {shard=} {prompt=} {image_str=} {request_id=} result: {result}")
														
 
															+    result = await self.node.process_prompt(shard, prompt, request_id)
														
 
															+    if DEBUG >= 5: print(f"SendPrompt {shard=} {prompt=} {request_id=} result: {result}")
														
 
															     tensor_data = result.tobytes() if result is not None else None
														
 
															     return node_service_pb2.Tensor(tensor_data=tensor_data, shape=result.shape, dtype=str(result.dtype)) if result is not None else node_service_pb2.Tensor()
														
--- a/exo/networking/grpc/node_service.proto
+++ b/exo/networking/grpc/node_service.proto
@@ -22,9 +22,8 @@ message Shard {
 
															 message PromptRequest {
														
 
															   Shard shard = 1;
														
 
															   string prompt = 2;
														
 
															-  optional string image_str = 3;
														
 
															-  optional string request_id = 4;
														
 
															-  optional string inference_state = 5;
														
 
															+  // NOTE(review): tags 3 and 4 previously carried image_str and request_id. Reusing them
														
 
															+  // means peers built from the old schema will misread these fields; consider keeping
														
 
															+  // the old tags and marking 3 as reserved instead.
														
 
															+  optional string request_id = 3;
														
 
															+  optional string inference_state = 4;
														
 
															 }
														
 
															 message TensorRequest {
														
@@ -93,4 +92,4 @@ message HealthCheckResponse {
 
															   bool is_healthy = 1;
														
 
															 }
														
 
															-message Empty {}
														
 
															+message Empty {}
														
--- a/exo/networking/grpc/node_service_pb2.py
+++ b/exo/networking/grpc/node_service_pb2.py
--- a/exo/networking/grpc/node_service_pb2_grpc.py
+++ b/exo/networking/grpc/node_service_pb2_grpc.py
@@ -12,298 +12,349 @@ SCHEDULED_RELEASE_DATE = 'June 25, 2024'
 
															 _version_not_supported = False
														
 
															 try:
														
 
															-  from grpc._utilities import first_version_is_lower
														
 
															-  _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
														
 
															+    from grpc._utilities import first_version_is_lower
														
 
															+    _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
														
 
															 except ImportError:
														
 
															-  _version_not_supported = True
														
 
															+    _version_not_supported = True
														
 
															 if _version_not_supported:
														
 
															-  warnings.warn(
														
 
															-    f'The grpc package installed is at version {GRPC_VERSION},' + f' but the generated code in node_service_pb2_grpc.py depends on' + f' grpcio>={GRPC_GENERATED_VERSION}.' +
														
 
															-    f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}' + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.' +
														
 
															-    f' This warning will become an error in {EXPECTED_ERROR_RELEASE},' + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.', RuntimeWarning
														
 
															-  )
														
 
															+    warnings.warn(
														
 
															+        f'The grpc package installed is at version {GRPC_VERSION},'
														
 
															+        + f' but the generated code in node_service_pb2_grpc.py depends on'
														
 
															+        + f' grpcio>={GRPC_GENERATED_VERSION}.'
														
 
															+        + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
														
 
															+        + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
														
 
															+        + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
														
 
															+        + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
														
 
															+        RuntimeWarning
														
 
															+    )
														
 
															 class NodeServiceStub(object):
														
 
															-  """Missing associated documentation comment in .proto file."""
														
 
															-  def __init__(self, channel):
														
 
															-    """Constructor.
														
 
															+    """Missing associated documentation comment in .proto file."""
														
 
															+
														
 
															+    def __init__(self, channel):
														
 
															+        """Constructor.
														
 
															         Args:
														
 
															             channel: A grpc.Channel.
														
 
															         """
														
 
															-    self.SendPrompt = channel.unary_unary(
														
 
															-      '/node_service.NodeService/SendPrompt',
														
 
															-      request_serializer=node__service__pb2.PromptRequest.SerializeToString,
														
 
															-      response_deserializer=node__service__pb2.Tensor.FromString,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															-    self.SendTensor = channel.unary_unary(
														
 
															-      '/node_service.NodeService/SendTensor',
														
 
															-      request_serializer=node__service__pb2.TensorRequest.SerializeToString,
														
 
															-      response_deserializer=node__service__pb2.Tensor.FromString,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															-    self.GetInferenceResult = channel.unary_unary(
														
 
															-      '/node_service.NodeService/GetInferenceResult',
														
 
															-      request_serializer=node__service__pb2.GetInferenceResultRequest.SerializeToString,
														
 
															-      response_deserializer=node__service__pb2.InferenceResult.FromString,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															-    self.CollectTopology = channel.unary_unary(
														
 
															-      '/node_service.NodeService/CollectTopology',
														
 
															-      request_serializer=node__service__pb2.CollectTopologyRequest.SerializeToString,
														
 
															-      response_deserializer=node__service__pb2.Topology.FromString,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															-    self.SendResult = channel.unary_unary(
														
 
															-      '/node_service.NodeService/SendResult',
														
 
															-      request_serializer=node__service__pb2.SendResultRequest.SerializeToString,
														
 
															-      response_deserializer=node__service__pb2.Empty.FromString,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															-    self.SendOpaqueStatus = channel.unary_unary(
														
 
															-      '/node_service.NodeService/SendOpaqueStatus',
														
 
															-      request_serializer=node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
														
 
															-      response_deserializer=node__service__pb2.Empty.FromString,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															-    self.HealthCheck = channel.unary_unary(
														
 
															-      '/node_service.NodeService/HealthCheck',
														
 
															-      request_serializer=node__service__pb2.HealthCheckRequest.SerializeToString,
														
 
															-      response_deserializer=node__service__pb2.HealthCheckResponse.FromString,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+        self.SendPrompt = channel.unary_unary(
														
 
															+                '/node_service.NodeService/SendPrompt',
														
 
															+                request_serializer=node__service__pb2.PromptRequest.SerializeToString,
														
 
															+                response_deserializer=node__service__pb2.Tensor.FromString,
														
 
															+                _registered_method=True)
														
 
															+        self.SendTensor = channel.unary_unary(
														
 
															+                '/node_service.NodeService/SendTensor',
														
 
															+                request_serializer=node__service__pb2.TensorRequest.SerializeToString,
														
 
															+                response_deserializer=node__service__pb2.Tensor.FromString,
														
 
															+                _registered_method=True)
														
 
															+        self.GetInferenceResult = channel.unary_unary(
														
 
															+                '/node_service.NodeService/GetInferenceResult',
														
 
															+                request_serializer=node__service__pb2.GetInferenceResultRequest.SerializeToString,
														
 
															+                response_deserializer=node__service__pb2.InferenceResult.FromString,
														
 
															+                _registered_method=True)
														
 
															+        self.CollectTopology = channel.unary_unary(
														
 
															+                '/node_service.NodeService/CollectTopology',
														
 
															+                request_serializer=node__service__pb2.CollectTopologyRequest.SerializeToString,
														
 
															+                response_deserializer=node__service__pb2.Topology.FromString,
														
 
															+                _registered_method=True)
														
 
															+        self.SendResult = channel.unary_unary(
														
 
															+                '/node_service.NodeService/SendResult',
														
 
															+                request_serializer=node__service__pb2.SendResultRequest.SerializeToString,
														
 
															+                response_deserializer=node__service__pb2.Empty.FromString,
														
 
															+                _registered_method=True)
														
 
															+        self.SendOpaqueStatus = channel.unary_unary(
														
 
															+                '/node_service.NodeService/SendOpaqueStatus',
														
 
															+                request_serializer=node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
														
 
															+                response_deserializer=node__service__pb2.Empty.FromString,
														
 
															+                _registered_method=True)
														
 
															+        self.HealthCheck = channel.unary_unary(
														
 
															+                '/node_service.NodeService/HealthCheck',
														
 
															+                request_serializer=node__service__pb2.HealthCheckRequest.SerializeToString,
														
 
															+                response_deserializer=node__service__pb2.HealthCheckResponse.FromString,
														
 
															+                _registered_method=True)
														
 
															 class NodeServiceServicer(object):
														
 
															-  """Missing associated documentation comment in .proto file."""
														
 
															-  def SendPrompt(self, request, context):
														
 
															     """Missing associated documentation comment in .proto file."""
														
 
															-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															-    context.set_details('Method not implemented!')
														
 
															-    raise NotImplementedError('Method not implemented!')
														
 
															-  def SendTensor(self, request, context):
														
 
															-    """Missing associated documentation comment in .proto file."""
														
 
															-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															-    context.set_details('Method not implemented!')
														
 
															-    raise NotImplementedError('Method not implemented!')
														
 
															+    def SendPrompt(self, request, context):
														
 
															+        """Missing associated documentation comment in .proto file."""
														
 
															+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															+        context.set_details('Method not implemented!')
														
 
															+        raise NotImplementedError('Method not implemented!')
														
 
															-  def GetInferenceResult(self, request, context):
														
 
															-    """Missing associated documentation comment in .proto file."""
														
 
															-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															-    context.set_details('Method not implemented!')
														
 
															-    raise NotImplementedError('Method not implemented!')
														
 
															+    def SendTensor(self, request, context):
														
 
															+        """Missing associated documentation comment in .proto file."""
														
 
															+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															+        context.set_details('Method not implemented!')
														
 
															+        raise NotImplementedError('Method not implemented!')
														
 
															-  def CollectTopology(self, request, context):
														
 
															-    """Missing associated documentation comment in .proto file."""
														
 
															-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															-    context.set_details('Method not implemented!')
														
 
															-    raise NotImplementedError('Method not implemented!')
														
 
															+    def GetInferenceResult(self, request, context):
														
 
															+        """Missing associated documentation comment in .proto file."""
														
 
															+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															+        context.set_details('Method not implemented!')
														
 
															+        raise NotImplementedError('Method not implemented!')
														
 
															-  def SendResult(self, request, context):
														
 
															-    """Missing associated documentation comment in .proto file."""
														
 
															-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															-    context.set_details('Method not implemented!')
														
 
															-    raise NotImplementedError('Method not implemented!')
														
 
															+    def CollectTopology(self, request, context):
														
 
															+        """Missing associated documentation comment in .proto file."""
														
 
															+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															+        context.set_details('Method not implemented!')
														
 
															+        raise NotImplementedError('Method not implemented!')
														
 
															-  def SendOpaqueStatus(self, request, context):
														
 
															-    """Missing associated documentation comment in .proto file."""
														
 
															-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															-    context.set_details('Method not implemented!')
														
 
															-    raise NotImplementedError('Method not implemented!')
														
 
															+    def SendResult(self, request, context):
														
 
															+        """Missing associated documentation comment in .proto file."""
														
 
															+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															+        context.set_details('Method not implemented!')
														
 
															+        raise NotImplementedError('Method not implemented!')
														
 
															-  def HealthCheck(self, request, context):
														
 
															-    """Missing associated documentation comment in .proto file."""
														
 
															-    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															-    context.set_details('Method not implemented!')
														
 
															-    raise NotImplementedError('Method not implemented!')
														
 
															+    def SendOpaqueStatus(self, request, context):
														
 
															+        """Missing associated documentation comment in .proto file."""
														
 
															+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															+        context.set_details('Method not implemented!')
														
 
															+        raise NotImplementedError('Method not implemented!')
														
 
															+
														
 
															+    def HealthCheck(self, request, context):
														
 
															+        """Missing associated documentation comment in .proto file."""
														
 
															+        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
														
 
															+        context.set_details('Method not implemented!')
														
 
															+        raise NotImplementedError('Method not implemented!')
														
 
															 def add_NodeServiceServicer_to_server(servicer, server):
														
 
															-  rpc_method_handlers = {
														
 
															-    'SendPrompt':
														
 
															-      grpc.unary_unary_rpc_method_handler(
														
 
															-        servicer.SendPrompt,
														
 
															-        request_deserializer=node__service__pb2.PromptRequest.FromString,
														
 
															-        response_serializer=node__service__pb2.Tensor.SerializeToString,
														
 
															-      ),
														
 
															-    'SendTensor':
														
 
															-      grpc.unary_unary_rpc_method_handler(
														
 
															-        servicer.SendTensor,
														
 
															-        request_deserializer=node__service__pb2.TensorRequest.FromString,
														
 
															-        response_serializer=node__service__pb2.Tensor.SerializeToString,
														
 
															-      ),
														
 
															-    'GetInferenceResult':
														
 
															-      grpc.unary_unary_rpc_method_handler(
														
 
															-        servicer.GetInferenceResult,
														
 
															-        request_deserializer=node__service__pb2.GetInferenceResultRequest.FromString,
														
 
															-        response_serializer=node__service__pb2.InferenceResult.SerializeToString,
														
 
															-      ),
														
 
															-    'CollectTopology':
														
 
															-      grpc.unary_unary_rpc_method_handler(
														
 
															-        servicer.CollectTopology,
														
 
															-        request_deserializer=node__service__pb2.CollectTopologyRequest.FromString,
														
 
															-        response_serializer=node__service__pb2.Topology.SerializeToString,
														
 
															-      ),
														
 
															-    'SendResult':
														
 
															-      grpc.unary_unary_rpc_method_handler(
														
 
															-        servicer.SendResult,
														
 
															-        request_deserializer=node__service__pb2.SendResultRequest.FromString,
														
 
															-        response_serializer=node__service__pb2.Empty.SerializeToString,
														
 
															-      ),
														
 
															-    'SendOpaqueStatus':
														
 
															-      grpc.unary_unary_rpc_method_handler(
														
 
															-        servicer.SendOpaqueStatus,
														
 
															-        request_deserializer=node__service__pb2.SendOpaqueStatusRequest.FromString,
														
 
															-        response_serializer=node__service__pb2.Empty.SerializeToString,
														
 
															-      ),
														
 
															-    'HealthCheck':
														
 
															-      grpc.unary_unary_rpc_method_handler(
														
 
															-        servicer.HealthCheck,
														
 
															-        request_deserializer=node__service__pb2.HealthCheckRequest.FromString,
														
 
															-        response_serializer=node__service__pb2.HealthCheckResponse.SerializeToString,
														
 
															-      ),
														
 
															-  }
														
 
															-  generic_handler = grpc.method_handlers_generic_handler('node_service.NodeService', rpc_method_handlers)
														
 
															-  server.add_generic_rpc_handlers((generic_handler,))
														
 
															-  server.add_registered_method_handlers('node_service.NodeService', rpc_method_handlers)
														
 
															+    rpc_method_handlers = {
														
 
															+            'SendPrompt': grpc.unary_unary_rpc_method_handler(
														
 
															+                    servicer.SendPrompt,
														
 
															+                    request_deserializer=node__service__pb2.PromptRequest.FromString,
														
 
															+                    response_serializer=node__service__pb2.Tensor.SerializeToString,
														
 
															+            ),
														
 
															+            'SendTensor': grpc.unary_unary_rpc_method_handler(
														
 
															+                    servicer.SendTensor,
														
 
															+                    request_deserializer=node__service__pb2.TensorRequest.FromString,
														
 
															+                    response_serializer=node__service__pb2.Tensor.SerializeToString,
														
 
															+            ),
														
 
															+            'GetInferenceResult': grpc.unary_unary_rpc_method_handler(
														
 
															+                    servicer.GetInferenceResult,
														
 
															+                    request_deserializer=node__service__pb2.GetInferenceResultRequest.FromString,
														
 
															+                    response_serializer=node__service__pb2.InferenceResult.SerializeToString,
														
 
															+            ),
														
 
															+            'CollectTopology': grpc.unary_unary_rpc_method_handler(
														
 
															+                    servicer.CollectTopology,
														
 
															+                    request_deserializer=node__service__pb2.CollectTopologyRequest.FromString,
														
 
															+                    response_serializer=node__service__pb2.Topology.SerializeToString,
														
 
															+            ),
														
 
															+            'SendResult': grpc.unary_unary_rpc_method_handler(
														
 
															+                    servicer.SendResult,
														
 
															+                    request_deserializer=node__service__pb2.SendResultRequest.FromString,
														
 
															+                    response_serializer=node__service__pb2.Empty.SerializeToString,
														
 
															+            ),
														
 
															+            'SendOpaqueStatus': grpc.unary_unary_rpc_method_handler(
														
 
															+                    servicer.SendOpaqueStatus,
														
 
															+                    request_deserializer=node__service__pb2.SendOpaqueStatusRequest.FromString,
														
 
															+                    response_serializer=node__service__pb2.Empty.SerializeToString,
														
 
															+            ),
														
 
															+            'HealthCheck': grpc.unary_unary_rpc_method_handler(
														
 
															+                    servicer.HealthCheck,
														
 
															+                    request_deserializer=node__service__pb2.HealthCheckRequest.FromString,
														
 
															+                    response_serializer=node__service__pb2.HealthCheckResponse.SerializeToString,
														
 
															+            ),
														
 
															+    }
														
 
															+    generic_handler = grpc.method_handlers_generic_handler(
														
 
															+            'node_service.NodeService', rpc_method_handlers)
														
 
															+    server.add_generic_rpc_handlers((generic_handler,))
														
 
															+    server.add_registered_method_handlers('node_service.NodeService', rpc_method_handlers)
														
 
															-# This class is part of an EXPERIMENTAL API.
														
 
															+ # This class is part of an EXPERIMENTAL API.
														
 
															 class NodeService(object):
														
 
															-  """Missing associated documentation comment in .proto file."""
														
 
															-  @staticmethod
														
 
															-  def SendPrompt(request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None):
														
 
															-    return grpc.experimental.unary_unary(
														
 
															-      request,
														
 
															-      target,
														
 
															-      '/node_service.NodeService/SendPrompt',
														
 
															-      node__service__pb2.PromptRequest.SerializeToString,
														
 
															-      node__service__pb2.Tensor.FromString,
														
 
															-      options,
														
 
															-      channel_credentials,
														
 
															-      insecure,
														
 
															-      call_credentials,
														
 
															-      compression,
														
 
															-      wait_for_ready,
														
 
															-      timeout,
														
 
															-      metadata,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+    """Missing associated documentation comment in .proto file."""
														
 
															-  @staticmethod
														
 
															-  def SendTensor(request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None):
														
 
															-    return grpc.experimental.unary_unary(
														
 
															-      request,
														
 
															-      target,
														
 
															-      '/node_service.NodeService/SendTensor',
														
 
															-      node__service__pb2.TensorRequest.SerializeToString,
														
 
															-      node__service__pb2.Tensor.FromString,
														
 
															-      options,
														
 
															-      channel_credentials,
														
 
															-      insecure,
														
 
															-      call_credentials,
														
 
															-      compression,
														
 
															-      wait_for_ready,
														
 
															-      timeout,
														
 
															-      metadata,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+    @staticmethod
														
 
															+    def SendPrompt(request,
														
 
															+            target,
														
 
															+            options=(),
														
 
															+            channel_credentials=None,
														
 
															+            call_credentials=None,
														
 
															+            insecure=False,
														
 
															+            compression=None,
														
 
															+            wait_for_ready=None,
														
 
															+            timeout=None,
														
 
															+            metadata=None):
														
 
															+        return grpc.experimental.unary_unary(
														
 
															+            request,
														
 
															+            target,
														
 
															+            '/node_service.NodeService/SendPrompt',
														
 
															+            node__service__pb2.PromptRequest.SerializeToString,
														
 
															+            node__service__pb2.Tensor.FromString,
														
 
															+            options,
														
 
															+            channel_credentials,
														
 
															+            insecure,
														
 
															+            call_credentials,
														
 
															+            compression,
														
 
															+            wait_for_ready,
														
 
															+            timeout,
														
 
															+            metadata,
														
 
															+            _registered_method=True)
														
 
															-  @staticmethod
														
 
															-  def GetInferenceResult(request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None):
														
 
															-    return grpc.experimental.unary_unary(
														
 
															-      request,
														
 
															-      target,
														
 
															-      '/node_service.NodeService/GetInferenceResult',
														
 
															-      node__service__pb2.GetInferenceResultRequest.SerializeToString,
														
 
															-      node__service__pb2.InferenceResult.FromString,
														
 
															-      options,
														
 
															-      channel_credentials,
														
 
															-      insecure,
														
 
															-      call_credentials,
														
 
															-      compression,
														
 
															-      wait_for_ready,
														
 
															-      timeout,
														
 
															-      metadata,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+    @staticmethod
														
 
															+    def SendTensor(request,
														
 
															+            target,
														
 
															+            options=(),
														
 
															+            channel_credentials=None,
														
 
															+            call_credentials=None,
														
 
															+            insecure=False,
														
 
															+            compression=None,
														
 
															+            wait_for_ready=None,
														
 
															+            timeout=None,
														
 
															+            metadata=None):
														
 
															+        return grpc.experimental.unary_unary(
														
 
															+            request,
														
 
															+            target,
														
 
															+            '/node_service.NodeService/SendTensor',
														
 
															+            node__service__pb2.TensorRequest.SerializeToString,
														
 
															+            node__service__pb2.Tensor.FromString,
														
 
															+            options,
														
 
															+            channel_credentials,
														
 
															+            insecure,
														
 
															+            call_credentials,
														
 
															+            compression,
														
 
															+            wait_for_ready,
														
 
															+            timeout,
														
 
															+            metadata,
														
 
															+            _registered_method=True)
														
 
															-  @staticmethod
														
 
															-  def CollectTopology(request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None):
														
 
															-    return grpc.experimental.unary_unary(
														
 
															-      request,
														
 
															-      target,
														
 
															-      '/node_service.NodeService/CollectTopology',
														
 
															-      node__service__pb2.CollectTopologyRequest.SerializeToString,
														
 
															-      node__service__pb2.Topology.FromString,
														
 
															-      options,
														
 
															-      channel_credentials,
														
 
															-      insecure,
														
 
															-      call_credentials,
														
 
															-      compression,
														
 
															-      wait_for_ready,
														
 
															-      timeout,
														
 
															-      metadata,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+    @staticmethod
														
 
															+    def GetInferenceResult(request,
														
 
															+            target,
														
 
															+            options=(),
														
 
															+            channel_credentials=None,
														
 
															+            call_credentials=None,
														
 
															+            insecure=False,
														
 
															+            compression=None,
														
 
															+            wait_for_ready=None,
														
 
															+            timeout=None,
														
 
															+            metadata=None):
														
 
															+        return grpc.experimental.unary_unary(
														
 
															+            request,
														
 
															+            target,
														
 
															+            '/node_service.NodeService/GetInferenceResult',
														
 
															+            node__service__pb2.GetInferenceResultRequest.SerializeToString,
														
 
															+            node__service__pb2.InferenceResult.FromString,
														
 
															+            options,
														
 
															+            channel_credentials,
														
 
															+            insecure,
														
 
															+            call_credentials,
														
 
															+            compression,
														
 
															+            wait_for_ready,
														
 
															+            timeout,
														
 
															+            metadata,
														
 
															+            _registered_method=True)
														
 
															-  @staticmethod
														
 
															-  def SendResult(request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None):
														
 
															-    return grpc.experimental.unary_unary(
														
 
															-      request,
														
 
															-      target,
														
 
															-      '/node_service.NodeService/SendResult',
														
 
															-      node__service__pb2.SendResultRequest.SerializeToString,
														
 
															-      node__service__pb2.Empty.FromString,
														
 
															-      options,
														
 
															-      channel_credentials,
														
 
															-      insecure,
														
 
															-      call_credentials,
														
 
															-      compression,
														
 
															-      wait_for_ready,
														
 
															-      timeout,
														
 
															-      metadata,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+    @staticmethod
														
 
															+    def CollectTopology(request,
														
 
															+            target,
														
 
															+            options=(),
														
 
															+            channel_credentials=None,
														
 
															+            call_credentials=None,
														
 
															+            insecure=False,
														
 
															+            compression=None,
														
 
															+            wait_for_ready=None,
														
 
															+            timeout=None,
														
 
															+            metadata=None):
														
 
															+        return grpc.experimental.unary_unary(
														
 
															+            request,
														
 
															+            target,
														
 
															+            '/node_service.NodeService/CollectTopology',
														
 
															+            node__service__pb2.CollectTopologyRequest.SerializeToString,
														
 
															+            node__service__pb2.Topology.FromString,
														
 
															+            options,
														
 
															+            channel_credentials,
														
 
															+            insecure,
														
 
															+            call_credentials,
														
 
															+            compression,
														
 
															+            wait_for_ready,
														
 
															+            timeout,
														
 
															+            metadata,
														
 
															+            _registered_method=True)
														
 
															-  @staticmethod
														
 
															-  def SendOpaqueStatus(request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None):
														
 
															-    return grpc.experimental.unary_unary(
														
 
															-      request,
														
 
															-      target,
														
 
															-      '/node_service.NodeService/SendOpaqueStatus',
														
 
															-      node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
														
 
															-      node__service__pb2.Empty.FromString,
														
 
															-      options,
														
 
															-      channel_credentials,
														
 
															-      insecure,
														
 
															-      call_credentials,
														
 
															-      compression,
														
 
															-      wait_for_ready,
														
 
															-      timeout,
														
 
															-      metadata,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+    @staticmethod
														
 
															+    def SendResult(request,
														
 
															+            target,
														
 
															+            options=(),
														
 
															+            channel_credentials=None,
														
 
															+            call_credentials=None,
														
 
															+            insecure=False,
														
 
															+            compression=None,
														
 
															+            wait_for_ready=None,
														
 
															+            timeout=None,
														
 
															+            metadata=None):
														
 
															+        return grpc.experimental.unary_unary(
														
 
															+            request,
														
 
															+            target,
														
 
															+            '/node_service.NodeService/SendResult',
														
 
															+            node__service__pb2.SendResultRequest.SerializeToString,
														
 
															+            node__service__pb2.Empty.FromString,
														
 
															+            options,
														
 
															+            channel_credentials,
														
 
															+            insecure,
														
 
															+            call_credentials,
														
 
															+            compression,
														
 
															+            wait_for_ready,
														
 
															+            timeout,
														
 
															+            metadata,
														
 
															+            _registered_method=True)
														
 
															-  @staticmethod
														
 
															-  def HealthCheck(request, target, options=(), channel_credentials=None, call_credentials=None, insecure=False, compression=None, wait_for_ready=None, timeout=None, metadata=None):
														
 
															-    return grpc.experimental.unary_unary(
														
 
															-      request,
														
 
															-      target,
														
 
															-      '/node_service.NodeService/HealthCheck',
														
 
															-      node__service__pb2.HealthCheckRequest.SerializeToString,
														
 
															-      node__service__pb2.HealthCheckResponse.FromString,
														
 
															-      options,
														
 
															-      channel_credentials,
														
 
															-      insecure,
														
 
															-      call_credentials,
														
 
															-      compression,
														
 
															-      wait_for_ready,
														
 
															-      timeout,
														
 
															-      metadata,
														
 
															-      _registered_method=True
														
 
															-    )
														
 
															+    @staticmethod
														
 
															+    def SendOpaqueStatus(request,
														
 
															+            target,
														
 
															+            options=(),
														
 
															+            channel_credentials=None,
														
 
															+            call_credentials=None,
														
 
															+            insecure=False,
														
 
															+            compression=None,
														
 
															+            wait_for_ready=None,
														
 
															+            timeout=None,
														
 
															+            metadata=None):
														
 
															+        return grpc.experimental.unary_unary(
														
 
															+            request,
														
 
															+            target,
														
 
															+            '/node_service.NodeService/SendOpaqueStatus',
														
 
															+            node__service__pb2.SendOpaqueStatusRequest.SerializeToString,
														
 
															+            node__service__pb2.Empty.FromString,
														
 
															+            options,
														
 
															+            channel_credentials,
														
 
															+            insecure,
														
 
															+            call_credentials,
														
 
															+            compression,
														
 
															+            wait_for_ready,
														
 
															+            timeout,
														
 
															+            metadata,
														
 
															+            _registered_method=True)
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def HealthCheck(request,
														
 
															+            target,
														
 
															+            options=(),
														
 
															+            channel_credentials=None,
														
 
															+            call_credentials=None,
														
 
															+            insecure=False,
														
 
															+            compression=None,
														
 
															+            wait_for_ready=None,
														
 
															+            timeout=None,
														
 
															+            metadata=None):
														
 
															+        return grpc.experimental.unary_unary(
														
 
															+            request,
														
 
															+            target,
														
 
															+            '/node_service.NodeService/HealthCheck',
														
 
															+            node__service__pb2.HealthCheckRequest.SerializeToString,
														
 
															+            node__service__pb2.HealthCheckResponse.FromString,
														
 
															+            options,
														
 
															+            channel_credentials,
														
 
															+            insecure,
														
 
															+            call_credentials,
														
 
															+            compression,
														
 
															+            wait_for_ready,
														
 
															+            timeout,
														
 
															+            metadata,
														
 
															+            _registered_method=True)
														
--- a/exo/networking/peer_handle.py
+++ b/exo/networking/peer_handle.py
@@ -36,7 +36,7 @@ class PeerHandle(ABC):
 
															     pass
														
 
															   @abstractmethod
														
 
															-  async def send_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
														
 
															+  async def send_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
														
 
															     pass
														
 
															   @abstractmethod
														
--- a/exo/orchestration/node.py
+++ b/exo/orchestration/node.py
@@ -16,7 +16,7 @@ class Node(ABC):
 
															     pass
														
 
															   @abstractmethod
														
 
															-  async def process_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
														
 
															+  async def process_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
														
 
															     pass
														
 
															   @abstractmethod
														
--- a/exo/orchestration/standard_node.py
+++ b/exo/orchestration/standard_node.py
@@ -18,7 +18,6 @@ from exo.download.hf.hf_helpers import RepoProgressEvent
 
															 from exo.inference.inference_engine import get_inference_engine, InferenceEngine
														
 
															 from exo.download.hf.hf_shard_download import HFShardDownloader
														
 
															-
														
 
															 class StandardNode(Node):
														
 
															   def __init__(
														
 
															     self,
														
@@ -40,6 +39,8 @@ class StandardNode(Node):
 
															     self.topology: Topology = Topology()
														
 
															     self.device_capabilities = device_capabilities()
														
 
															     self.buffered_token_output: Dict[str, Tuple[List[int], bool]] = {}
														
 
															+    self.buffered_logits: Dict[str, List[np.ndarray]] = {}
														
 
															+    self.buffered_inputs: Dict[str, List[np.ndarray]] = {}
														
 
															     self.max_generate_tokens = max_generate_tokens
														
 
															     self.topology_viz = topology_viz
														
 
															     self._on_token = AsyncCallbackSystem[str, Tuple[str, List[int], bool]]()
														
@@ -100,8 +101,55 @@ class StandardNode(Node):
 
															   def get_topology_inference_engines(self) -> List[List[str]]:
														
 
															     return self.topology_inference_engines_pool
														
 
															+  
														
 
															+  async def encode_prompt(self, shard: Shard, prompt):
														
 
															+    toks = await self.inference_engine.encode(shard, prompt)
														
 
															+    return toks
														
 
															+  
														
 
															+  async def process_result(
														
 
															+    self,
														
 
															+    shard,
														
 
															+    result: np.ndarray,
														
 
															+    request_id: Optional[str] = None,
														
 
															+    inference_state: Optional[str] = None,
														
 
															+  ):
														
 
															+    if request_id not in self.buffered_token_output:
														
 
															+      self.buffered_token_output[request_id] = ([], False)
														
 
															+    
														
 
															+    if request_id not in self.buffered_logits:
														
 
															+      self.buffered_logits[request_id] = []
														
 
															+
														
 
															+    self.buffered_logits[request_id] += [i for i in np.reshape(result, (-1, 1, result.shape[-1]))]
														
 
															+
														
 
															+    if shard.is_last_layer():
														
 
															+      result = await self.inference_engine.sample(result)
														
 
															+      inference_state = json.dumps({"start_pos": len(self.buffered_logits[request_id]) + 1})
														
 
															+    
														
 
															+    await self.inference_engine.ensure_shard(shard)
														
 
															+    is_finished = result.size == 1 and result.item() == self.inference_engine.tokenizer.eos_token_id or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
														
 
															+
														
 
															+    asyncio.create_task(self.broadcast_result(request_id, self.buffered_token_output[request_id][0], is_finished))  # TODO: this is n^2 communication complexity
														
 
															+
														
 
															+    if result.size == 1:  # we got a new token out
														
 
															+      self.buffered_token_output[request_id][0].append(result.item())
														
 
															+      self.trigger_on_token_callbacks(request_id, self.buffered_token_output[request_id][0], is_finished)
														
 
															+    
														
 
															+    if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
														
 
															-  async def process_prompt(self, base_shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
														
 
															+    if is_finished:
														
 
															+      self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
														
 
															+    else:
														
 
															+      asyncio.create_task(self.forward_to_next_shard(shard, result, request_id, inference_state=inference_state))
														
 
															+
														
 
															+    return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
														
 
															+
														
 
															+  async def process_prompt(
														
 
															+    self,
														
 
															+    base_shard: Shard,
														
 
															+    prompt: str,
														
 
															+    request_id: Optional[str] = None,
														
 
															+    inference_state: Optional[str] = None
														
 
															+  ) -> Optional[np.ndarray]:
														
 
															     shard = self.get_current_shard(base_shard)
														
 
															     asyncio.create_task(
														
 
															       self.broadcast_opaque_status(
														
@@ -113,14 +161,13 @@ class StandardNode(Node):
 
															           "base_shard": base_shard.to_dict(),
														
 
															           "shard": shard.to_dict(),
														
 
															           "prompt": prompt,
														
 
															-          "image_str": image_str,
														
 
															           "inference_state": inference_state,
														
 
															           "request_id": request_id,
														
 
															         }),
														
 
															       )
														
 
															     )
														
 
															     start_time = time.perf_counter_ns()
														
 
															-    resp = await self._process_prompt(base_shard, prompt, image_str, request_id, inference_state)
														
 
															+    resp = await self._process_prompt(base_shard, prompt, request_id, inference_state)
														
 
															     end_time = time.perf_counter_ns()
														
 
															     elapsed_time_ns = end_time - start_time
														
 
															     asyncio.create_task(
														
@@ -133,7 +180,6 @@ class StandardNode(Node):
 
															           "base_shard": base_shard.to_dict(),
														
 
															           "shard": shard.to_dict(),
														
 
															           "prompt": prompt,
														
 
															-          "image_str": image_str,
														
 
															           "inference_state": inference_state,
														
 
															           "request_id": request_id,
														
 
															           "elapsed_time_ns": elapsed_time_ns,
														
@@ -143,35 +189,20 @@ class StandardNode(Node):
 
															     )
														
 
															     return resp
														
 
															-  async def _process_prompt(self, base_shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
														
 
															+  async def _process_prompt(self, base_shard: Shard, prompt: str, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
														
 
															     if request_id is None:
														
 
															       request_id = str(uuid.uuid4())
														
 
															-    if request_id not in self.buffered_token_output:
														
 
															-      self.buffered_token_output[request_id] = ([], False)
														
 
															     shard = self.get_current_shard(base_shard)
														
 
															-    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=} {image_str=}")
														
 
															+    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=}")
														
 
															     if shard.start_layer != 0:
														
 
															-      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=} {image_str=}")
														
 
															-      await self.forward_to_next_shard(shard, prompt, request_id, image_str=image_str, inference_state=inference_state)
														
 
															-      return
														
 
															-
														
 
															-    result, inference_state, is_finished = await self.inference_engine.infer_prompt(request_id, shard, prompt, image_str, inference_state=inference_state)
														
 
															-    is_finished = is_finished or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
														
 
															-    if is_finished:
														
 
															-      self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
														
 
															-    asyncio.create_task(self.broadcast_result(request_id, self.buffered_token_output[request_id][0], is_finished))  # TODO: this is n^2 communication complexity
														
 
															-
														
 
															-    if result.size == 1:
														
 
															-      self.buffered_token_output[request_id][0].append(result.item())
														
 
															-      self.trigger_on_token_callbacks(request_id, self.buffered_token_output[request_id][0], is_finished)
														
 
															-
														
 
															-    if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
														
 
															-
														
 
															-    if not is_finished:
														
 
															-      asyncio.create_task(self.forward_to_next_shard(shard, result, request_id, image_str=image_str, inference_state=inference_state))
														
 
															-
														
 
															-    return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
														
 
															+      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=}")
														
 
															+      await self.forward_to_next_shard(shard, prompt, request_id, inference_state=inference_state)
														
 
															+      return None
														
 
															+    else:
														
 
															+      result = await self.inference_engine.infer_prompt(request_id, shard, prompt, inference_state=inference_state)
														
 
															+      ret = await self.process_result(shard, result, request_id, inference_state=inference_state) 
														
 
															+      return result
														
 
															   async def process_tensor(
														
 
															     self,
														
@@ -227,27 +258,13 @@ class StandardNode(Node):
 
															   ) -> Optional[np.ndarray]:
														
 
															     if request_id is None:
														
 
															       request_id = str(uuid.uuid4())
														
 
															-    if request_id not in self.buffered_token_output:
														
 
															-      self.buffered_token_output[request_id] = ([], False)
														
 
															     shard = self.get_current_shard(base_shard)
														
 
															+    if DEBUG >= 1: print(f"[{request_id}] process_tensor: {tensor.size=} {tensor.shape=}")
														
 
															     try:
														
 
															-      if DEBUG >= 1: print(f"[{request_id}] process_tensor: {tensor.size=} {tensor.shape=}")
														
 
															-      result, inference_state, is_finished = await self.inference_engine.infer_tensor(request_id, shard, tensor, inference_state=inference_state)
														
 
															-      is_finished = is_finished or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
														
 
															-      if is_finished:
														
 
															-        self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
														
 
															-      asyncio.create_task(self.broadcast_result(request_id, self.buffered_token_output[request_id][0], is_finished))  # TODO: this is n^2 communication complexity
														
 
															-
														
 
															-      if result.size == 1:  # we got a new token out
														
 
															-        self.buffered_token_output[request_id][0].append(result.item())
														
 
															-        self.trigger_on_token_callbacks(request_id, self.buffered_token_output[request_id][0], is_finished)
														
 
															-      if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
														
 
															-
														
 
															-      if not is_finished:
														
 
															-        asyncio.create_task(self.forward_to_next_shard(shard, result, request_id, inference_state=inference_state))
														
 
															-
														
 
															-      return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
														
 
															+      result = await self.inference_engine.infer_tensor(request_id, shard, tensor, inference_state=inference_state)
														
 
															+      ret = await self.process_result(shard, result, request_id, inference_state=inference_state) 
														
 
															+      return ret
														
 
															     except Exception as e:
														
 
															       print(f"Error processing tensor for shard {shard}: {e}")
														
 
															       traceback.print_exc()
														
@@ -258,49 +275,48 @@ class StandardNode(Node):
 
															     base_shard: Shard,
														
 
															     tensor_or_prompt: Union[np.ndarray, str],
														
 
															     request_id: str,
														
 
															-    image_str: Optional[str] = None,
														
 
															     inference_state: Optional[str] = None,
														
 
															   ) -> None:
														
 
															     if not self.partitioning_strategy:
														
 
															       if DEBUG >= 1: print("No partitioning strategy found. Skipping forward.")
														
 
															       return
														
 
															-    shard = self.get_current_shard(base_shard)
														
 
															-    partitions = self.partitioning_strategy.partition(self.topology)
														
 
															-    shards = map_partitions_to_shards(self.partitioning_strategy.partition(self.topology), base_shard.n_layers, base_shard.model_id)
														
 
															-    current_partition_index = next((i for i, p in enumerate(partitions) if p.node_id == self.id), None)
														
 
															+    next_partition_index = self.get_partition_index(offset = 1)
														
 
															     if DEBUG >= 1: print(f"Current partition index: {current_partition_index}")
														
 
															-    if current_partition_index is not None:
														
 
															-      next_partition_index = (current_partition_index+1) % len(partitions)
														
 
															-      next_partition: Partition = partitions[next_partition_index]
														
 
															-      next_shard = shards[next_partition_index]
														
 
															+    if next_partition_index is not None:
														
 
															+      target_id = self.partitioning_strategy.partition(self.topology)[next_partition_index].node_id
														
 
															+      next_shard = self.get_current_shard(base_shard, next_partition_index)
														
 
															       if DEBUG >= 2: print(f"Computed next from: {shard}, {self.topology}. Next partition: {next_partition}")
														
 
															-
														
 
															-      if next_partition.node_id == self.id:
														
 
															-        if isinstance(tensor_or_prompt, np.ndarray):
														
 
															-          await self.process_tensor(shard, tensor_or_prompt, request_id, inference_state=inference_state)
														
 
															+      is_tensor = isinstance(tensor_or_prompt, np.ndarray)
														
 
															+      if target_id == self.id:
														
 
															+        if is_tensor:
														
 
															+          await self.process_tensor(next_shard, tensor_or_prompt, request_id, inference_state=inference_state)
														
 
															         else:
														
 
															-          await self.process_prompt(shard, tensor_or_prompt, image_str, request_id, inference_state=inference_state)
														
 
															-        return
														
 
															-
														
 
															-      target_peer = next((p for p in self.peers if p.id() == next_partition.node_id), None)
														
 
															-      if not target_peer:
														
 
															-        raise ValueError(f"Peer for {next_partition} not found")
														
 
															-
														
 
															-      if DEBUG >= 1: print(f"Sending tensor_or_prompt to {target_peer.id()}: {tensor_or_prompt}")
														
 
															-
														
 
															-      if isinstance(tensor_or_prompt, np.ndarray):
														
 
															-        await target_peer.send_tensor(next_shard, tensor_or_prompt, request_id=request_id, inference_state=inference_state)
														
 
															+          await self.process_prompt(next_shard, tensor_or_prompt, request_id, inference_state=inference_state)
														
 
															       else:
														
 
															-        await target_peer.send_prompt(next_shard, tensor_or_prompt, image_str=image_str, request_id=request_id, inference_state=inference_state)
														
 
															+        target_peer = next((p for p in self.peers if p.id() == target_id), None)
														
 
															+        if not target_peer:
														
 
															+          raise ValueError(f"Peer for {next_partition} not found")
														
 
															+        
														
 
															+        if is_tensor:
														
 
															+          if DEBUG >= 1: print(f"Sending tensor to {target_peer.id()}: {tensor_or_prompt}")
														
 
															+          await target_peer.send_tensor(next_shard, tensor_or_prompt, request_id=request_id, inference_state=inference_state)
														
 
															+        else:
														
 
															+          await target_peer.send_prompt(next_shard, tensor_or_prompt, request_id=request_id, inference_state=inference_state)
														
 
															-  def get_current_shard(self, base_shard: Shard) -> Shard:
														
 
															+  def get_partition_index(self, offset: int = 0):
														
 
															     partitions = self.partitioning_strategy.partition(self.topology)
														
 
															-    shards = map_partitions_to_shards(partitions, base_shard.n_layers, base_shard.model_id)
														
 
															     current_partition_index = next((i for i, p in enumerate(partitions) if p.node_id == self.id), None)
														
 
															     if current_partition_index is None:
														
 
															       raise ValueError(f"No current partition found for node: {self.id}")
														
 
															-    return shards[current_partition_index]
														
 
															+    return (current_partition_index + offset) % len(partitions)
														
 
															+
														
 
															+  def get_current_shard(self, base_shard: Shard, index: Optional[int] = None) -> Shard:
														
 
															+    if index is None:
														
 
															+      index = self.get_partition_index()
														
 
															+    partitions = self.partitioning_strategy.partition(self.topology)
														
 
															+    shards = map_partitions_to_shards(partitions, base_shard.n_layers, base_shard.model_id)
														
 
															+    return shards[index]
														
 
															   async def update_peers(self, wait_for_peers: int = 0) -> bool:
														
 
															     next_peers = await self.discovery.discover_peers(wait_for_peers)
														
@@ -428,7 +444,7 @@ class StandardNode(Node):
 
															   def trigger_on_token_callbacks(self, request_id: str, tokens: List[int], is_finished: bool) -> None:
														
 
															     if DEBUG >= 2: print(f"Triggering all on_token callbacks with {request_id=} num_tokens={len(tokens)} {is_finished=}")
														
 
															     self.on_token.trigger_all(request_id, tokens, is_finished)
														
 
															-
														
 
															+  
														
 
															   async def broadcast_result(self, request_id: str, result: List[int], is_finished: bool) -> None:
														
 
															     async def send_result_to_peer(peer):
														
 
															       try: