1 год назад · 563dcb56b0
--- a/example_user.py
+++ b/example_user.py
@@ -0,0 +1,75 @@
 
				+# In this example, a user is running a home cluster with 3 shards.
			
 
				+# They are prompting the cluster to generate a response to a question.
			
 
				+# The cluster is given the question, and the user is given the response.
			
 
				+
			
 
				+from inference.mlx.sharded_utils import get_model_path, load_tokenizer
			
 
				+from inference.shard import Shard
			
 
				+from networking.peer_handle import PeerHandle
			
 
				+from networking.grpc.grpc_peer_handle import GRPCPeerHandle
			
 
				+from typing import List
			
 
				+import asyncio
			
 
				+import argparse
			
 
				+
			
 
				+path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
			
 
				+model_path = get_model_path(path_or_hf_repo)
			
 
				+tokenizer_config = {}
			
 
				+tokenizer = load_tokenizer(model_path, tokenizer_config)
			
 
				+
			
 
				+peers: List[PeerHandle] = [
			
 
				+    GRPCPeerHandle(
			
 
				+        "node1",
			
 
				+        "localhost:8080",
			
 
				+    ),
			
 
				+    GRPCPeerHandle(
			
 
				+        "node2",
			
 
				+        "localhost:8081",
			
 
				+    )
			
 
				+]
			
 
				+shards: List[Shard] = [
			
 
				+    # Shard(model_id=path_or_hf_repo, start_layer=0, end_layer=15, n_layers=32),
			
 
				+    # Shard(model_id=path_or_hf_repo, start_layer=16, end_layer=31, n_layers=32),
			
 
				+    Shard(model_id=path_or_hf_repo, start_layer=0, end_layer=30, n_layers=32),
			
 
				+    Shard(model_id=path_or_hf_repo, start_layer=31, end_layer=31, n_layers=32),
			
 
				+]
			
 
				+
			
 
				+async def run_prompt(prompt: str):
			
 
				+    if tokenizer.chat_template is None:
			
 
				+        tokenizer.chat_template = tokenizer.default_chat_template
			
 
				+    if (
			
 
				+        hasattr(tokenizer, "apply_chat_template")
			
 
				+        and tokenizer.chat_template is not None
			
 
				+    ):
			
 
				+        messages = [{"role": "user", "content": prompt}]
			
 
				+        prompt = tokenizer.apply_chat_template(
			
 
				+            messages, tokenize=False, add_generation_prompt=True
			
 
				+        )
			
 
				+
			
 
				+    for peer, shard in zip(peers, shards):
			
 
				+        await peer.connect()
			
 
				+        await peer.reset_shard(shard)
			
 
				+
			
 
				+    tokens = []
			
 
				+    last_output = prompt
			
 
				+
			
 
				+    for _ in range(20):
			
 
				+        for peer, shard in zip(peers, shards):
			
 
				+            if isinstance(last_output, str):
			
 
				+                last_output = await peer.send_prompt(shard, last_output)
			
 
				+                print("prompt output:", last_output)
			
 
				+            else:
			
 
				+                last_output = await peer.send_tensor(shard, last_output)
			
 
				+                print("tensor output:", last_output)
			
 
				+
			
 
				+        if not last_output:
			
 
				+            break
			
 
				+
			
 
				+        tokens.append(last_output.item())
			
 
				+
			
 
				+    print(tokenizer.decode(tokens))
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    parser = argparse.ArgumentParser(description="Run prompt")
			
 
				+    parser.add_argument("--prompt", type=str, help="The prompt to run")
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    asyncio.run(run_prompt(args.prompt))
			
--- a/inference/inference_engine.py
+++ b/inference/inference_engine.py
@@ -9,23 +9,9 @@ class InferenceEngine(ABC):
 
				     async def infer_shard(self, shard: Shard, input_data: np.ndarray) -> np.ndarray:
			
 
				         pass
			
 
				 
			
 
				-    @abstractmethod
			
 
				-    async def reset_shard(self, shard: Shard):
			
 
				+    async def infer_prompt(self, shard: Shard, prompt: str) -> np.ndarray:
			
 
				         pass
			
 
				 
			
 
				-class MLXFixedShardInferenceEngine(InferenceEngine):
			
 
				-    def __init__(self, model: nn.Module, shard: Shard):
			
 
				-        self.model = model
			
 
				-        self.shard = shard
			
 
				-
			
 
				-    async def infer_shard(self, shard: Shard, input_data: np.ndarray) -> np.ndarray:
			
 
				-        if shard != self.shard:
			
 
				-            raise ValueError(f"Shard mismatch: {shard} != {self.shard}")
			
 
				-
			
 
				-        output_data = self.model.process(input_data)
			
 
				-        print("Processed data through model shard")
			
 
				-        return output_data
			
 
				-
			
 
				+    @abstractmethod
			
 
				     async def reset_shard(self, shard: Shard):
			
 
				-        # TODO
			
 
				-        print(f"Resetting shard: {shard}")
			
 
				+        pass
			
--- a/inference/mlx/models/sharded_llama.py
+++ b/inference/mlx/models/sharded_llama.py
@@ -0,0 +1,244 @@
 
				+from dataclasses import dataclass, field
			
 
				+from typing import Dict, Optional, Tuple, Union
			
 
				+
			
 
				+import mlx.core as mx
			
 
				+import mlx.nn as nn
			
 
				+
			
 
				+from mlx_lm.models.base import BaseModelArgs, create_additive_causal_mask
			
 
				+from ...shard import Shard
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class NormalModelArgs(BaseModelArgs):
			
 
				+    model_type: str
			
 
				+    hidden_size: int
			
 
				+    num_hidden_layers: int
			
 
				+    intermediate_size: int
			
 
				+    num_attention_heads: int
			
 
				+    rms_norm_eps: float
			
 
				+    vocab_size: int
			
 
				+    num_key_value_heads: int = None
			
 
				+    attention_bias: bool = False
			
 
				+    mlp_bias: bool = False
			
 
				+    rope_theta: float = 10000
			
 
				+    rope_traditional: bool = False
			
 
				+    rope_scaling: Optional[Dict[str, Union[float, str]]] = None
			
 
				+    tie_word_embeddings: bool = True
			
 
				+
			
 
				+    def __post_init__(self):
			
 
				+        if self.num_key_value_heads is None:
			
 
				+            self.num_key_value_heads = self.num_attention_heads
			
 
				+
			
 
				+        if self.rope_scaling:
			
 
				+            required_keys = {"factor", "type"}
			
 
				+            if not all(key in self.rope_scaling for key in required_keys):
			
 
				+                raise ValueError(f"rope_scaling must contain keys {required_keys}")
			
 
				+
			
 
				+            if self.rope_scaling["type"] != "linear":
			
 
				+                raise ValueError("rope_scaling 'type' currently only supports 'linear'")
			
 
				+@dataclass
			
 
				+class ModelArgs(NormalModelArgs):
			
 
				+    shard: Shard = field(default_factory=lambda: Shard("", 0, 0, 0))
			
 
				+
			
 
				+    def __post_init__(self):
			
 
				+        super().__post_init__()  # Ensure parent initializations are respected
			
 
				+
			
 
				+        if isinstance(self.shard, Shard):
			
 
				+            return
			
 
				+        if not isinstance(self.shard, dict):
			
 
				+            raise TypeError(f"Expected shard to be a Shard instance or a dict, got {type(self.shard)} instead")
			
 
				+
			
 
				+        self.shard = Shard(**self.shard)
			
 
				+
			
 
				+class Attention(nn.Module):
			
 
				+    def __init__(self, args: ModelArgs):
			
 
				+        super().__init__()
			
 
				+
			
 
				+        dim = args.hidden_size
			
 
				+        self.n_heads = n_heads = args.num_attention_heads
			
 
				+        self.n_kv_heads = n_kv_heads = args.num_key_value_heads
			
 
				+
			
 
				+        head_dim = args.hidden_size // n_heads
			
 
				+        self.scale = head_dim**-0.5
			
 
				+        if hasattr(args, "attention_bias"):
			
 
				+            attention_bias = args.attention_bias
			
 
				+        else:
			
 
				+            attention_bias = False
			
 
				+
			
 
				+        self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=attention_bias)
			
 
				+        self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
			
 
				+        self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
			
 
				+        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=attention_bias)
			
 
				+
			
 
				+        rope_scale = (
			
 
				+            1 / args.rope_scaling["factor"]
			
 
				+            if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
			
 
				+            else 1
			
 
				+        )
			
 
				+        self.rope = nn.RoPE(
			
 
				+            head_dim,
			
 
				+            traditional=args.rope_traditional,
			
 
				+            base=args.rope_theta,
			
 
				+            scale=rope_scale,
			
 
				+        )
			
 
				+
			
 
				+    def __call__(
			
 
				+        self,
			
 
				+        x: mx.array,
			
 
				+        mask: Optional[mx.array] = None,
			
 
				+        cache: Optional[Tuple[mx.array, mx.array]] = None,
			
 
				+    ) -> mx.array:
			
 
				+        B, L, D = x.shape
			
 
				+
			
 
				+        queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
			
 
				+
			
 
				+        # Prepare the queries, keys and values for the attention computation
			
 
				+        queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
			
 
				+        keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
			
 
				+        values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
			
 
				+
			
 
				+        if cache is not None:
			
 
				+            queries = self.rope(queries, offset=cache.offset)
			
 
				+            keys = self.rope(keys, offset=cache.offset)
			
 
				+            keys, values = cache.update_and_fetch(keys, values)
			
 
				+        else:
			
 
				+            queries = self.rope(queries)
			
 
				+            keys = self.rope(keys)
			
 
				+
			
 
				+        output = mx.fast.scaled_dot_product_attention(
			
 
				+            queries, keys, values, scale=self.scale, mask=mask
			
 
				+        )
			
 
				+        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
			
 
				+        return self.o_proj(output)
			
 
				+
			
 
				+
			
 
				+class MLP(nn.Module):
			
 
				+    def __init__(self, args: ModelArgs):
			
 
				+        super().__init__()
			
 
				+
			
 
				+        dim = args.hidden_size
			
 
				+        hidden_dim = args.intermediate_size
			
 
				+        if hasattr(args, "mlp_bias"):
			
 
				+            mlp_bias = args.mlp_bias
			
 
				+        else:
			
 
				+            mlp_bias = False
			
 
				+
			
 
				+        self.gate_proj = nn.Linear(dim, hidden_dim, bias=mlp_bias)
			
 
				+        self.down_proj = nn.Linear(hidden_dim, dim, bias=mlp_bias)
			
 
				+        self.up_proj = nn.Linear(dim, hidden_dim, bias=mlp_bias)
			
 
				+
			
 
				+    def __call__(self, x) -> mx.array:
			
 
				+        return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
			
 
				+
			
 
				+
			
 
				+class TransformerBlock(nn.Module):
			
 
				+    def __init__(self, args: ModelArgs):
			
 
				+        super().__init__()
			
 
				+        self.num_attention_heads = args.num_attention_heads
			
 
				+        self.hidden_size = args.hidden_size
			
 
				+        self.self_attn = Attention(args)
			
 
				+        self.mlp = MLP(args)
			
 
				+        self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
			
 
				+        self.post_attention_layernorm = nn.RMSNorm(
			
 
				+            args.hidden_size, eps=args.rms_norm_eps
			
 
				+        )
			
 
				+        self.args = args
			
 
				+
			
 
				+    def __call__(
			
 
				+        self,
			
 
				+        x: mx.array,
			
 
				+        mask: Optional[mx.array] = None,
			
 
				+        cache: Optional[Tuple[mx.array, mx.array]] = None,
			
 
				+    ) -> mx.array:
			
 
				+        r = self.self_attn(self.input_layernorm(x), mask, cache)
			
 
				+        h = x + r
			
 
				+        r = self.mlp(self.post_attention_layernorm(h))
			
 
				+        out = h + r
			
 
				+        return out
			
 
				+
			
 
				+
			
 
				+class LlamaModel(nn.Module):
			
 
				+    def __init__(self, args: ModelArgs):
			
 
				+        super().__init__()
			
 
				+        self.args = args
			
 
				+        self.vocab_size = args.vocab_size
			
 
				+        self.num_hidden_layers = args.num_hidden_layers
			
 
				+        assert self.vocab_size > 0
			
 
				+        self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
			
 
				+        self.layers = [
			
 
				+            TransformerBlock(args=args) for _ in range(args.shard.n_layers)
			
 
				+        ]
			
 
				+        self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
			
 
				+
			
 
				+    def __call__(
			
 
				+        self,
			
 
				+        inputs: mx.array,
			
 
				+        cache=None,
			
 
				+    ):
			
 
				+        if self.args.shard.is_first_layer():
			
 
				+            h = self.embed_tokens(inputs)
			
 
				+        else:
			
 
				+            h = inputs
			
 
				+
			
 
				+        mask = None
			
 
				+        if h.shape[1] > 1:
			
 
				+            mask = create_additive_causal_mask(
			
 
				+                h.shape[1], cache[0].offset if cache is not None else 0
			
 
				+            )
			
 
				+            mask = mask.astype(h.dtype)
			
 
				+
			
 
				+        if cache is None:
			
 
				+            cache = [None] * len(self.layers)
			
 
				+
			
 
				+        for layer, c in zip(self.layers, cache):
			
 
				+            h = layer(h, mask, cache=c)
			
 
				+
			
 
				+        if self.args.shard.is_last_layer():
			
 
				+            return self.norm(h)
			
 
				+        else:
			
 
				+            return h
			
 
				+
			
 
				+
			
 
				+class Model(nn.Module):
			
 
				+    def __init__(self, args: ModelArgs):
			
 
				+        super().__init__()
			
 
				+        self.args = args
			
 
				+        self.model_type = args.model_type
			
 
				+        self.model = LlamaModel(args)
			
 
				+        if not args.tie_word_embeddings:
			
 
				+            self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
			
 
				+
			
 
				+    def __call__(
			
 
				+        self,
			
 
				+        inputs: mx.array,
			
 
				+        cache=None,
			
 
				+    ):
			
 
				+        out = self.model(inputs, cache)
			
 
				+
			
 
				+        if self.args.shard.is_last_layer():
			
 
				+            if self.args.tie_word_embeddings:
			
 
				+                out = self.model.embed_tokens.as_linear(out)
			
 
				+            else:
			
 
				+                out = self.lm_head(out)
			
 
				+
			
 
				+        return out
			
 
				+
			
 
				+
			
 
				+    def sanitize(self, weights):
			
 
				+        # Remove unused precomputed rotary freqs
			
 
				+        return {
			
 
				+            k: v for k, v in weights.items() if "self_attn.rotary_emb.inv_freq" not in k
			
 
				+        }
			
 
				+
			
 
				+    @property
			
 
				+    def layers(self):
			
 
				+        return self.model.layers
			
 
				+
			
 
				+    @property
			
 
				+    def head_dim(self):
			
 
				+        return self.args.hidden_size // self.args.num_attention_heads
			
 
				+
			
 
				+    @property
			
 
				+    def n_kv_heads(self):
			
 
				+        return self.args.num_key_value_heads
			
 
				+
			
--- a/inference/mlx/sharded_inference_engine.py
+++ b/inference/mlx/sharded_inference_engine.py
@@ -0,0 +1,37 @@
 
				+import mlx.nn as nn
			
 
				+import numpy as np
			
 
				+import mlx.core as mx
			
 
				+from ..inference_engine import InferenceEngine
			
 
				+from .sharded_model import StatefulShardedModel
			
 
				+from .sharded_utils import load_shard
			
 
				+from ..shard import Shard
			
 
				+
			
 
				+class MLXFixedShardInferenceEngine(InferenceEngine):
			
 
				+    def __init__(self, model_path: str, shard: Shard):
			
 
				+        print("initializing fixed shard inference", shard)
			
 
				+        self.shard = shard
			
 
				+        model_shard, self.tokenizer = load_shard(model_path, shard)
			
 
				+        self.stateful_sharded_model = StatefulShardedModel(shard, model_shard)
			
 
				+
			
 
				+    async def infer_prompt(self, shard: Shard, prompt: str) -> np.ndarray:
			
 
				+        if shard != self.shard:
			
 
				+            raise ValueError(f"Shard mismatch: {shard} != {self.shard}")
			
 
				+
			
 
				+        output_data = self.stateful_sharded_model.step(mx.array(self.tokenizer.encode(prompt)))
			
 
				+        return np.array(output_data)
			
 
				+
			
 
				+    async def infer_shard(self, shard: Shard, input_data: np.ndarray) -> np.ndarray:
			
 
				+        if shard != self.shard:
			
 
				+            raise ValueError(f"Shard mismatch: {shard} != {self.shard}")
			
 
				+
			
 
				+        print("infer_shard", shard, input_data)
			
 
				+
			
 
				+        output_data = self.stateful_sharded_model.step(mx.array(input_data))
			
 
				+        return np.array(output_data)
			
 
				+
			
 
				+    async def reset_shard(self, shard: Shard):
			
 
				+        if shard != self.shard:
			
 
				+            raise ValueError(f"Shard mismatch: {shard} != {self.shard}")
			
 
				+
			
 
				+        print(f"Resetting shard: {shard}")
			
 
				+        self.stateful_sharded_model.reset()
			
--- a/inference/mlx/sharded_model.py
+++ b/inference/mlx/sharded_model.py
@@ -0,0 +1,56 @@
 
				+from typing import Dict, Generator, Optional, Tuple
			
 
				+
			
 
				+import mlx.core as mx
			
 
				+import mlx.nn as nn
			
 
				+from mlx_lm.models.base import KVCache
			
 
				+from mlx_lm.sample_utils import top_p_sampling
			
 
				+
			
 
				+from ..shard import Shard
			
 
				+
			
 
				+class StatefulShardedModel:
			
 
				+    def __init__(self, shard: Shard, model: nn.Module):
			
 
				+        self.shard = shard
			
 
				+        self.model = model
			
 
				+        self.reset()
			
 
				+
			
 
				+    def step(
			
 
				+        self,
			
 
				+        x,
			
 
				+        temp: float = 0.0,
			
 
				+        top_p: float = 1.0,
			
 
				+        logit_bias: Optional[Dict[int, float]] = None,
			
 
				+    ) -> Generator[Tuple[mx.array, mx.array], None, None]:
			
 
				+        def sample(logits: mx.array) -> Tuple[mx.array, float]:
			
 
				+            if logit_bias:
			
 
				+                indices = mx.array(list(logit_bias.keys()))
			
 
				+                values = mx.array(list(logit_bias.values()))
			
 
				+                logits[:, indices] += values
			
 
				+
			
 
				+            if temp == 0:
			
 
				+                token = mx.argmax(logits, axis=-1)
			
 
				+            else:
			
 
				+                if top_p > 0 and top_p < 1.0:
			
 
				+                    token = top_p_sampling(logits, top_p, temp)
			
 
				+                else:
			
 
				+                    token = mx.random.categorical(logits * (1 / temp))
			
 
				+
			
 
				+            return token
			
 
				+
			
 
				+        y = x
			
 
				+
			
 
				+        output = self.model(y[None] if self.shard.is_first_layer() else y, cache=self.cache)
			
 
				+
			
 
				+        if self.shard.is_last_layer():
			
 
				+            logits = output[:, -1, :]
			
 
				+            y = sample(logits)
			
 
				+            return y
			
 
				+        else:
			
 
				+            return output
			
 
				+
			
 
				+    def reset(self):
			
 
				+        kv_heads = (
			
 
				+            [self.model.n_kv_heads] * len(self.model.layers)
			
 
				+            if isinstance(self.model.n_kv_heads, int)
			
 
				+            else self.model.n_kv_heads
			
 
				+        )
			
 
				+        self.cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
			
--- a/inference/mlx/sharded_utils.py
+++ b/inference/mlx/sharded_utils.py
@@ -0,0 +1,230 @@
 
				+# Adapted from https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/utils.py
			
 
				+
			
 
				+import glob
			
 
				+import importlib
			
 
				+import json
			
 
				+import logging
			
 
				+from pathlib import Path
			
 
				+from typing import Optional, Tuple
			
 
				+
			
 
				+import mlx.core as mx
			
 
				+import mlx.nn as nn
			
 
				+from huggingface_hub import snapshot_download
			
 
				+from huggingface_hub.utils._errors import RepositoryNotFoundError
			
 
				+from mlx.utils import tree_flatten
			
 
				+from transformers import PreTrainedTokenizer
			
 
				+
			
 
				+from mlx_lm.tokenizer_utils import load_tokenizer, TokenizerWrapper
			
 
				+from mlx_lm.tuner.utils import apply_lora_layers
			
 
				+
			
 
				+from ..shard import Shard
			
 
				+
			
 
				+class ModelNotFoundError(Exception):
			
 
				+    def __init__(self, message):
			
 
				+        self.message = message
			
 
				+        super().__init__(self.message)
			
 
				+
			
 
				+MODEL_REMAPPING = {
			
 
				+    "mistral": "llama",  # mistral is compatible with llama
			
 
				+    "phi-msft": "phixtral",
			
 
				+}
			
 
				+
			
 
				+def _get_classes(config: dict):
			
 
				+    """
			
 
				+    Retrieve the model and model args classes based on the configuration.
			
 
				+
			
 
				+    Args:
			
 
				+        config (dict): The model configuration.
			
 
				+
			
 
				+    Returns:
			
 
				+        A tuple containing the Model class and the ModelArgs class.
			
 
				+    """
			
 
				+    model_type = config["model_type"]
			
 
				+    model_type = MODEL_REMAPPING.get(model_type, model_type)
			
 
				+    try:
			
 
				+        arch = importlib.import_module(f"inference.mlx.models.{model_type}")
			
 
				+    except ImportError:
			
 
				+        msg = f"Model type {model_type} not supported."
			
 
				+        logging.error(msg)
			
 
				+        raise ValueError(msg)
			
 
				+
			
 
				+    return arch.Model, arch.ModelArgs
			
 
				+
			
 
				+def load_config(model_path: Path) -> dict:
			
 
				+    try:
			
 
				+        with open(model_path / "config.json", "r") as f:
			
 
				+            config = json.load(f)
			
 
				+    except FileNotFoundError:
			
 
				+        logging.error(f"Config file not found in {model_path}")
			
 
				+        raise
			
 
				+    return config
			
 
				+
			
 
				+def load_model_shard(
			
 
				+    model_path: Path,
			
 
				+    shard: Shard,
			
 
				+    lazy: bool = False,
			
 
				+    model_config: dict = {},
			
 
				+) -> nn.Module:
			
 
				+    """
			
 
				+    Load and initialize the model from a given path.
			
 
				+
			
 
				+    Args:
			
 
				+        model_path (Path): The path to load the model from.
			
 
				+        lazy (bool): If False eval the model parameters to make sure they are
			
 
				+            loaded in memory before returning, otherwise they will be loaded
			
 
				+            when needed. Default: ``False``
			
 
				+        model_config(dict, optional): Configuration parameters for the model.
			
 
				+            Defaults to an empty dictionary.
			
 
				+
			
 
				+    Returns:
			
 
				+        nn.Module: The loaded and initialized model.
			
 
				+
			
 
				+    Raises:
			
 
				+        FileNotFoundError: If the weight files (.safetensors) are not found.
			
 
				+        ValueError: If the model class or args class are not found or cannot be instantiated.
			
 
				+    """
			
 
				+
			
 
				+    config = load_config(model_path)
			
 
				+    config.update(model_config)
			
 
				+
			
 
				+    # TODO hack
			
 
				+    config["model_type"] = f"sharded_{config['model_type']}"
			
 
				+    config["shard"] = {
			
 
				+        "model_id": model_path.name,
			
 
				+        "start_layer": shard.start_layer,
			
 
				+        "end_layer": shard.end_layer,
			
 
				+        "n_layers": shard.n_layers
			
 
				+    }
			
 
				+
			
 
				+    weight_files = glob.glob(str(model_path / "model*.safetensors"))
			
 
				+
			
 
				+    if not weight_files:
			
 
				+        # Try weight for back-compat
			
 
				+        weight_files = glob.glob(str(model_path / "weight*.safetensors"))
			
 
				+
			
 
				+    if not weight_files:
			
 
				+        logging.error(f"No safetensors found in {model_path}")
			
 
				+        raise FileNotFoundError(f"No safetensors found in {model_path}")
			
 
				+
			
 
				+    weights = {}
			
 
				+    for wf in weight_files:
			
 
				+        weights.update(mx.load(wf))
			
 
				+
			
 
				+    model_class, model_args_class = _get_classes(config=config)
			
 
				+
			
 
				+    model_args = model_args_class.from_dict(config)
			
 
				+    model = model_class(model_args)
			
 
				+
			
 
				+    if hasattr(model, "sanitize"):
			
 
				+        weights = model.sanitize(weights)
			
 
				+
			
 
				+    if (quantization := config.get("quantization", None)) is not None:
			
 
				+        # Handle legacy models which may not have everything quantized
			
 
				+        def class_predicate(p, m):
			
 
				+            if not hasattr(m, "to_quantized"):
			
 
				+                return False
			
 
				+            return f"{p}.scales" in weights
			
 
				+
			
 
				+        nn.quantize(
			
 
				+            model,
			
 
				+            **quantization,
			
 
				+            class_predicate=class_predicate,
			
 
				+        )
			
 
				+
			
 
				+    filtered_weights = {}
			
 
				+    for k, v in weights.items():
			
 
				+        if k.startswith("model.layers."):
			
 
				+            layer_num = int(k.split('.')[2])
			
 
				+            if shard.start_layer <= layer_num <= shard.end_layer:
			
 
				+                new_key = f"model.layers.{layer_num - shard.start_layer}." + '.'.join(k.split('.')[3:])
			
 
				+                filtered_weights[new_key] = v
			
 
				+        else:
			
 
				+            filtered_weights[k] = v
			
 
				+    weights = filtered_weights
			
 
				+
			
 
				+    model.load_weights(list(weights.items()), strict=False)
			
 
				+
			
 
				+    if not lazy:
			
 
				+        mx.eval(model.parameters())
			
 
				+
			
 
				+    model.eval()
			
 
				+    return model
			
 
				+
			
 
				+def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path:
			
 
				+    """
			
 
				+    Ensures the model is available locally. If the path does not exist locally,
			
 
				+    it is downloaded from the Hugging Face Hub.
			
 
				+
			
 
				+    Args:
			
 
				+        path_or_hf_repo (str): The local path or Hugging Face repository ID of the model.
			
 
				+        revision (str, optional): A revision id which can be a branch name, a tag, or a commit hash.
			
 
				+
			
 
				+    Returns:
			
 
				+        Path: The path to the model.
			
 
				+    """
			
 
				+    model_path = Path(path_or_hf_repo)
			
 
				+    if not model_path.exists():
			
 
				+        try:
			
 
				+            model_path = Path(
			
 
				+                snapshot_download(
			
 
				+                    repo_id=path_or_hf_repo,
			
 
				+                    revision=revision,
			
 
				+                    allow_patterns=[
			
 
				+                        "*.json",
			
 
				+                        "*.safetensors",
			
 
				+                        "*.py",
			
 
				+                        "tokenizer.model",
			
 
				+                        "*.tiktoken",
			
 
				+                        "*.txt",
			
 
				+                    ],
			
 
				+                )
			
 
				+            )
			
 
				+        except RepositoryNotFoundError:
			
 
				+            raise ModelNotFoundError(
			
 
				+                f"Model not found for path or HF repo: {path_or_hf_repo}.\n"
			
 
				+                "Please make sure you specified the local path or Hugging Face"
			
 
				+                " repo id correctly.\nIf you are trying to access a private or"
			
 
				+                " gated Hugging Face repo, make sure you are authenticated:\n"
			
 
				+                "https://huggingface.co/docs/huggingface_hub/en/guides/cli#huggingface-cli-login"
			
 
				+            ) from None
			
 
				+    return model_path
			
 
				+
			
 
				+
			
 
				+def load_shard(
			
 
				+    path_or_hf_repo: str,
			
 
				+    shard: Shard,
			
 
				+    tokenizer_config={},
			
 
				+    model_config={},
			
 
				+    adapter_path: Optional[str] = None,
			
 
				+    lazy: bool = False,
			
 
				+) -> Tuple[nn.Module, TokenizerWrapper]:
			
 
				+    """
			
 
				+    Load the model and tokenizer from a given path or a huggingface repository.
			
 
				+
			
 
				+    Args:
			
 
				+        path_or_hf_repo (Path): The path or the huggingface repository to load the model from.
			
 
				+        tokenizer_config (dict, optional): Configuration parameters specifically for the tokenizer.
			
 
				+            Defaults to an empty dictionary.
			
 
				+        model_config(dict, optional): Configuration parameters specifically for the model.
			
 
				+            Defaults to an empty dictionary.
			
 
				+        adapter_path (str, optional): Path to the LoRA adapters. If provided, applies LoRA layers
			
 
				+            to the model. Default: ``None``.
			
 
				+        lazy (bool): If False eval the model parameters to make sure they are
			
 
				+            loaded in memory before returning, otherwise they will be loaded
			
 
				+            when needed. Default: ``False``
			
 
				+    Returns:
			
 
				+        Tuple[nn.Module, TokenizerWrapper]: A tuple containing the loaded model and tokenizer.
			
 
				+
			
 
				+    Raises:
			
 
				+        FileNotFoundError: If config file or safetensors are not found.
			
 
				+        ValueError: If model class or args class are not found.
			
 
				+    """
			
 
				+    model_path = get_model_path(path_or_hf_repo)
			
 
				+
			
 
				+    model = load_model_shard(model_path, shard, lazy, model_config)
			
 
				+    if adapter_path is not None:
			
 
				+        model = apply_lora_layers(model, adapter_path)
			
 
				+        model.eval()
			
 
				+    tokenizer = load_tokenizer(model_path, tokenizer_config)
			
 
				+
			
 
				+    return model, tokenizer
			
--- a/inference/shard.py
+++ b/inference/shard.py
@@ -3,6 +3,12 @@ from dataclasses import dataclass
 
				 @dataclass
			
 
				 class Shard:
			
 
				     model_id: str
			
 
				-    n_layers: int
			
 
				     start_layer: int
			
 
				     end_layer: int
			
 
				+    n_layers: int
			
 
				+
			
 
				+    def is_first_layer(self) -> bool:
			
 
				+        return self.start_layer == 0
			
 
				+
			
 
				+    def is_last_layer(self) -> bool:
			
 
				+        return self.end_layer == self.n_layers - 1
			
--- a/main.py
+++ b/main.py
@@ -5,19 +5,10 @@ import mlx.core as mx
 
				 import mlx.nn as nn
			
 
				 from orchestration.standard_node import StandardNode
			
 
				 from networking.grpc.grpc_server import GRPCServer
			
 
				-from inference.inference_engine import MLXFixedShardInferenceEngine
			
 
				+from inference.mlx.sharded_inference_engine import MLXFixedShardInferenceEngine
			
 
				 from inference.shard import Shard
			
 
				 from networking.grpc.grpc_discovery import GRPCDiscovery
			
 
				 
			
 
				-class SimpleMLXModel(nn.Module):
			
 
				-    def __init__(self):
			
 
				-        super(SimpleMLXModel, self).__init__()
			
 
				-        self.linear = nn.Linear(10, 5)  # Example dimensions
			
 
				-
			
 
				-    def forward(self, x):
			
 
				-        return self.linear(x)
			
 
				-
			
 
				-
			
 
				 # parse args
			
 
				 parser = argparse.ArgumentParser(description="Initialize GRPC Discovery")
			
 
				 parser.add_argument("--node-id", type=str, default="node1", help="Node ID")
			
@@ -25,15 +16,19 @@ parser.add_argument("--node-host", type=str, default="0.0.0.0", help="Node host"
 
				 parser.add_argument("--node-port", type=int, default=8080, help="Node port")
			
 
				 parser.add_argument("--listen-port", type=int, default=5678, help="Listening port for discovery")
			
 
				 parser.add_argument("--broadcast-port", type=int, default=5678, help="Broadcast port for discovery")
			
 
				+parser.add_argument("--model-id", type=str, default="mlx-community/Meta-Llama-3-8B-Instruct-4bit", help="Path to the model")
			
 
				+parser.add_argument("--n-layers", type=int, default=32, help="Number of layers in the model")
			
 
				+parser.add_argument("--start-layer", type=int, default=0, help="Start layer index")
			
 
				+parser.add_argument("--end-layer", type=int, default=31, help="End layer index")
			
 
				 args = parser.parse_args()
			
 
				 
			
 
				-mlx_model = SimpleMLXModel()
			
 
				-inference_engine = MLXFixedShardInferenceEngine(mlx_model, shard=Shard(model_id="test", n_layers=32, start_layer=0, end_layer=31))
			
 
				+inference_engine = MLXFixedShardInferenceEngine(args.model_id, shard=Shard(model_id=args.model_id, n_layers=args.n_layers, start_layer=args.start_layer, end_layer=args.end_layer))
			
 
				 discovery = GRPCDiscovery(args.node_id, args.node_port, args.listen_port, args.broadcast_port)
			
 
				 node = StandardNode(args.node_id, None, inference_engine, discovery)
			
 
				 server = GRPCServer(node, args.node_host, args.node_port)
			
 
				 node.server = server
			
 
				 
			
 
				+
			
 
				 async def shutdown(signal, loop):
			
 
				     """Gracefully shutdown the server and close the asyncio loop."""
			
 
				     print(f"Received exit signal {signal.name}...")
			
@@ -56,10 +51,6 @@ async def main():
 
				 
			
 
				     await node.start()
			
 
				 
			
 
				-    await asyncio.sleep(5)
			
 
				-    print("Sending reset shard request")
			
 
				-    await node.peers[0].reset_shard(f"regards from {node.id}")
			
 
				-
			
 
				     await asyncio.Event().wait()
			
 
				 
			
 
				 if __name__ == "__main__":
			
--- a/networking/grpc/grpc_peer_handle.py
+++ b/networking/grpc/grpc_peer_handle.py
@@ -7,6 +7,7 @@ from . import node_service_pb2
 
				 from . import node_service_pb2_grpc
			
 
				 
			
 
				 from ..peer_handle import PeerHandle
			
 
				+from inference.shard import Shard
			
 
				 
			
 
				 class GRPCPeerHandle(PeerHandle):
			
 
				     def __init__(self, id: str, address: str):
			
@@ -23,25 +24,38 @@ class GRPCPeerHandle(PeerHandle):
 
				     async def disconnect(self):
			
 
				         await self.channel.close()
			
 
				 
			
 
				-    async def send_prompt(self, prompt: str) -> None:
			
 
				-        request = node_service_pb2.PromptRequest(prompt=prompt)
			
 
				-        await self.stub.SendPrompt(request)
			
 
				+    async def send_prompt(self, shard: Shard, prompt: str) -> Optional[np.array]:
			
 
				+        request = node_service_pb2.PromptRequest(prompt=prompt, shard=node_service_pb2.Shard(model_id=shard.model_id, start_layer=shard.start_layer, end_layer=shard.end_layer, n_layers=shard.n_layers))
			
 
				+        response = await self.stub.SendPrompt(request)
			
 
				         print(f"Sent prompt to {self.address}: {prompt}")
			
 
				 
			
 
				-    async def send_tensor(self, tensor: np.ndarray, target: Optional[str] = None) -> None:
			
 
				+        if not response.tensor_data or not response.shape or not response.dtype:
			
 
				+            return None
			
 
				+
			
 
				+        return np.frombuffer(response.tensor_data, dtype=np.dtype(response.dtype)).reshape(response.shape)
			
 
				+
			
 
				+    async def send_tensor(self, shard: Shard, tensor: np.ndarray, target: Optional[str] = None) -> Optional[np.array]:
			
 
				         request = node_service_pb2.TensorRequest(
			
 
				-            tensor_data=tensor.tobytes(),
			
 
				-            shape=tensor.shape,
			
 
				-            dtype=str(tensor.dtype),
			
 
				+            shard=node_service_pb2.Shard(model_id=shard.model_id, start_layer=shard.start_layer, end_layer=shard.end_layer, n_layers=shard.n_layers),
			
 
				+            tensor = node_service_pb2.Tensor(
			
 
				+                tensor_data=tensor.tobytes(),
			
 
				+                shape=tensor.shape,
			
 
				+                dtype=str(tensor.dtype)
			
 
				+            ),
			
 
				             target=target
			
 
				         )
			
 
				-        await self.stub.SendTensor(request)
			
 
				+        response = await self.stub.SendTensor(request)
			
 
				         if target:
			
 
				             print(f"Sent tensor to {self.address} with target {target}: shape {tensor.shape}")
			
 
				         else:
			
 
				             print(f"Sent tensor to {self.address}: shape {tensor.shape}")
			
 
				 
			
 
				-    async def reset_shard(self, shard_id: str) -> None:
			
 
				-        request = node_service_pb2.ResetShardRequest(shard_id=shard_id)
			
 
				+        if not response.tensor_data or not response.shape or not response.dtype:
			
 
				+            return None
			
 
				+
			
 
				+        return np.frombuffer(response.tensor_data, dtype=np.dtype(response.dtype)).reshape(response.shape)
			
 
				+
			
 
				+    async def reset_shard(self, shard: Shard) -> None:
			
 
				+        request = node_service_pb2.ResetShardRequest(shard=node_service_pb2.Shard(model_id=shard.model_id, start_layer=shard.start_layer, end_layer=shard.end_layer, n_layers=shard.n_layers))
			
 
				         await self.stub.ResetShard(request)
			
 
				-        print(f"Reset shard {shard_id} on {self.address}")
			
 
				+        print(f"Reset shard {shard} on {self.address}")
			
--- a/networking/grpc/grpc_server.py
+++ b/networking/grpc/grpc_server.py
@@ -4,6 +4,7 @@ import numpy as np
 
				 
			
 
				 from . import node_service_pb2
			
 
				 from . import node_service_pb2_grpc
			
 
				+from inference.shard import Shard
			
 
				 
			
 
				 from orchestration import Node
			
 
				 
			
@@ -28,30 +29,24 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
 
				             print("Server stopped")
			
 
				 
			
 
				     async def SendPrompt(self, request, context):
			
 
				+        shard = Shard(model_id=request.shard.model_id, start_layer=request.shard.start_layer, end_layer=request.shard.end_layer, n_layers=request.shard.n_layers)
			
 
				         prompt = request.prompt
			
 
				         target = request.target if request.HasField('target') else None
			
 
				-        if target and target != self.node.node_id:
			
 
				-            await self.node.process_prompt(prompt, target)
			
 
				-        else:
			
 
				-            # Process the prompt locally
			
 
				-            # You'd need to implement this method in the Node class
			
 
				-            await self.node.process_prompt(prompt)
			
 
				-        return node_service_pb2.Empty()
			
 
				+        result = await self.node.process_prompt(shard, prompt, target)
			
 
				+        tensor_data = result.tobytes() if result is not None else None
			
 
				+        return node_service_pb2.Tensor(tensor_data=tensor_data, shape=result.shape, dtype=str(result.dtype))
			
 
				 
			
 
				     async def SendTensor(self, request, context):
			
 
				-        tensor = np.frombuffer(request.tensor_data, dtype=np.dtype(request.dtype)).reshape(request.shape)
			
 
				+        shard = Shard(model_id=request.shard.model_id, start_layer=request.shard.start_layer, end_layer=request.shard.end_layer, n_layers=request.shard.n_layers)
			
 
				+        tensor = np.frombuffer(request.tensor.tensor_data, dtype=np.dtype(request.tensor.dtype)).reshape(request.tensor.shape)
			
 
				         target = request.target if request.HasField('target') else None
			
 
				-        if target and target != self.node.node_id:
			
 
				-            await self.node.process_tensor(tensor, target)
			
 
				-        else:
			
 
				-            # Process the tensor locally
			
 
				-            await self.node.inference_strategy.process_inference(tensor)
			
 
				-        return node_service_pb2.Empty()
			
 
				+        result = await self.node.process_tensor(shard, tensor, target)
			
 
				+        print("SendTensor tensor result", result)
			
 
				+        tensor_data = result.tobytes() if result is not None else None
			
 
				+        return node_service_pb2.Tensor(tensor_data=tensor_data, shape=result.shape, dtype=str(result.dtype))
			
 
				 
			
 
				     async def ResetShard(self, request, context):
			
 
				-        print(f"Received ResetShard request: {request}")
			
 
				-        # TODO
			
 
				-        # shard_id = request.shard_id
			
 
				-        # You'd need to implement this method in the Node class
			
 
				-        # await self.node.reset_shard(shard_id)
			
 
				+        shard = Shard(model_id=request.shard.model_id, start_layer=request.shard.start_layer, end_layer=request.shard.end_layer, n_layers=request.shard.n_layers)
			
 
				+        print(f"Received ResetShard request: {shard}")
			
 
				+        await self.node.reset_shard(shard)
			
 
				         return node_service_pb2.Empty()
			
--- a/networking/grpc/node_service.proto
+++ b/networking/grpc/node_service.proto
@@ -3,25 +3,38 @@ syntax = "proto3";
 
				 package node_service;
			
 
				 
			
 
				 service NodeService {
			
 
				-  rpc SendPrompt (PromptRequest) returns (Empty) {}
			
 
				-  rpc SendTensor (TensorRequest) returns (Empty) {}
			
 
				+  rpc SendPrompt (PromptRequest) returns (Tensor) {}
			
 
				+  rpc SendTensor (TensorRequest) returns (Tensor) {}
			
 
				   rpc ResetShard (ResetShardRequest) returns (Empty) {}
			
 
				 }
			
 
				 
			
 
				+message Shard {
			
 
				+  string model_id = 1;
			
 
				+  int32 start_layer = 2;
			
 
				+  int32 end_layer = 3;
			
 
				+  int32 n_layers = 4;
			
 
				+}
			
 
				+
			
 
				 message PromptRequest {
			
 
				-  string prompt = 1;
			
 
				-  optional string target = 2;
			
 
				+  Shard shard = 1;
			
 
				+  string prompt = 2;
			
 
				+  optional string target = 3;
			
 
				 }
			
 
				 
			
 
				 message TensorRequest {
			
 
				+  Shard shard = 1;
			
 
				+  Tensor tensor = 2;
			
 
				+  optional string target = 3;
			
 
				+}
			
 
				+
			
 
				+message Tensor {
			
 
				   bytes tensor_data = 1;
			
 
				   repeated int32 shape = 2;
			
 
				   string dtype = 3;
			
 
				-  optional string target = 4;
			
 
				 }
			
 
				 
			
 
				 message ResetShardRequest {
			
 
				-  string shard_id = 1;
			
 
				+  Shard shard = 1;
			
 
				 }
			
 
				 
			
 
				 message Empty {}
			
--- a/networking/grpc/node_service_pb2.py
+++ b/networking/grpc/node_service_pb2.py
@@ -14,21 +14,25 @@ _sym_db = _symbol_database.Default()
 
				 
			
 
				 
			
 
				 
			
 
				-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12node_service.proto\x12\x0cnode_service\"?\n\rPromptRequest\x12\x0e\n\x06prompt\x18\x01 \x01(\t\x12\x13\n\x06target\x18\x02 \x01(\tH\x00\x88\x01\x01\x42\t\n\x07_target\"b\n\rTensorRequest\x12\x13\n\x0btensor_data\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\x05\x12\r\n\x05\x64type\x18\x03 \x01(\t\x12\x13\n\x06target\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\t\n\x07_target\"%\n\x11ResetShardRequest\x12\x10\n\x08shard_id\x18\x01 \x01(\t\"\x07\n\x05\x45mpty2\xd7\x01\n\x0bNodeService\x12@\n\nSendPrompt\x12\x1b.node_service.PromptRequest\x1a\x13.node_service.Empty\"\x00\x12@\n\nSendTensor\x12\x1b.node_service.TensorRequest\x1a\x13.node_service.Empty\"\x00\x12\x44\n\nResetShard\x12\x1f.node_service.ResetShardRequest\x1a\x13.node_service.Empty\"\x00\x62\x06proto3')
			
 
				+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x12node_service.proto\x12\x0cnode_service\"S\n\x05Shard\x12\x10\n\x08model_id\x18\x01 \x01(\t\x12\x13\n\x0bstart_layer\x18\x02 \x01(\x05\x12\x11\n\tend_layer\x18\x03 \x01(\x05\x12\x10\n\x08n_layers\x18\x04 \x01(\x05\"c\n\rPromptRequest\x12\"\n\x05shard\x18\x01 \x01(\x0b\x32\x13.node_service.Shard\x12\x0e\n\x06prompt\x18\x02 \x01(\t\x12\x13\n\x06target\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\t\n\x07_target\"y\n\rTensorRequest\x12\"\n\x05shard\x18\x01 \x01(\x0b\x32\x13.node_service.Shard\x12$\n\x06tensor\x18\x02 \x01(\x0b\x32\x14.node_service.Tensor\x12\x13\n\x06target\x18\x03 \x01(\tH\x00\x88\x01\x01\x42\t\n\x07_target\";\n\x06Tensor\x12\x13\n\x0btensor_data\x18\x01 \x01(\x0c\x12\r\n\x05shape\x18\x02 \x03(\x05\x12\r\n\x05\x64type\x18\x03 \x01(\t\"7\n\x11ResetShardRequest\x12\"\n\x05shard\x18\x01 \x01(\x0b\x32\x13.node_service.Shard\"\x07\n\x05\x45mpty2\xd9\x01\n\x0bNodeService\x12\x41\n\nSendPrompt\x12\x1b.node_service.PromptRequest\x1a\x14.node_service.Tensor\"\x00\x12\x41\n\nSendTensor\x12\x1b.node_service.TensorRequest\x1a\x14.node_service.Tensor\"\x00\x12\x44\n\nResetShard\x12\x1f.node_service.ResetShardRequest\x1a\x13.node_service.Empty\"\x00\x62\x06proto3')
			
 
				 
			
 
				 _globals = globals()
			
 
				 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
			
 
				 _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'node_service_pb2', _globals)
			
 
				 if not _descriptor._USE_C_DESCRIPTORS:
			
 
				   DESCRIPTOR._loaded_options = None
			
 
				-  _globals['_PROMPTREQUEST']._serialized_start=36
			
 
				-  _globals['_PROMPTREQUEST']._serialized_end=99
			
 
				-  _globals['_TENSORREQUEST']._serialized_start=101
			
 
				-  _globals['_TENSORREQUEST']._serialized_end=199
			
 
				-  _globals['_RESETSHARDREQUEST']._serialized_start=201
			
 
				-  _globals['_RESETSHARDREQUEST']._serialized_end=238
			
 
				-  _globals['_EMPTY']._serialized_start=240
			
 
				-  _globals['_EMPTY']._serialized_end=247
			
 
				-  _globals['_NODESERVICE']._serialized_start=250
			
 
				-  _globals['_NODESERVICE']._serialized_end=465
			
 
				+  _globals['_SHARD']._serialized_start=36
			
 
				+  _globals['_SHARD']._serialized_end=119
			
 
				+  _globals['_PROMPTREQUEST']._serialized_start=121
			
 
				+  _globals['_PROMPTREQUEST']._serialized_end=220
			
 
				+  _globals['_TENSORREQUEST']._serialized_start=222
			
 
				+  _globals['_TENSORREQUEST']._serialized_end=343
			
 
				+  _globals['_TENSOR']._serialized_start=345
			
 
				+  _globals['_TENSOR']._serialized_end=404
			
 
				+  _globals['_RESETSHARDREQUEST']._serialized_start=406
			
 
				+  _globals['_RESETSHARDREQUEST']._serialized_end=461
			
 
				+  _globals['_EMPTY']._serialized_start=463
			
 
				+  _globals['_EMPTY']._serialized_end=470
			
 
				+  _globals['_NODESERVICE']._serialized_start=473
			
 
				+  _globals['_NODESERVICE']._serialized_end=690
			
 
				 # @@protoc_insertion_point(module_scope)
			
--- a/networking/grpc/node_service_pb2_grpc.py
+++ b/networking/grpc/node_service_pb2_grpc.py
@@ -42,12 +42,12 @@ class NodeServiceStub(object):
 
				         self.SendPrompt = channel.unary_unary(
			
 
				                 '/node_service.NodeService/SendPrompt',
			
 
				                 request_serializer=node__service__pb2.PromptRequest.SerializeToString,
			
 
				-                response_deserializer=node__service__pb2.Empty.FromString,
			
 
				+                response_deserializer=node__service__pb2.Tensor.FromString,
			
 
				                 _registered_method=True)
			
 
				         self.SendTensor = channel.unary_unary(
			
 
				                 '/node_service.NodeService/SendTensor',
			
 
				                 request_serializer=node__service__pb2.TensorRequest.SerializeToString,
			
 
				-                response_deserializer=node__service__pb2.Empty.FromString,
			
 
				+                response_deserializer=node__service__pb2.Tensor.FromString,
			
 
				                 _registered_method=True)
			
 
				         self.ResetShard = channel.unary_unary(
			
 
				                 '/node_service.NodeService/ResetShard',
			
@@ -83,12 +83,12 @@ def add_NodeServiceServicer_to_server(servicer, server):
 
				             'SendPrompt': grpc.unary_unary_rpc_method_handler(
			
 
				                     servicer.SendPrompt,
			
 
				                     request_deserializer=node__service__pb2.PromptRequest.FromString,
			
 
				-                    response_serializer=node__service__pb2.Empty.SerializeToString,
			
 
				+                    response_serializer=node__service__pb2.Tensor.SerializeToString,
			
 
				             ),
			
 
				             'SendTensor': grpc.unary_unary_rpc_method_handler(
			
 
				                     servicer.SendTensor,
			
 
				                     request_deserializer=node__service__pb2.TensorRequest.FromString,
			
 
				-                    response_serializer=node__service__pb2.Empty.SerializeToString,
			
 
				+                    response_serializer=node__service__pb2.Tensor.SerializeToString,
			
 
				             ),
			
 
				             'ResetShard': grpc.unary_unary_rpc_method_handler(
			
 
				                     servicer.ResetShard,
			
@@ -122,7 +122,7 @@ class NodeService(object):
 
				             target,
			
 
				             '/node_service.NodeService/SendPrompt',
			
 
				             node__service__pb2.PromptRequest.SerializeToString,
			
 
				-            node__service__pb2.Empty.FromString,
			
 
				+            node__service__pb2.Tensor.FromString,
			
 
				             options,
			
 
				             channel_credentials,
			
 
				             insecure,
			
@@ -149,7 +149,7 @@ class NodeService(object):
 
				             target,
			
 
				             '/node_service.NodeService/SendTensor',
			
 
				             node__service__pb2.TensorRequest.SerializeToString,
			
 
				-            node__service__pb2.Empty.FromString,
			
 
				+            node__service__pb2.Tensor.FromString,
			
 
				             options,
			
 
				             channel_credentials,
			
 
				             insecure,
			
--- a/networking/peer_handle.py
+++ b/networking/peer_handle.py
@@ -1,5 +1,7 @@
 
				 from abc import ABC, abstractmethod
			
 
				-from typing import Any
			
 
				+from typing import Optional
			
 
				+import numpy as np
			
 
				+from inference.shard import Shard
			
 
				 
			
 
				 class PeerHandle(ABC):
			
 
				     def id(self) -> str:
			
@@ -14,13 +16,13 @@ class PeerHandle(ABC):
 
				         pass
			
 
				 
			
 
				     @abstractmethod
			
 
				-    async def send_prompt(self, prompt: str) -> None:
			
 
				+    async def send_prompt(self, shard: Shard, prompt: str) -> Optional[np.array]:
			
 
				         pass
			
 
				 
			
 
				     @abstractmethod
			
 
				-    async def send_tensor(self, tensor: Any) -> None:
			
 
				+    async def send_tensor(self, shard: Shard, tensor: np.array) -> Optional[np.array]:
			
 
				         pass
			
 
				 
			
 
				     @abstractmethod
			
 
				-    async def reset_shard(self, shard_id: str) -> None:
			
 
				+    async def reset_shard(self, shard: Shard) -> None:
			
 
				         pass
			
--- a/orchestration/node.py
+++ b/orchestration/node.py
@@ -1,10 +1,11 @@
 
				 from typing import Optional
			
 
				 import numpy as np
			
 
				 from abc import ABC, abstractmethod
			
 
				+from inference.shard import Shard
			
 
				 
			
 
				 class Node(ABC):
			
 
				     @abstractmethod
			
 
				-    def start(self) -> None:
			
 
				+    def start(self, wait_for_peers: int = 0) -> None:
			
 
				         pass
			
 
				 
			
 
				     @abstractmethod
			
@@ -12,13 +13,13 @@ class Node(ABC):
 
				         pass
			
 
				 
			
 
				     @abstractmethod
			
 
				-    def process_tensor(self, tensor: np.ndarray, target: Optional[str] = None) -> None:
			
 
				+    def process_tensor(self, shard: Shard, tensor: np.ndarray, target: Optional[str] = None) -> None:
			
 
				         pass
			
 
				 
			
 
				     @abstractmethod
			
 
				-    def process_prompt(self, prompt: str, target: Optional[str] = None) -> None:
			
 
				+    def process_prompt(self, shard: Shard, prompt: str, target: Optional[str] = None) -> None:
			
 
				         pass
			
 
				 
			
 
				     @abstractmethod
			
 
				-    def reset_shard(self, shard_id: str) -> None:
			
 
				+    def reset_shard(self, shard: Shard) -> None:
			
 
				         pass
			
--- a/orchestration/standard_node.py
+++ b/orchestration/standard_node.py
@@ -13,10 +13,10 @@ class StandardNode(Node):
 
				         self.peers: List[PeerHandle] = {}
			
 
				         self.ring_order: List[str] = []
			
 
				 
			
 
				-    async def start(self) -> None:
			
 
				+    async def start(self, wait_for_peers: int = 0) -> None:
			
 
				         await self.server.start()
			
 
				         await self.discovery.start()
			
 
				-        self.peers = await self.discovery.discover_peers()
			
 
				+        self.peers = await self.discovery.discover_peers(wait_for_peers)
			
 
				         print(f"Starting with the following peers: {self.peers}")
			
 
				         print("Connecting to peers...")
			
 
				         for peer in self.peers:
			
@@ -27,19 +27,35 @@ class StandardNode(Node):
 
				         await self.discovery.stop()
			
 
				         await self.server.stop()
			
 
				 
			
 
				-    async def process_tensor(self, tensor: np.ndarray, target: Optional[str] = None) -> None:
			
 
				-        result = await self.inference_engine.process_shard(tensor)
			
 
				-
			
 
				+    async def process_prompt(self, shard: Shard, prompt: str, target: Optional[str] = None) -> Optional[np.array]:
			
 
				+        print("Process prompt", shard, prompt, target)
			
 
				+        result = await self.inference_engine.infer_prompt(shard, prompt)
			
 
				+        # Implement prompt processing logic
			
 
				+        print(f"Got result from prompt: {prompt}. Result: {result}")
			
 
				+        # You might want to initiate inference here
			
 
				         if target:
			
 
				-            if not filter(lambda p: p.id() == target, self.peers):
			
 
				+            target_peer = next((p for p in self.peers if p.id() == target), None)
			
 
				+            if not target_peer:
			
 
				                 raise ValueError(f"Peer {target} not found")
			
 
				 
			
 
				-            await self.peers[target].send_tensor(result)
			
 
				+            await target_peer.send_tensor(result)
			
 
				 
			
 
				-    async def process_prompt(self, prompt: str) -> None:
			
 
				+        return result
			
 
				+
			
 
				+    async def process_tensor(self, shard: Shard, tensor: np.ndarray, target: Optional[str] = None) -> None:
			
 
				+        print("Process tensor", shard, tensor)
			
 
				+        result = await self.inference_engine.infer_shard(shard, tensor)
			
 
				         # Implement prompt processing logic
			
 
				-        print(f"Processing prompt: {prompt}")
			
 
				-        # You might want to initiate inference here
			
 
				+        print(f"Got result from prompt: {len(tensor)}. Result: {result}")
			
 
				+
			
 
				+        if target:
			
 
				+            target_peer = next((p for p in self.peers if p.id() == target), None)
			
 
				+            if not target_peer:
			
 
				+                raise ValueError(f"Peer {target} not found")
			
 
				+
			
 
				+            await target_peer.send_tensor(result)
			
 
				+
			
 
				+        return result
			
 
				 
			
 
				     async def reset_shard(self, shard: Shard) -> None:
			
 
				         # Implement shard reset logic