8 months ago · e7201292de
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -223,30 +223,6 @@ jobs:
 
				       - checkout
			
 
				       - run: system_profiler SPHardwareDataType
			
 
				 
			
 
				-  # chatgpt_api_integration_test_tinygrad:
			
 
				-  #   macos:
			
 
				-  #     xcode: "16.0.0"
			
 
				-  #   resource_class: m2pro.large
			
 
				-  #   steps:
			
 
				-  #     - checkout
			
 
				-  #     - run:
			
 
				-  #         name: Set up Python
			
 
				-  #         command: |
			
 
				-  #           brew install python@3.12
			
 
				-  #           python3.12 -m venv env
			
 
				-  #           source env/bin/activate
			
 
				-  #     - run:
			
 
				-  #         name: Install dependencies
			
 
				-  #         command: |
			
 
				-  #           source env/bin/activate
			
 
				-  #           pip install --upgrade pip
			
 
				-  #           pip install .
			
 
				-  #     - run_chatgpt_api_test:
			
 
				-  #         inference_engine: tinygrad
			
 
				-  #         model_id: llama-3-8b
			
 
				-  #         prompt: "Keep responses concise. Who was the king of pop?"
			
 
				-  #         expected_output: "Michael Jackson"
			
 
				-
			
 
				 workflows:
			
 
				   version: 2
			
 
				   build_and_test:
			
@@ -256,4 +232,3 @@ workflows:
 
				       - chatgpt_api_integration_test_mlx
			
 
				       - chatgpt_api_integration_test_dummy
			
 
				       - test_macos_m1
			
 
				-      # - chatgpt_api_integration_test_tinygrad
			
--- a/exo/inference/debug_inference_engine.py
+++ b/exo/inference/debug_inference_engine.py
@@ -1,59 +0,0 @@
 
				-from exo.inference.inference_engine import InferenceEngine
			
 
				-from exo.inference.shard import Shard
			
 
				-from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
			
 
				-import asyncio
			
 
				-import numpy as np
			
 
				-
			
 
				-
			
 
				-# An inference engine should work the same for any number of Shards, as long as the Shards are continuous.
			
 
				-async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str):
			
 
				-  from exo.inference.tinygrad.inference import Tokenizer
			
 
				-  from pathlib import Path
			
 
				-
			
 
				-  _tokenizer = Tokenizer(str(Path(model_id)/"tokenizer.model"))
			
 
				-
			
 
				-  prompt = "In a single word only, what is the last name of the president of the United States? "
			
 
				-  resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
			
 
				-  next_resp_full, _next_inference_state_full, _ = await inference_engine_1.infer_tensor(
			
 
				-    "A",
			
 
				-    shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32),
			
 
				-    input_data=resp_full,
			
 
				-    inference_state=inference_state_full,
			
 
				-  )
			
 
				-
			
 
				-  resp1, inference_state_1, _ = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
			
 
				-  resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(
			
 
				-    "B",
			
 
				-    shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
			
 
				-    input_data=resp1,
			
 
				-    inference_state=inference_state_1,
			
 
				-  )
			
 
				-  resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(
			
 
				-    "B",
			
 
				-    shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32),
			
 
				-    input_data=resp2,
			
 
				-    inference_state=inference_state_2,
			
 
				-  )
			
 
				-  resp4, _inference_state_4, _ = await inference_engine_2.infer_tensor(
			
 
				-    "B",
			
 
				-    shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
			
 
				-    input_data=resp3,
			
 
				-    inference_state=inference_state_3,
			
 
				-  )
			
 
				-
			
 
				-  print(f"{resp2=}")
			
 
				-  print(f"full: {_tokenizer.decode(resp_full)}")
			
 
				-  print(f"next full: {_tokenizer.decode(next_resp_full)}")
			
 
				-  print(f"resp2: {_tokenizer.decode(resp2)}")
			
 
				-  print(f"{resp4=}")
			
 
				-  print(f"resp4: {_tokenizer.decode(resp4)}")
			
 
				-
			
 
				-  assert np.array_equal(resp_full, resp2)
			
 
				-  assert np.array_equal(next_resp_full, resp4)
			
 
				-
			
 
				-
			
 
				-asyncio.run(test_inference_engine(
			
 
				-  TinygradDynamicShardInferenceEngine(),
			
 
				-  TinygradDynamicShardInferenceEngine(),
			
 
				-  "llama3-8b-sfr",
			
 
				-))
			
--- a/exo/inference/inference_engine.py
+++ b/exo/inference/inference_engine.py
@@ -15,22 +15,3 @@ class InferenceEngine(ABC):
 
				   @abstractmethod
			
 
				   async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
			
 
				     pass
			
 
				-
			
 
				-
			
 
				-def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDownloader'):
			
 
				-  if DEBUG >= 2:
			
 
				-    print(f"get_inference_engine called with: {inference_engine_name}")
			
 
				-  if inference_engine_name == "mlx":
			
 
				-    from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
			
 
				-
			
 
				-    return MLXDynamicShardInferenceEngine(shard_downloader)
			
 
				-  elif inference_engine_name == "tinygrad":
			
 
				-    from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
			
 
				-    import tinygrad.helpers
			
 
				-    tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))
			
 
				-
			
 
				-    return TinygradDynamicShardInferenceEngine(shard_downloader)
			
 
				-  elif inference_engine_name == "dummy":
			
 
				-    from exo.inference.dummy_inference_engine import DummyInferenceEngine
			
 
				-    return DummyInferenceEngine()
			
 
				-  raise ValueError(f"Unsupported inference engine: {inference_engine_name}")
			
--- a/exo/inference/mlx/test_sharded_llava.py
+++ b/exo/inference/mlx/test_sharded_llava.py
@@ -1,64 +0,0 @@
 
				-import codecs
			
 
				-import asyncio
			
 
				-import requests
			
 
				-from PIL import Image
			
 
				-from io import BytesIO
			
 
				-
			
 
				-import mlx.core as mx
			
 
				-from mlx_lm.models.base import KVCache
			
 
				-
			
 
				-from exo.inference.mlx.sharded_model import StatefulShardedModel
			
 
				-from exo.inference.mlx.sharded_utils import load_shard
			
 
				-from exo.inference.shard import Shard
			
 
				-
			
 
				-shard_full = Shard("llava", 0, 31, 32)
			
 
				-shard1 = Shard("llava", 0, 12, 32)
			
 
				-shard2 = Shard("llava", 13, 31, 32)
			
 
				-
			
 
				-model_path = "llava-hf/llava-1.5-7b-hf"
			
 
				-
			
 
				-full_model_shard, full_processor = asyncio.run(load_shard(model_path, shard=shard_full))
			
 
				-model_shard1, processor1 = asyncio.run(load_shard(model_path, shard=shard1))
			
 
				-model_shard2, processor2 = asyncio.run(load_shard(model_path, shard=shard2))
			
 
				-
			
 
				-full = StatefulShardedModel(shard_full, full_model_shard)
			
 
				-m1 = StatefulShardedModel(shard1, model_shard1)
			
 
				-m2 = StatefulShardedModel(shard2, model_shard2)
			
 
				-
			
 
				-PROMPT = "USER: <image>\nWhat are these?\nASSISTANT:"
			
 
				-IMAGE_FILE = "http://images.cocodataset.org/val2017/000000039769.jpg"
			
 
				-response = requests.get(IMAGE_FILE)
			
 
				-img = Image.open(BytesIO(response.content))
			
 
				-prompt = codecs.decode(PROMPT, "unicode_escape")
			
 
				-inputs = full_processor(prompt, img, return_tensors="np")
			
 
				-pixel_values = mx.array(inputs["pixel_values"])
			
 
				-input_ids = mx.array(inputs["input_ids"])
			
 
				-
			
 
				-print(prompt)
			
 
				-y = full.step("full", input_ids, pixel_values, temp=0)
			
 
				-full_generated_tokens = [y.item()]
			
 
				-
			
 
				-for _ in range(13):
			
 
				-  y = full.step("full", y, temp=0)
			
 
				-  full_generated_tokens.append(y.item())
			
 
				-
			
 
				-full_response = full_processor.tokenizer.decode(full_generated_tokens)
			
 
				-print("full response:", full_response)
			
 
				-
			
 
				-inputs = processor1(prompt, img, return_tensors="np")
			
 
				-pixel_values = mx.array(inputs["pixel_values"])
			
 
				-input_ids = mx.array(inputs["input_ids"])
			
 
				-
			
 
				-y = m1.step("shard", input_ids, pixel_values, temp=0)
			
 
				-y = m2.step("shard", y, temp=0)
			
 
				-full_generated_tokens = [y.item()]
			
 
				-
			
 
				-for _ in range(13):
			
 
				-  y = m1.step("shard", y, temp=0)
			
 
				-  y = m2.step("shard", y, temp=0)
			
 
				-  full_generated_tokens.append(y.item())
			
 
				-
			
 
				-sharded_response = processor2.tokenizer.decode(full_generated_tokens)
			
 
				-print("sharded response:", sharded_response)
			
 
				-
			
 
				-assert full_response == sharded_response
			
--- a/exo/inference/test_inference_engine.py
+++ b/exo/inference/test_inference_engine.py
@@ -45,12 +45,3 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
 
				 
			
 
				 
			
 
				 asyncio.run(test_inference_engine(MLXDynamicShardInferenceEngine(HFShardDownloader()), MLXDynamicShardInferenceEngine(HFShardDownloader()), "mlx-community/Llama-3.2-1B-Instruct-4bit", 16))
			
 
				-
			
 
				-if os.getenv("RUN_TINYGRAD", default="0") == "1":
			
 
				-  import tinygrad
			
 
				-  import os
			
 
				-  from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
			
 
				-  tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))
			
 
				-  asyncio.run(
			
 
				-    test_inference_engine(TinygradDynamicShardInferenceEngine(HFShardDownloader()), TinygradDynamicShardInferenceEngine(HFShardDownloader()), "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R", 32)
			
 
				-  )
			
--- a/exo/inference/tinygrad/__init__.py
+++ b/exo/inference/tinygrad/__init__.py
--- a/exo/inference/tinygrad/inference.py
+++ b/exo/inference/tinygrad/inference.py
@@ -1,101 +0,0 @@
 
				-from pathlib import Path
			
 
				-import json
			
 
				-import os
			
 
				-from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16
			
 
				-from exo.inference.shard import Shard
			
 
				-from exo.inference.tokenizers import resolve_tokenizer
			
 
				-from tinygrad.nn.state import load_state_dict
			
 
				-from tinygrad import Tensor, nn, Context
			
 
				-from exo.inference.inference_engine import InferenceEngine
			
 
				-from typing import Optional, Tuple
			
 
				-import numpy as np
			
 
				-from exo.inference.tinygrad.tinygrad_helpers import concat_weights, load
			
 
				-from exo.download.shard_download import ShardDownloader
			
 
				-from concurrent.futures import ThreadPoolExecutor
			
 
				-import asyncio
			
 
				-
			
 
				-Tensor.no_grad = True
			
 
				-# default settings
			
 
				-TEMPERATURE = int(os.getenv("TEMPERATURE", 0.85))
			
 
				-TOP_K = 25
			
 
				-TOP_P = 0.9
			
 
				-ALPHA_F = 0.1
			
 
				-ALPHA_P = 0.0
			
 
				-MODEL_PARAMS = {
			
 
				-  "8B": {"args": {"dim": 4096, "n_heads": 32, "n_kv_heads": 8, "n_layers": 32, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 14336}, "files": 1},
			
 
				-  "70B": {"args": {"dim": 8192, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 28672}, "files": 8}
			
 
				-}
			
 
				-
			
 
				-
			
 
				-def build_transformer(model_path: Path, shard: Shard, model_size="8B", device=None):
			
 
				-  # build model
			
 
				-  linear = nn.Linear
			
 
				-  with Context(THREEFRY=0):
			
 
				-    model = Transformer(**MODEL_PARAMS[model_size]["args"], linear=linear, max_context=8192, jit=True, shard=shard)
			
 
				-
			
 
				-  # load weights
			
 
				-  if model_path.is_dir():
			
 
				-    if (model_path/"model.safetensors.index.json").exists(): weights = load(str(model_path/"model.safetensors.index.json"), shard)
			
 
				-    elif (model_path/"model.safetensors").exists(): weights = load(str(model_path/"model.safetensors"), shard)
			
 
				-    else: weights = concat_weights([load(str(model_path/f"consolidated.{i:02d}.pth"), shard) for i in range(MODEL_PARAMS[model_size]["files"])], device[0] if isinstance(device, tuple) else device)
			
 
				-  else:
			
 
				-    weights = load(str(model_path), shard)
			
 
				-  weights = convert_from_huggingface(weights, model, MODEL_PARAMS[model_size]["args"]["n_heads"], MODEL_PARAMS[model_size]["args"]["n_kv_heads"])
			
 
				-  weights = fix_bf16(weights)
			
 
				-
			
 
				-  with Context(BEAM=0):
			
 
				-    # replace weights in model
			
 
				-    load_state_dict(model, weights, strict=False, consume=False)  # consume=True
			
 
				-  return model
			
 
				-
			
 
				-
			
 
				-class TinygradDynamicShardInferenceEngine(InferenceEngine):
			
 
				-  def __init__(self, shard_downloader: ShardDownloader):
			
 
				-    self.shard = None
			
 
				-    self.shard_downloader = shard_downloader
			
 
				-    self.executor = ThreadPoolExecutor(max_workers=1)
			
 
				-
			
 
				-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
			
 
				-    await self.ensure_shard(shard)
			
 
				-    start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
			
 
				-    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
			
 
				-
			
 
				-    toks = await asyncio.get_event_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
			
 
				-    h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor([toks]), start_pos, TEMPERATURE).realize())
			
 
				-
			
 
				-    if h.shape == (1,):
			
 
				-      start_pos += len(toks)
			
 
				-      start_pos += 1
			
 
				-      n_captured_toks = 0
			
 
				-      return np.array([[h.item()]]), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), h.item() == self.tokenizer.eos_token_id
			
 
				-    else:
			
 
				-      n_captured_toks = len(toks)
			
 
				-      return h.numpy(), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), False
			
 
				-
			
 
				-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
			
 
				-    await self.ensure_shard(shard)
			
 
				-    start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
			
 
				-    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
			
 
				-
			
 
				-    h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), start_pos, TEMPERATURE).realize())
			
 
				-
			
 
				-    if h.shape == (1,):
			
 
				-      start_pos += n_captured_toks
			
 
				-      start_pos += 1
			
 
				-      n_captured_toks = 0
			
 
				-      return np.array([[h.item()]]), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), h.item() == self.tokenizer.eos_token_id
			
 
				-    else:
			
 
				-      return h.numpy(), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), False
			
 
				-
			
 
				-  async def ensure_shard(self, shard: Shard):
			
 
				-    if self.shard == shard:
			
 
				-      return
			
 
				-
			
 
				-    model_path = await self.shard_downloader.ensure_shard(shard)
			
 
				-
			
 
				-    if self.shard != shard:
			
 
				-      self.model = await asyncio.get_event_loop().run_in_executor(self.executor, build_transformer, model_path, shard, "8B" if "8b" in shard.model_id.lower() else "70B")
			
 
				-
			
 
				-      tokenizer_path = str((model_path if model_path.is_dir() else model_path.parent))
			
 
				-      self.tokenizer = await resolve_tokenizer(tokenizer_path)
			
 
				-      self.shard = shard
			
--- a/exo/inference/tinygrad/models/__init__.py
+++ b/exo/inference/tinygrad/models/__init__.py
--- a/exo/inference/tinygrad/models/llama.py
+++ b/exo/inference/tinygrad/models/llama.py
@@ -1,257 +0,0 @@
 
				-from typing import Tuple, Union, Optional, Dict, Any
			
 
				-from tinygrad import Tensor, Variable, TinyJit, dtypes, nn, Device
			
 
				-from tinygrad.helpers import getenv
			
 
				-
			
 
				-
			
 
				-# https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/llama/model.py#L47
			
 
				-def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, dtype=dtypes.half) -> Tensor:
			
 
				-  freqs = 1.0/(theta**(Tensor.arange(0, dim, 2)[:(dim // 2)]/dim))
			
 
				-  freqs = Tensor.arange(end).unsqueeze(dim=1)*freqs.unsqueeze(dim=0)
			
 
				-  # TODO: move dtype outside this
			
 
				-  return Tensor.stack(freqs.cos().cast(dtype), freqs.sin().cast(dtype), dim=-1).reshape(1, end, 1, dim // 2, 2)
			
 
				-
			
 
				-
			
 
				-# (a+i*b) * (c+i*d) = (ac-bd) + i*(ad+bc)
			
 
				-def complex_mult(A, c, d):
			
 
				-  a, b = A[..., 0:1], A[..., 1:2]
			
 
				-  ro = a*c - b*d
			
 
				-  co = a*d + b*c
			
 
				-  return ro.cat(co, dim=-1)
			
 
				-
			
 
				-
			
 
				-def apply_rotary_emb(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> Tuple[Tensor, Tensor]:
			
 
				-  assert freqs_cis.shape[1] == xq.shape[1] == xk.shape[1], f"freqs_cis shape mismatch {freqs_cis.shape} xq:{xq.shape} xk:{xk.shape}"
			
 
				-  xq = xq.reshape(*xq.shape[0:-1], -1, 2)
			
 
				-  xk = xk.reshape(*xk.shape[0:-1], -1, 2)
			
 
				-  assert len(xq.shape) == len(xk.shape) == len(freqs_cis.shape) == 5
			
 
				-  c, d = freqs_cis[..., 0:1], freqs_cis[..., 1:2]
			
 
				-  xq_out = complex_mult(xq, c, d)
			
 
				-  xk_out = complex_mult(xk, c, d)
			
 
				-  return xq_out.flatten(3), xk_out.flatten(3)
			
 
				-
			
 
				-
			
 
				-def repeat_kv(x: Tensor, n_rep: int) -> Tensor:
			
 
				-  bs, seqlen, n_kv_heads, head_dim = x.shape
			
 
				-  if n_rep == 1: return x
			
 
				-  # NOTE: this is different from x.repeat((1, 1, n_rep, 1))
			
 
				-  return x.repeat((1, 1, 1, n_rep)).reshape(bs, seqlen, n_kv_heads*n_rep, head_dim)
			
 
				-
			
 
				-
			
 
				-class Attention:
			
 
				-  def __init__(self, dim, n_heads, n_kv_heads, max_context, linear=nn.Linear):
			
 
				-    self.n_heads = n_heads
			
 
				-    self.n_kv_heads = n_kv_heads if n_kv_heads is not None else n_heads  # n_kv_heads != n_heads implies MQA [arxiv/2307.09288, A.2.1]
			
 
				-    self.head_dim = dim // n_heads
			
 
				-    self.n_rep = self.n_heads // self.n_kv_heads
			
 
				-    self.max_context = max_context
			
 
				-
			
 
				-    self.wq = linear(dim, self.n_heads*self.head_dim, bias=False)
			
 
				-    self.wk = linear(dim, self.n_kv_heads*self.head_dim, bias=False)
			
 
				-    self.wv = linear(dim, self.n_kv_heads*self.head_dim, bias=False)
			
 
				-    self.wo = linear(self.n_heads*self.head_dim, dim, bias=False)
			
 
				-
			
 
				-  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor]) -> Tensor:
			
 
				-    if getenv("WQKV"):
			
 
				-      if not hasattr(self, 'wqkv'): self.wqkv = Tensor.cat(self.wq.weight, self.wk.weight, self.wv.weight)
			
 
				-      xqkv = x @ self.wqkv.T
			
 
				-      xq, xk, xv = xqkv.split([self.wq.weight.shape[0], self.wk.weight.shape[0], self.wv.weight.shape[0]], dim=2)
			
 
				-    else:
			
 
				-      xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
			
 
				-
			
 
				-    xq = xq.reshape(xq.shape[0], xq.shape[1], self.n_heads, self.head_dim)
			
 
				-    xk = xk.reshape(xk.shape[0], xk.shape[1], self.n_kv_heads, self.head_dim)
			
 
				-    xv = xv.reshape(xv.shape[0], xv.shape[1], self.n_kv_heads, self.head_dim)
			
 
				-
			
 
				-    xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
			
 
				-    bsz, seqlen, _, _ = xq.shape
			
 
				-
			
 
				-    # create kv cache
			
 
				-    if not hasattr(self, "cache_kv"):
			
 
				-      self.cache_kv = Tensor.zeros(2, bsz, self.max_context, self.n_kv_heads, self.head_dim, dtype=x.dtype).contiguous().realize()
			
 
				-      if isinstance(x.device, tuple):
			
 
				-        # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
			
 
				-        self.cache_kv.shard_((x.device), axis=3 if getenv("SHARD_KVCACHE") else None).realize()
			
 
				-
			
 
				-    # update the cache
			
 
				-    assert xk.dtype == xv.dtype == self.cache_kv.dtype, f"{xk.dtype=}, {xv.dtype=}, {self.cache_kv.dtype=}"
			
 
				-    self.cache_kv.shrink((None, None, (start_pos, start_pos + seqlen), None, None)).assign(Tensor.stack(xk, xv)).realize()
			
 
				-
			
 
				-    keys = self.cache_kv[0].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xk
			
 
				-    values = self.cache_kv[1].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xv
			
 
				-
			
 
				-    keys, values = repeat_kv(keys, self.n_rep), repeat_kv(values, self.n_rep)
			
 
				-    xq, keys, values = xq.transpose(1, 2), keys.transpose(1, 2), values.transpose(1, 2)
			
 
				-    attn = xq.scaled_dot_product_attention(keys, values, mask).transpose(1, 2)
			
 
				-    attn = attn.reshape(bsz, seqlen, -1)
			
 
				-    return self.wo(attn)
			
 
				-
			
 
				-
			
 
				-class FeedForward:
			
 
				-  def __init__(self, dim: int, hidden_dim: int, linear=nn.Linear):
			
 
				-    self.w1 = linear(dim, hidden_dim, bias=False)
			
 
				-    self.w2 = linear(hidden_dim, dim, bias=False)
			
 
				-    self.w3 = linear(dim, hidden_dim, bias=False)  # the gate in Gated Linear Unit
			
 
				-
			
 
				-  def __call__(self, x: Tensor) -> Tensor:
			
 
				-    return self.w2(self.w1(x).silu()*self.w3(x))  # SwiGLU [arxiv/2002.05202, eq (5)]
			
 
				-
			
 
				-
			
 
				-class TransformerBlock:
			
 
				-  def __init__(self, dim: int, hidden_dim: int, n_heads: int, n_kv_heads: int, norm_eps: float, max_context: int, linear=nn.Linear, feed_forward=FeedForward):
			
 
				-    self.attention = Attention(dim, n_heads, n_kv_heads, max_context, linear)
			
 
				-    self.feed_forward = feed_forward(dim, hidden_dim, linear)
			
 
				-    self.attention_norm = nn.RMSNorm(dim, norm_eps)
			
 
				-    self.ffn_norm = nn.RMSNorm(dim, norm_eps)
			
 
				-
			
 
				-  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor]):
			
 
				-    h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
			
 
				-    return (h + self.feed_forward(self.ffn_norm(h))).contiguous()
			
 
				-
			
 
				-
			
 
				-# standard openai sampling
			
 
				-def sample(logits: Tensor, temp: float, k: int, p: float, af: float, ap: float):
			
 
				-  assert logits.ndim == 1, "only works on 1d tensors"
			
 
				-  assert 0 <= p <= 1, "p must be between 0 and 1"
			
 
				-  assert 0 <= k <= logits.numel(), "k must be between 0 and numel"
			
 
				-
			
 
				-  # if temperature is very low just use argmax
			
 
				-  if temp < 1e-6: return logits.argmax().reshape(1)
			
 
				-
			
 
				-  # alpha sampling
			
 
				-  if af or ap:
			
 
				-    if not hasattr(sample, "alpha_counter"):
			
 
				-      setattr(sample, "alpha_counter", Tensor.zeros_like(logits, dtype=dtypes.int32).contiguous())
			
 
				-    logits = logits - (sample.alpha_counter*af + (sample.alpha_counter > 0)*ap)
			
 
				-
			
 
				-  # replace NaNs with -inf
			
 
				-  logits = (logits != logits).where(-float("inf"), logits)
			
 
				-
			
 
				-  # softmax
			
 
				-  t = (logits/temp).softmax()
			
 
				-
			
 
				-  counter, counter2 = Tensor.arange(t.numel(), device=logits.device).contiguous(), Tensor.arange(t.numel() - 1, -1, -1, device=logits.device).contiguous()
			
 
				-  # top k
			
 
				-  if k:
			
 
				-    output, output_indices = Tensor.zeros(k, device=logits.device).contiguous(), Tensor.zeros(k, device=logits.device, dtype=dtypes.int32).contiguous()
			
 
				-    for i in range(k):
			
 
				-      t_argmax = (t.numel() - ((t == (t_max := t.max()))*counter2).max() - 1).cast(dtypes.default_int)
			
 
				-      output = output + t_max.unsqueeze(0).pad(((i, k - i - 1),))
			
 
				-      output_indices = output_indices + t_argmax.unsqueeze(0).pad(((i, k - i - 1),))
			
 
				-      t = (counter == t_argmax).where(0, t)
			
 
				-
			
 
				-    # approximate top p
			
 
				-    # because we are already limited to top k elements we can do top p "without sorting"
			
 
				-    output_cumsum = output[::-1]._cumsum()[::-1] + t.sum()
			
 
				-    output = (output_cumsum >= (1 - p))*output
			
 
				-    output_indices = (output_cumsum >= (1 - p))*output_indices
			
 
				-
			
 
				-    # sample
			
 
				-    output_idx = output.multinomial()
			
 
				-    output_token = output_indices[output_idx]
			
 
				-  else:
			
 
				-    output_token = t.multinomial()
			
 
				-
			
 
				-  # increase alpha counter
			
 
				-  if af or ap:
			
 
				-    sample.alpha_counter = (counter == output_token).where(sample.alpha_counter + 1, sample.alpha_counter)
			
 
				-
			
 
				-  return output_token
			
 
				-
			
 
				-
			
 
				-from exo.inference.shard import Shard
			
 
				-
			
 
				-
			
 
				-class Transformer:
			
 
				-  def __init__(
			
 
				-    self,
			
 
				-    dim: int,
			
 
				-    hidden_dim: int,
			
 
				-    n_heads: int,
			
 
				-    n_layers: int,
			
 
				-    norm_eps: float,
			
 
				-    vocab_size,
			
 
				-    shard: Shard = None,
			
 
				-    linear=nn.Linear,
			
 
				-    n_kv_heads=None,
			
 
				-    rope_theta=10000,
			
 
				-    max_context=1024,
			
 
				-    jit=True,
			
 
				-    feed_forward=FeedForward
			
 
				-  ):
			
 
				-    self.layers = [TransformerBlock(dim, hidden_dim, n_heads, n_kv_heads, norm_eps, max_context, linear, feed_forward=feed_forward) for _ in range(n_layers)]
			
 
				-    self.norm = nn.RMSNorm(dim, norm_eps)
			
 
				-    self.tok_embeddings = nn.Embedding(vocab_size, dim)
			
 
				-    self.output = nn.Linear(dim, vocab_size, bias=False)
			
 
				-    self.max_context = max_context
			
 
				-    self.freqs_cis = precompute_freqs_cis(dim // n_heads, self.max_context*2, rope_theta).contiguous()
			
 
				-    self.forward_jit = TinyJit(self.forward) if jit else None
			
 
				-    self.shard = shard
			
 
				-
			
 
				-  def forward(self, x: Tensor, start_pos: Union[Variable, int], temperature: float, top_k: int, top_p: float, alpha_f: float, alpha_p: float):
			
 
				-    seqlen = x.shape[1]
			
 
				-    freqs_cis = self.freqs_cis.shrink((None, (start_pos, start_pos + seqlen), None, None, None))
			
 
				-    mask = Tensor.full((1, 1, seqlen, start_pos + seqlen), float("-100000000"), dtype=x.dtype, device=x.device).triu(start_pos + 1).realize() if seqlen > 1 else None
			
 
				-
			
 
				-    if self.shard.is_first_layer():
			
 
				-      h = self.tok_embeddings(x)
			
 
				-    else:
			
 
				-      h = x
			
 
				-
			
 
				-    for i in range(self.shard.start_layer, self.shard.end_layer + 1):
			
 
				-      layer = self.layers[i]
			
 
				-      h = layer(h, start_pos, freqs_cis, mask)
			
 
				-
			
 
				-    if self.shard.is_last_layer():
			
 
				-      logits = self.output(self.norm(h)).float()[:, -1, :]
			
 
				-      return sample(logits.flatten(), temperature, top_k, top_p, alpha_f, alpha_p).realize()
			
 
				-    else:
			
 
				-      return h
			
 
				-
			
 
				-  def __call__(self, tokens: Tensor, start_pos: Variable, temperature: float = 0.0, top_k: int = 0, top_p: float = 0.8, alpha_f: float = 0.0, alpha_p: float = 0.0):
			
 
				-    # TODO: better way to handle the first call v.s. the rest?
			
 
				-    if tokens.shape[0:2] == (1, 1) and self.forward_jit is not None:
			
 
				-      return self.forward_jit(tokens, Variable("start_pos", 0, self.max_context).bind(start_pos), temperature, top_k, top_p, alpha_f, alpha_p)
			
 
				-    return self.forward(tokens, start_pos, temperature, top_k, top_p, alpha_f, alpha_p)
			
 
				-
			
 
				-
			
 
				-# *** helpers ***
			
 
				-
			
 
				-
			
 
				-def convert_from_huggingface(weights: Dict[str, Tensor], model: Transformer, n_heads: int, n_kv_heads: int):
			
 
				-  def permute(v: Tensor, n_heads: int):
			
 
				-    return v.reshape(n_heads, 2, v.shape[0] // n_heads // 2, v.shape[1]).transpose(1, 2).reshape(*v.shape[:2])
			
 
				-
			
 
				-  keymap = {
			
 
				-    "model.embed_tokens.weight": "tok_embeddings.weight",
			
 
				-    **{f"model.layers.{l}.input_layernorm.weight": f"layers.{l}.attention_norm.weight"
			
 
				-       for l in range(len(model.layers))},
			
 
				-    **{f"model.layers.{l}.self_attn.{x}_proj.weight": f"layers.{l}.attention.w{x}.weight"
			
 
				-       for x in ["q", "k", "v", "o"]
			
 
				-       for l in range(len(model.layers))},
			
 
				-    **{f"model.layers.{l}.post_attention_layernorm.weight": f"layers.{l}.ffn_norm.weight"
			
 
				-       for l in range(len(model.layers))},
			
 
				-    **{f"model.layers.{l}.mlp.{x}_proj.weight": f"layers.{l}.feed_forward.w{y}.weight"
			
 
				-       for x, y in {"gate": "1", "down": "2", "up": "3"}.items()
			
 
				-       for l in range(len(model.layers))},
			
 
				-    "model.norm.weight": "norm.weight",
			
 
				-    "lm_head.weight": "output.weight",
			
 
				-  }
			
 
				-  sd = {}
			
 
				-  for k, v in weights.items():
			
 
				-    if ".rotary_emb." in k: continue
			
 
				-    v = v.to(Device.DEFAULT)
			
 
				-    if "model.layers" in k:
			
 
				-      if "q_proj" in k:
			
 
				-        v = permute(v, n_heads)
			
 
				-      elif "k_proj" in k:
			
 
				-        v = permute(v, n_kv_heads)
			
 
				-    sd[keymap[k]] = v
			
 
				-  return sd
			
 
				-
			
 
				-
			
 
				-def fix_bf16(weights: Dict[Any, Tensor]):
			
 
				-  if getenv("SUPPORT_BF16", 1):
			
 
				-    # TODO: without casting to float16, 70B llama OOM on tinybox.
			
 
				-    return {k: v.cast(dtypes.float16) if v.dtype == dtypes.bfloat16 else v for k, v in weights.items()}
			
 
				-  # TODO: check if device supports bf16
			
 
				-  return {k: v.llvm_bf16_cast(dtypes.half).to(v.device) if v.dtype == dtypes.bfloat16 else v for k, v in weights.items()}
			
--- a/exo/inference/tinygrad/tinygrad_helpers.py
+++ b/exo/inference/tinygrad/tinygrad_helpers.py
@@ -1,47 +0,0 @@
 
				-from tinygrad.nn.state import safe_load, torch_load
			
 
				-from tinygrad import Tensor
			
 
				-from pathlib import Path
			
 
				-import json
			
 
				-from typing import List
			
 
				-from exo.inference.shard import Shard
			
 
				-from exo.helpers import DEBUG
			
 
				-from exo.download.hf.hf_helpers import get_allow_patterns
			
 
				-from fnmatch import fnmatch
			
 
				-
			
 
				-
			
 
				-# **** helper functions ****
			
 
				-def concat_weights(models, device=None):
			
 
				-  def convert(name) -> Tensor:
			
 
				-    disk_tensors: List[Tensor] = [model[name] for model in models]
			
 
				-    if len(disk_tensors) == 1 or len(disk_tensors[0].shape) == 1:
			
 
				-      return disk_tensors[0].to(device=device)
			
 
				-    axis = 1 if name.endswith(".attention.wo.weight") or name.endswith(".feed_forward.w2.weight") else 0
			
 
				-    lazy_tensors = [data.to(device=device) for data in disk_tensors]
			
 
				-    return lazy_tensors[0].cat(*lazy_tensors[1:], dim=axis)
			
 
				-
			
 
				-  return {name: convert(name) for name in {name: None for model in models for name in model}}
			
 
				-
			
 
				-
			
 
				-def load(fn: str, shard: Shard):
			
 
				-  if fn.endswith('.index.json'):
			
 
				-    with open(fn) as fp:
			
 
				-      weight_map = json.load(fp)['weight_map']
			
 
				-    parts = {}
			
 
				-    filtered_weight_map = {}
			
 
				-    allow_patterns = get_allow_patterns(weight_map, shard)
			
 
				-    for k, n in weight_map.items():
			
 
				-      if allow_patterns is not None and not any(fnmatch(n, r) for r in allow_patterns):
			
 
				-        continue
			
 
				-      if k.startswith("model.layers."):
			
 
				-        layer_num = int(k.split('.')[2])
			
 
				-        if layer_num < shard.start_layer or layer_num > shard.end_layer:
			
 
				-          continue
			
 
				-
			
 
				-      parts[n] = load(str(Path(fn).parent/Path(n).name), shard)
			
 
				-      filtered_weight_map[k] = n
			
 
				-    if DEBUG >= 2: print(f"Excluded model param keys for {shard=}: {sorted(set(weight_map.keys()) - set(filtered_weight_map.keys()))}")
			
 
				-    return {k: parts[n][k] for k, n in filtered_weight_map.items()}
			
 
				-  elif fn.endswith(".safetensors"):
			
 
				-    return safe_load(fn)
			
 
				-  else:
			
 
				-    return torch_load(fn)
			
--- a/exo/inference/tokenizers.py
+++ b/exo/inference/tokenizers.py
@@ -53,7 +53,7 @@ async def _resolve_tokenizer(model_id_or_local_path: Union[str, PathLike]):
 
				     if DEBUG >= 4: print(f"Trying AutoTokenizer for {model_id_or_local_path}")
			
 
				     return AutoTokenizer.from_pretrained(model_id_or_local_path, trust_remote_code=True)
			
 
				   except Exception as e:
			
 
				-    if DEBUG >= 4: print(f"Failed to load tokenizer for {model_id_or_local_path}. Falling back to tinygrad tokenizer. Error: {e}")
			
 
				+    if DEBUG >= 4: print(f"Failed to load tokenizer for {model_id_or_local_path}. Error: {e}")
			
 
				     if DEBUG >= 4: print(traceback.format_exc())
			
 
				 
			
 
				   raise ValueError(f"[TODO] Unsupported model: {model_id_or_local_path}")
			
--- a/exo/main.py
+++ b/exo/main.py
@@ -19,7 +19,7 @@ from exo.download.shard_download import ShardDownloader, RepoProgressEvent, Noop
 
				 from exo.download.hf.hf_shard_download import HFShardDownloader
			
 
				 from exo.helpers import print_yellow_exo, find_available_port, DEBUG, get_system_info, get_or_create_node_id, get_all_ip_addresses, terminal_link
			
 
				 from exo.inference.shard import Shard
			
 
				-from exo.inference.inference_engine import get_inference_engine, InferenceEngine
			
 
				+from exo.inference.inference_engine import InferenceEngine
			
 
				 from exo.inference.dummy_inference_engine import DummyInferenceEngine
			
 
				 from exo.inference.tokenizers import resolve_tokenizer
			
 
				 from exo.orchestration.node import Node
			
@@ -45,14 +45,12 @@ parser.add_argument("--wait-for-peers", type=int, default=0, help="Number of pee
 
				 parser.add_argument("--chatgpt-api-port", type=int, default=8000, help="ChatGPT API port")
			
 
				 parser.add_argument("--chatgpt-api-response-timeout", type=int, default=90, help="ChatGPT API response timeout in seconds")
			
 
				 parser.add_argument("--max-generate-tokens", type=int, default=10000, help="Max tokens to generate in each request")
			
 
				-parser.add_argument("--inference-engine", type=str, default=None, help="Inference engine to use (mlx, tinygrad, or dummy)")
			
 
				 parser.add_argument("--disable-tui", action=argparse.BooleanOptionalAction, help="Disable TUI")
			
 
				 parser.add_argument("--run-model", type=str, help="Specify a model to run directly")
			
 
				 parser.add_argument("--prompt", type=str, help="Prompt for the model when using --run-model", default="Who are you?")
			
 
				 parser.add_argument("--tailscale-api-key", type=str, default=None, help="Tailscale API key")
			
 
				 parser.add_argument("--tailnet-name", type=str, default=None, help="Tailnet name")
			
 
				 args = parser.parse_args()
			
 
				-print(f"Selected inference engine: {args.inference_engine}")
			
 
				 
			
 
				 print_yellow_exo()
			
 
				 
			
@@ -60,12 +58,10 @@ system_info = get_system_info()
 
				 print(f"Detected system: {system_info}")
			
 
				 
			
 
				 shard_downloader: ShardDownloader = HFShardDownloader(quick_check=args.download_quick_check,
			
 
				-                                                      max_parallel_downloads=args.max_parallel_downloads) if args.inference_engine != "dummy" else NoopShardDownloader()
			
 
				-inference_engine_name = args.inference_engine or ("mlx" if system_info == "Apple Silicon Mac" else "tinygrad")
			
 
				-print(f"Inference engine name after selection: {inference_engine_name}")
			
 
				+                                                      max_parallel_downloads=args.max_parallel_downloads)
			
 
				 
			
 
				-inference_engine = get_inference_engine(inference_engine_name, shard_downloader)
			
 
				-print(f"Using inference engine: {inference_engine.__class__.__name__} with shard downloader: {shard_downloader.__class__.__name__}")
			
 
				+from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
			
 
				+inference_engine = MLXDynamicShardInferenceEngine(shard_downloader)
			
 
				 
			
 
				 if args.node_port is None:
			
 
				   args.node_port = find_available_port(args.node_host)
			
--- a/exo/models.py
+++ b/exo/models.py
@@ -6,24 +6,19 @@ model_base_shards = {
 
				   "llama-3.2-3b": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Llama-3.2-3B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=28),},
			
 
				   "llama-3.1-8b": {
			
 
				     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
			
 
				-    "TinygradDynamicShardInferenceEngine": Shard(model_id="mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", start_layer=0, end_layer=0, n_layers=32),
			
 
				   },
			
 
				   "llama-3.1-70b": {
			
 
				     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
			
 
				-    "TinygradDynamicShardInferenceEngine": Shard(model_id="NousResearch/Meta-Llama-3.1-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
			
 
				   },
			
 
				   "llama-3.1-70b-bf16": {
			
 
				     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-70B-Instruct-bf16-CORRECTED", start_layer=0, end_layer=0, n_layers=80),
			
 
				-    "TinygradDynamicShardInferenceEngine": Shard(model_id="NousResearch/Meta-Llama-3.1-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
			
 
				   },
			
 
				   "llama-3.1-405b": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-405B-4bit", start_layer=0, end_layer=0, n_layers=126),},
			
 
				   "llama-3-8b": {
			
 
				     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
			
 
				-    "TinygradDynamicShardInferenceEngine": Shard(model_id="TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R", start_layer=0, end_layer=0, n_layers=32),
			
 
				   },
			
 
				   "llama-3-70b": {
			
 
				     "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
			
 
				-    "TinygradDynamicShardInferenceEngine": Shard(model_id="TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", start_layer=0, end_layer=0, n_layers=80),
			
 
				   },
			
 
				   ### mistral
			
 
				   "mistral-nemo": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Mistral-Nemo-Instruct-2407-4bit", start_layer=0, end_layer=0, n_layers=40),},
			
@@ -44,6 +39,4 @@ model_base_shards = {
 
				   ### nemotron
			
 
				   "nemotron-70b": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF_4bit", start_layer=0, end_layer=0, n_layers=80),},
			
 
				   "nemotron-70b-bf16": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Llama-3.1-Nemotron-70B-Instruct-HF-bf16", start_layer=0, end_layer=0, n_layers=80),},
			
 
				-  # dummy
			
 
				-  "dummy": {"DummyInferenceEngine": Shard(model_id="dummy", start_layer=0, end_layer=7, n_layers=8),},
			
 
				 }
			
--- a/exo/orchestration/node.py
+++ b/exo/orchestration/node.py
@@ -14,7 +14,7 @@ from exo import DEBUG
 
				 from exo.helpers import AsyncCallbackSystem
			
 
				 from exo.viz.topology_viz import TopologyViz
			
 
				 from exo.download.hf.hf_helpers import RepoProgressEvent
			
 
				-from exo.inference.inference_engine import get_inference_engine, InferenceEngine
			
 
				+from exo.inference.inference_engine import InferenceEngine
			
 
				 from exo.download.hf.hf_shard_download import HFShardDownloader
			
 
				 
			
 
				 
			
@@ -63,10 +63,6 @@ class Node:
 
				   def on_node_status(self, request_id, opaque_status):
			
 
				     try:
			
 
				       status_data = json.loads(opaque_status)
			
 
				-      if status_data.get("type", "") == "supported_inference_engines":
			
 
				-        node_id = status_data.get("node_id")
			
 
				-        engines = status_data.get("engines", [])
			
 
				-        self.topology_inference_engines_pool.append(engines)
			
 
				       if status_data.get("type", "") == "node_status":
			
 
				         if status_data.get("status", "").startswith("start_"):
			
 
				           self.current_topology.active_node_id = status_data.get("node_id")
			
@@ -84,22 +80,6 @@ class Node:
 
				       if DEBUG >= 1: print(f"Error updating visualization: {e}")
			
 
				       if DEBUG >= 1: traceback.print_exc()
			
 
				 
			
 
				-  def get_supported_inference_engines(self):
			
 
				-    supported_engine_names = []
			
 
				-    if self.inference_engine.__class__.__name__ == 'MLXDynamicShardInferenceEngine':
			
 
				-      supported_engine_names.append('mlx')
			
 
				-      supported_engine_names.append('tinygrad')
			
 
				-    else:
			
 
				-      supported_engine_names.append('tinygrad')
			
 
				-    return supported_engine_names
			
 
				-
			
 
				-  async def broadcast_supported_engines(self, supported_engines_names: List[str]):
			
 
				-    status_message = json.dumps({"type": "supported_inference_engines", "node_id": self.id, "engines": supported_engines_names})
			
 
				-    await self.broadcast_opaque_status("", status_message)
			
 
				-
			
 
				-  def get_topology_inference_engines(self) -> List[List[str]]:
			
 
				-    return self.topology_inference_engines_pool
			
 
				-
			
 
				   async def process_prompt(self, base_shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
			
 
				     shard = self.get_current_shard(base_shard)
			
 
				     asyncio.create_task(
			
@@ -352,17 +332,6 @@ class Node:
 
				     self.peers = next_peers
			
 
				     return len(peers_added) > 0 or len(peers_removed) > 0 or len(peers_updated) > 0
			
 
				 
			
 
				-  async def select_best_inference_engine(self):
			
 
				-    supported_engines = self.get_supported_inference_engines()
			
 
				-    await self.broadcast_supported_engines(supported_engines)
			
 
				-    if len(self.get_topology_inference_engines()):
			
 
				-      if any(len(engines) == 1 and "tinygrad" in engines for engines in self.get_topology_inference_engines()):
			
 
				-        if DEBUG >= 1: print("Found node with only tinygrad, using tinygrad on all nodes")
			
 
				-        self.inference_engine = get_inference_engine("tinygrad", self.shard_downloader)
			
 
				-      else:
			
 
				-        if DEBUG >= 1: print("All nodes can use mlx, using mlx for inference")
			
 
				-        self.inference_engine = get_inference_engine("mlx", self.shard_downloader)
			
 
				-
			
 
				   async def periodic_topology_collection(self, interval: int):
			
 
				     while True:
			
 
				       await asyncio.sleep(interval)
			
@@ -371,7 +340,6 @@ class Node:
 
				         if DEBUG >= 2: print(f"{did_peers_change=}")
			
 
				         if did_peers_change:
			
 
				           await self.collect_topology()
			
 
				-          await self.select_best_inference_engine()
			
 
				       except Exception as e:
			
 
				         print(f"Error collecting topology: {e}")
			
 
				         traceback.print_exc()
			
--- a/exo/topology/device_capabilities.py
+++ b/exo/topology/device_capabilities.py
@@ -142,8 +142,6 @@ CHIP_FLOPS.update({f"{key} Laptop GPU": value for key, value in CHIP_FLOPS.items
 
				 def device_capabilities() -> DeviceCapabilities:
			
 
				   if psutil.MACOS:
			
 
				     return mac_device_capabilities()
			
 
				-  elif psutil.LINUX:
			
 
				-    return linux_device_capabilities()
			
 
				   else:
			
 
				     return DeviceCapabilities(
			
 
				       model="Unknown Device",
			
@@ -171,42 +169,3 @@ def mac_device_capabilities() -> DeviceCapabilities:
 
				 
			
 
				   # Assuming static values for other attributes for demonstration
			
 
				   return DeviceCapabilities(model=model_id, chip=chip_id, memory=memory, flops=CHIP_FLOPS.get(chip_id, DeviceFlops(fp32=0, fp16=0, int8=0)))
			
 
				-
			
 
				-
			
 
				-def linux_device_capabilities() -> DeviceCapabilities:
			
 
				-  import psutil
			
 
				-  from tinygrad import Device
			
 
				-
			
 
				-  if DEBUG >= 2: print(f"tinygrad {Device.DEFAULT=}")
			
 
				-  if Device.DEFAULT == "CUDA" or Device.DEFAULT == "NV" or Device.DEFAULT == "GPU":
			
 
				-    import pynvml
			
 
				-
			
 
				-    pynvml.nvmlInit()
			
 
				-    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
			
 
				-    gpu_raw_name = pynvml.nvmlDeviceGetName(handle).upper()
			
 
				-    gpu_name = gpu_raw_name.rsplit(" ", 1)[0] if gpu_raw_name.endswith("GB") else gpu_raw_name
			
 
				-    gpu_memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
			
 
				-
			
 
				-    if DEBUG >= 2: print(f"NVIDIA device {gpu_name=} {gpu_memory_info=}")
			
 
				-
			
 
				-    return DeviceCapabilities(
			
 
				-      model=f"Linux Box ({gpu_name})",
			
 
				-      chip=gpu_name,
			
 
				-      memory=gpu_memory_info.total // 2**20,
			
 
				-      flops=CHIP_FLOPS.get(gpu_name, DeviceFlops(fp32=0, fp16=0, int8=0)),
			
 
				-    )
			
 
				-  elif Device.DEFAULT == "AMD":
			
 
				-    # TODO AMD support
			
 
				-    return DeviceCapabilities(
			
 
				-      model="Linux Box (AMD)",
			
 
				-      chip="Unknown AMD",
			
 
				-      memory=psutil.virtual_memory().total // 2**20,
			
 
				-      flops=DeviceFlops(fp32=0, fp16=0, int8=0),
			
 
				-    )
			
 
				-  else:
			
 
				-    return DeviceCapabilities(
			
 
				-      model=f"Linux Box (Device: {Device.DEFAULT})",
			
 
				-      chip=f"Unknown Chip (Device: {Device.DEFAULT})",
			
 
				-      memory=psutil.virtual_memory().total // 2**20,
			
 
				-      flops=DeviceFlops(fp32=0, fp16=0, int8=0),
			
 
				-    )
			
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,6 @@ install_requires = [
 
				   "tqdm==4.66.4",
			
 
				   "transformers==4.43.3",
			
 
				   "uuid==1.30",
			
 
				-  "tinygrad @ git+https://github.com/tinygrad/tinygrad.git@232edcfd4f8b388807c64fb1817a7668ce27cbad",
			
 
				 ]
			
 
				 
			
 
				 extras_require = {