
get rid of dummy inference

Alex Cheema 6 months ago
parent
commit
916b906de8

+ 0 - 25
.circleci/config.yml

@@ -184,30 +184,6 @@ jobs:
           prompt: "Keep responses concise. Who was the king of pop?"
           prompt: "Keep responses concise. Who was the king of pop?"
           expected_output: "Michael Jackson"
           expected_output: "Michael Jackson"
 
 
-  chatgpt_api_integration_test_dummy:
-    macos:
-      xcode: "16.0.0"
-    resource_class: m2pro.large
-    steps:
-      - checkout
-      - run:
-          name: Set up Python
-          command: |
-            brew install python@3.12
-            python3.12 -m venv env
-            source env/bin/activate
-      - run:
-          name: Install dependencies
-          command: |
-            source env/bin/activate
-            pip install --upgrade pip
-            pip install .
-      - run_chatgpt_api_test:
-          inference_engine: dummy
-          model_id: dummy-model
-          prompt: "Dummy prompt."
-          expected_output: "dummy"
-
  test_macos_m1:
    macos:
      xcode: "16.0.0"
@@ -223,5 +199,4 @@ workflows:
      - unit_test
      - discovery_integration_test
      - chatgpt_api_integration_test_mlx
-      - chatgpt_api_integration_test_dummy
      - test_macos_m1

+ 0 - 65
exo/inference/dummy_inference_engine.py

@@ -1,65 +0,0 @@
-from typing import Optional, Tuple, TYPE_CHECKING
-import numpy as np
-import asyncio
-import json
-from exo.inference.inference_engine import InferenceEngine
-from exo.inference.shard import Shard
-
-
-class DummyInferenceEngine(InferenceEngine):
-  def __init__(self):
-    self.shard = None
-    self.vocab_size = 1000
-    self.eos_token_id = 0
-    self.latency_mean = 0.1
-    self.latency_stddev = 0.02
-
-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
-    try:
-      await self.ensure_shard(shard)
-
-      # Generate random tokens
-      output_length = np.random.randint(1, 10)
-      output = np.random.randint(1, self.vocab_size, size=(1, output_length))
-
-      # Simulate latency
-      await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
-
-      # Randomly decide if finished
-      is_finished = np.random.random() < 0.2
-      if is_finished:
-        output = np.array([[self.eos_token_id]])
-
-      new_state = json.dumps({"dummy_state": "some_value"})
-
-      return output, new_state, is_finished
-    except Exception as e:
-      print(f"Error in DummyInferenceEngine.infer_prompt: {str(e)}")
-      return np.array([[self.eos_token_id]]), json.dumps({"error": str(e)}), True
-
-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
-    await self.ensure_shard(shard)
-    state = json.loads(inference_state or "{}")
-    start_pos = state.get("start_pos", 0)
-
-    output_length = np.random.randint(1, 10)
-    output = np.random.randint(1, self.vocab_size, size=(1, output_length))
-
-    await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
-
-    is_finished = np.random.random() < 0.2
-    if is_finished:
-      output = np.array([[self.eos_token_id]])
-
-    start_pos += input_data.shape[1] + output_length
-    new_state = json.dumps({"start_pos": start_pos})
-
-    return output, new_state, is_finished
-
-  async def ensure_shard(self, shard: Shard):
-    if self.shard == shard:
-      return
-    # Simulate shard loading without making any API calls
-    await asyncio.sleep(0.1)  # Simulate a short delay
-    self.shard = shard
-    print(f"DummyInferenceEngine: Simulated loading of shard {shard.model_id}")

+ 0 - 52
exo/inference/mlx/test_sharded_model.py

@@ -1,52 +0,0 @@
-from exo.inference.shard import Shard
-import mlx.core as mx
-import mlx.nn as nn
-from typing import Optional
-import numpy as np
-
-
-class DummyModel(nn.Module):
-  def __init__(self, shard: Optional[Shard] = None):
-    self.shard = shard
-    self.layers = [
-      nn.Linear(8, 128),
-      nn.Linear(128, 128),
-      nn.Linear(128, 128),
-      nn.Linear(128, 128),
-      nn.Linear(128, 8),
-    ]
-
-    self.n_kv_heads = 4
-    self.head_dim = 4
-
-  def __call__(self, x, cache=None):
-    if self.shard:
-      for layer in self.layers[self.shard.start_layer:self.shard.end_layer + 1]:
-        x = layer(x)
-      if self.shard.is_last_layer():
-        x = x.reshape((1, 2, 4))
-    else:
-      for layer in self.layers:
-        x = layer(x)
-      x = x.reshape((1, 2, 4))
-
-    return x
-
-
-model = DummyModel()
-model.save_weights("./test_weights.npz")
-n_layers = 5
-shard1 = Shard("test", 0, n_layers // 2, n_layers)
-sharded_model1 = DummyModel(shard1)
-shard2 = Shard("test", n_layers//2 + 1, n_layers - 1, n_layers)
-sharded_model2 = DummyModel(shard2)
-
-model.load_weights("./test_weights.npz")
-sharded_model1.load_weights("./test_weights.npz")
-sharded_model2.load_weights("./test_weights.npz")
-
-fullresp = model(mx.array([1, 2, 3, 4, 5, 6, 7, 8]))
-resp1 = sharded_model1(mx.array([1, 2, 3, 4, 5, 6, 7, 8]))
-resp2 = sharded_model2(resp1)
-
-assert np.all(np.array(fullresp) == np.array(resp2))

+ 0 - 61
exo/inference/test_dummy_inference_engine.py

@@ -1,61 +0,0 @@
-import pytest
-import json
-import numpy as np
-from exo.inference.dummy_inference_engine import DummyInferenceEngine
-from exo.inference.shard import Shard
-
-
-class MockShardDownloader:
-  async def ensure_shard(self, shard):
-    pass
-
-
-@pytest.mark.asyncio
-async def test_dummy_inference_specific():
-  engine = DummyInferenceEngine(MockShardDownloader())
-  test_shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
-  test_prompt = "This is a test prompt"
-
-  result, state, is_finished = await engine.infer_prompt("test_request", test_shard, test_prompt)
-
-  print(f"Inference result shape: {result.shape}")
-  print(f"Inference state: {state}")
-  print(f"Is finished: {is_finished}")
-
-  assert result.shape[0] == 1, "Result should be a 2D array with first dimension 1"
-  assert isinstance(json.loads(state), dict), "State should be a valid JSON string"
-  assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-
-@pytest.mark.asyncio
-async def test_dummy_inference_engine():
-  # Initialize the DummyInferenceEngine
-  engine = DummyInferenceEngine(MockShardDownloader())
-
-  # Create a test shard
-  shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
-
-  # Test infer_prompt
-  output, state, is_finished = await engine.infer_prompt("test_id", shard, "Test prompt")
-
-  assert isinstance(output, np.ndarray), "Output should be a numpy array"
-  assert output.ndim == 2, "Output should be 2-dimensional"
-  assert isinstance(state, str), "State should be a string"
-  assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-  # Test infer_tensor
-  input_tensor = np.array([[1, 2, 3]])
-  output, state, is_finished = await engine.infer_tensor("test_id", shard, input_tensor)
-
-  assert isinstance(output, np.ndarray), "Output should be a numpy array"
-  assert output.ndim == 2, "Output should be 2-dimensional"
-  assert isinstance(state, str), "State should be a string"
-  assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-  print("All tests passed!")
-
-
-if __name__ == "__main__":
-  import asyncio
-  asyncio.run(test_dummy_inference_engine())
-  asyncio.run(test_dummy_inference_specific())

+ 0 - 13
exo/inference/tokenizers.py

@@ -8,20 +8,7 @@ from exo.download.hf.hf_helpers import get_local_snapshot_dir
 from exo.helpers import DEBUG
 
 
-class DummyTokenizer:
-  def __init__(self):
-    self.eos_token_id = 0
-
-  def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=True):
-    return [1, 2, 3]
-
-  def decode(self, tokens):
-    return "dummy"
-
-
 async def resolve_tokenizer(model_id: str):
-  if model_id == "dummy":
-    return DummyTokenizer()
   local_path = await get_local_snapshot_dir(model_id)
   if DEBUG >= 2: print(f"Checking if local path exists to load tokenizer from local {local_path=}")
   try:

+ 0 - 1
exo/main.py

@@ -20,7 +20,6 @@ from exo.download.hf.hf_shard_download import HFShardDownloader
 from exo.helpers import print_yellow_exo, find_available_port, DEBUG, get_system_info, get_or_create_node_id, get_all_ip_addresses, terminal_link
 from exo.inference.shard import Shard
 from exo.inference.inference_engine import InferenceEngine
-from exo.inference.dummy_inference_engine import DummyInferenceEngine
 from exo.inference.tokenizers import resolve_tokenizer
 from exo.orchestration.node import Node
 from exo.models import model_base_shards

+ 1 - 1
test/test_tokenizers.py

@@ -24,7 +24,7 @@ def test_tokenizer(name, tokenizer, verbose=False):
     strip_tokens = lambda s: s.lstrip(tokenizer.decode([tokenizer.bos_token_id])).rstrip(tokenizer.decode([tokenizer.eos_token_id]))
     assert text == strip_tokens(decoded) == strip_tokens(reconstructed)
 
 
-ignore = ["TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", "llava-hf/llava-1.5-7b-hf", "mlx-community/Qwen*", "dummy"]
+ignore = ["TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", "llava-hf/llava-1.5-7b-hf", "mlx-community/Qwen*"]
 ignore_pattern = re.compile(r"^(" + "|".join(model.replace("*", ".*") for model in ignore) + r")")
 models = [shard.model_id for shards in model_base_shards.values() for shard in shards.values() if not ignore_pattern.match(shard.model_id)]