
get rid of dummy inference

Alex Cheema 6 months ago
parent
commit
916b906de8

+ 0 - 25
.circleci/config.yml

@@ -184,30 +184,6 @@ jobs:
           prompt: "Keep responses concise. Who was the king of pop?"
           prompt: "Keep responses concise. Who was the king of pop?"
           expected_output: "Michael Jackson"
           expected_output: "Michael Jackson"
 
 
-  chatgpt_api_integration_test_dummy:
-    macos:
-      xcode: "16.0.0"
-    resource_class: m2pro.large
-    steps:
-      - checkout
-      - run:
-          name: Set up Python
-          command: |
-            brew install python@3.12
-            python3.12 -m venv env
-            source env/bin/activate
-      - run:
-          name: Install dependencies
-          command: |
-            source env/bin/activate
-            pip install --upgrade pip
-            pip install .
-      - run_chatgpt_api_test:
-          inference_engine: dummy
-          model_id: dummy-model
-          prompt: "Dummy prompt."
-          expected_output: "dummy"
-
  test_macos_m1:
    macos:
      xcode: "16.0.0"
@@ -223,5 +199,4 @@ workflows:
      - unit_test
      - discovery_integration_test
      - chatgpt_api_integration_test_mlx
-      - chatgpt_api_integration_test_dummy
      - test_macos_m1

+ 0 - 65
exo/inference/dummy_inference_engine.py

@@ -1,65 +0,0 @@
-from typing import Optional, Tuple, TYPE_CHECKING
-import numpy as np
-import asyncio
-import json
-from exo.inference.inference_engine import InferenceEngine
-from exo.inference.shard import Shard
-
-
-class DummyInferenceEngine(InferenceEngine):
-  def __init__(self):
-    self.shard = None
-    self.vocab_size = 1000
-    self.eos_token_id = 0
-    self.latency_mean = 0.1
-    self.latency_stddev = 0.02
-
-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
-    try:
-      await self.ensure_shard(shard)
-
-      # Generate random tokens
-      output_length = np.random.randint(1, 10)
-      output = np.random.randint(1, self.vocab_size, size=(1, output_length))
-
-      # Simulate latency
-      await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
-
-      # Randomly decide if finished
-      is_finished = np.random.random() < 0.2
-      if is_finished:
-        output = np.array([[self.eos_token_id]])
-
-      new_state = json.dumps({"dummy_state": "some_value"})
-
-      return output, new_state, is_finished
-    except Exception as e:
-      print(f"Error in DummyInferenceEngine.infer_prompt: {str(e)}")
-      return np.array([[self.eos_token_id]]), json.dumps({"error": str(e)}), True
-
-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
-    await self.ensure_shard(shard)
-    state = json.loads(inference_state or "{}")
-    start_pos = state.get("start_pos", 0)
-
-    output_length = np.random.randint(1, 10)
-    output = np.random.randint(1, self.vocab_size, size=(1, output_length))
-
-    await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
-
-    is_finished = np.random.random() < 0.2
-    if is_finished:
-      output = np.array([[self.eos_token_id]])
-
-    start_pos += input_data.shape[1] + output_length
-    new_state = json.dumps({"start_pos": start_pos})
-
-    return output, new_state, is_finished
-
-  async def ensure_shard(self, shard: Shard):
-    if self.shard == shard:
-      return
-    # Simulate shard loading without making any API calls
-    await asyncio.sleep(0.1)  # Simulate a short delay
-    self.shard = shard
-    print(f"DummyInferenceEngine: Simulated loading of shard {shard.model_id}")

+ 0 - 52
exo/inference/mlx/test_sharded_model.py

@@ -1,52 +0,0 @@
-from exo.inference.shard import Shard
-import mlx.core as mx
-import mlx.nn as nn
-from typing import Optional
-import numpy as np
-
-
-class DummyModel(nn.Module):
-  def __init__(self, shard: Optional[Shard] = None):
-    self.shard = shard
-    self.layers = [
-      nn.Linear(8, 128),
-      nn.Linear(128, 128),
-      nn.Linear(128, 128),
-      nn.Linear(128, 128),
-      nn.Linear(128, 8),
-    ]
-
-    self.n_kv_heads = 4
-    self.head_dim = 4
-
-  def __call__(self, x, cache=None):
-    if self.shard:
-      for layer in self.layers[self.shard.start_layer:self.shard.end_layer + 1]:
-        x = layer(x)
-      if self.shard.is_last_layer():
-        x = x.reshape((1, 2, 4))
-    else:
-      for layer in self.layers:
-        x = layer(x)
-      x = x.reshape((1, 2, 4))
-
-    return x
-
-
-model = DummyModel()
-model.save_weights("./test_weights.npz")
-n_layers = 5
-shard1 = Shard("test", 0, n_layers // 2, n_layers)
-sharded_model1 = DummyModel(shard1)
-shard2 = Shard("test", n_layers//2 + 1, n_layers - 1, n_layers)
-sharded_model2 = DummyModel(shard2)
-
-model.load_weights("./test_weights.npz")
-sharded_model1.load_weights("./test_weights.npz")
-sharded_model2.load_weights("./test_weights.npz")
-
-fullresp = model(mx.array([1, 2, 3, 4, 5, 6, 7, 8]))
-resp1 = sharded_model1(mx.array([1, 2, 3, 4, 5, 6, 7, 8]))
-resp2 = sharded_model2(resp1)
-
-assert np.all(np.array(fullresp) == np.array(resp2))

+ 0 - 61
exo/inference/test_dummy_inference_engine.py

@@ -1,61 +0,0 @@
-import pytest
-import json
-import numpy as np
-from exo.inference.dummy_inference_engine import DummyInferenceEngine
-from exo.inference.shard import Shard
-
-
-class MockShardDownloader:
-  async def ensure_shard(self, shard):
-    pass
-
-
-@pytest.mark.asyncio
-async def test_dummy_inference_specific():
-  engine = DummyInferenceEngine(MockShardDownloader())
-  test_shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
-  test_prompt = "This is a test prompt"
-
-  result, state, is_finished = await engine.infer_prompt("test_request", test_shard, test_prompt)
-
-  print(f"Inference result shape: {result.shape}")
-  print(f"Inference state: {state}")
-  print(f"Is finished: {is_finished}")
-
-  assert result.shape[0] == 1, "Result should be a 2D array with first dimension 1"
-  assert isinstance(json.loads(state), dict), "State should be a valid JSON string"
-  assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-
-@pytest.mark.asyncio
-async def test_dummy_inference_engine():
-  # Initialize the DummyInferenceEngine
-  engine = DummyInferenceEngine(MockShardDownloader())
-
-  # Create a test shard
-  shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
-
-  # Test infer_prompt
-  output, state, is_finished = await engine.infer_prompt("test_id", shard, "Test prompt")
-
-  assert isinstance(output, np.ndarray), "Output should be a numpy array"
-  assert output.ndim == 2, "Output should be 2-dimensional"
-  assert isinstance(state, str), "State should be a string"
-  assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-  # Test infer_tensor
-  input_tensor = np.array([[1, 2, 3]])
-  output, state, is_finished = await engine.infer_tensor("test_id", shard, input_tensor)
-
-  assert isinstance(output, np.ndarray), "Output should be a numpy array"
-  assert output.ndim == 2, "Output should be 2-dimensional"
-  assert isinstance(state, str), "State should be a string"
-  assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-  print("All tests passed!")
-
-
-if __name__ == "__main__":
-  import asyncio
-  asyncio.run(test_dummy_inference_engine())
-  asyncio.run(test_dummy_inference_specific())

+ 0 - 13
exo/inference/tokenizers.py

@@ -8,20 +8,7 @@ from exo.download.hf.hf_helpers import get_local_snapshot_dir
 from exo.helpers import DEBUG
 
 
-class DummyTokenizer:
-  def __init__(self):
-    self.eos_token_id = 0
-
-  def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=True):
-    return [1, 2, 3]
-
-  def decode(self, tokens):
-    return "dummy"
-
-
 async def resolve_tokenizer(model_id: str):
-  if model_id == "dummy":
-    return DummyTokenizer()
   local_path = await get_local_snapshot_dir(model_id)
   if DEBUG >= 2: print(f"Checking if local path exists to load tokenizer from local {local_path=}")
   try:

+ 0 - 1
exo/main.py

@@ -20,7 +20,6 @@ from exo.download.hf.hf_shard_download import HFShardDownloader
 from exo.helpers import print_yellow_exo, find_available_port, DEBUG, get_system_info, get_or_create_node_id, get_all_ip_addresses, terminal_link
 from exo.inference.shard import Shard
 from exo.inference.inference_engine import InferenceEngine
-from exo.inference.dummy_inference_engine import DummyInferenceEngine
 from exo.inference.tokenizers import resolve_tokenizer
 from exo.orchestration.node import Node
 from exo.models import model_base_shards

+ 1 - 1
test/test_tokenizers.py

@@ -24,7 +24,7 @@ def test_tokenizer(name, tokenizer, verbose=False):
     strip_tokens = lambda s: s.lstrip(tokenizer.decode([tokenizer.bos_token_id])).rstrip(tokenizer.decode([tokenizer.eos_token_id]))
     assert text == strip_tokens(decoded) == strip_tokens(reconstructed)
 
 
-ignore = ["TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", "llava-hf/llava-1.5-7b-hf", "mlx-community/Qwen*", "dummy"]
+ignore = ["TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", "llava-hf/llava-1.5-7b-hf", "mlx-community/Qwen*"]
 ignore_pattern = re.compile(r"^(" + "|".join(model.replace("*", ".*") for model in ignore) + r")")
 models = [shard.model_id for shards in model_base_shards.values() for shard in shards.values() if not ignore_pattern.match(shard.model_id)]