Alex Cheema 9 months ago
parent
commit
46d618abed

+ 1 - 1
exo/api/chatgpt_api.py

@@ -175,7 +175,7 @@ class ChatGPTAPI:
             if DEBUG >= 2:
                 import traceback
                 traceback.print_exc()
-            return web.json_response({'detail': f"Error processing prompt (see logs): {str(e)}"}, status=500)
+            return web.json_response({'detail': f"Error processing prompt (see logs with DEBUG>=2): {str(e)}"}, status=500)
 
         try:
             if DEBUG >= 2: print(f"Waiting for response to finish. timeout={self.response_timeout_secs}s")

+ 43 - 0
exo/inference/debug_inference_engine.py

@@ -0,0 +1,43 @@
+from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
+from exo.inference.inference_engine import InferenceEngine
+from exo.inference.shard import Shard
+from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
+import asyncio
+import numpy as np
+
+# An inference engine should work the same for any number of Shards, as long as the Shards are continuous.
+async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str):
+    from exo.inference.tinygrad.inference import Tokenizer
+    from pathlib import Path
+    _tokenizer = Tokenizer(str(Path(model_id) / "tokenizer.model"))
+
+    prompt = "In a single word only, what is the last name of the president of the United States? "
+    resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+    next_resp_full, next_inference_state_full, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), input_data=resp_full, inference_state=inference_state_full)
+
+    await inference_engine_1.reset_shard(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32))
+    resp1, inference_state_1, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
+
+    await inference_engine_2.reset_shard(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32))
+    resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp1, inference_state=inference_state_1)
+
+    # don't reset the second time
+    resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), input_data=resp2, inference_state=inference_state_2)
+    resp4, inference_state_4, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp3, inference_state=inference_state_3)
+
+    print(f"{resp2=}")
+    print(f"full: {_tokenizer.decode(resp_full)}")
+    print(f"next full: {_tokenizer.decode(next_resp_full)}")
+    print(f"resp2: {_tokenizer.decode(resp2)}")
+    print(f"{resp4=}")
+    print(f"resp4: {_tokenizer.decode(resp4)}")
+
+    assert np.array_equal(resp_full, resp2)
+    assert np.array_equal(next_resp_full, resp4)
+
+
+asyncio.run(test_inference_engine(
+    TinygradDynamicShardInferenceEngine(),
+    TinygradDynamicShardInferenceEngine(),
+    "/Users/alex/Library/Caches/tinygrad/downloads/llama3-8b-sfr",
+))
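
The new test above relies on the stated invariant: an inference engine should produce identical output for any split of the model, as long as the shards form a continuous cover of the layers (here 0-30 followed by 31, out of 32). As an illustration only, a minimal sketch of what "continuous" means for the Shard used in this commit; the shards_are_continuous helper is hypothetical and not part of exo, while the Shard keyword arguments match the test above.

from exo.inference.shard import Shard

# Hypothetical helper, not part of this commit: checks that a set of shards
# covers layers 0..n_layers-1 with no gaps and no overlaps.
def shards_are_continuous(shards):
    ordered = sorted(shards, key=lambda s: s.start_layer)
    if ordered[0].start_layer != 0:
        return False
    for prev, nxt in zip(ordered, ordered[1:]):
        if nxt.start_layer != prev.end_layer + 1:
            return False
    return ordered[-1].end_layer == ordered[-1].n_layers - 1

# The split exercised by the test: layers 0-30 on engine 1, layer 31 on engine 2.
assert shards_are_continuous([
    Shard(model_id="llama3-8b-sfr", start_layer=0, end_layer=30, n_layers=32),
    Shard(model_id="llama3-8b-sfr", start_layer=31, end_layer=31, n_layers=32),
])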

+ 16 - 9
exo/inference/test_inference_engine.py

@@ -2,29 +2,36 @@ from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceE
 from exo.inference.inference_engine import InferenceEngine
 from exo.inference.shard import Shard
 from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
+import asyncio
 import numpy as np
 
 # An inference engine should work the same for any number of Shards, as long as the Shards are continuous.
-async def test_inference_engine(inference_engine: InferenceEngine, model_id: str):
+async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str):
     prompt = "In a single word only, what is the capital of Japan? "
-    resp_full, inference_state_full, _ = await inference_engine.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+    resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+    next_resp_full, next_inference_state_full, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), input_data=resp_full, inference_state=inference_state_full)
 
-    await inference_engine.reset_shard(shard=Shard(model_id=model_id, start_layer=0, end_layer=10, n_layers=32))
-    resp1, inference_state, _ = await inference_engine.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=10, n_layers=32), prompt=prompt)
+    await inference_engine_1.reset_shard(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32))
+    resp1, inference_state_1, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
 
-    await inference_engine.reset_shard(shard=Shard(model_id=model_id, start_layer=11, end_layer=31, n_layers=32))
-    resp2, _, _ = await inference_engine.infer_tensor(shard=Shard(model_id=model_id, start_layer=11, end_layer=31, n_layers=32), input_data=resp1, inference_state=inference_state)
+    await inference_engine_2.reset_shard(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32))
+    resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp1, inference_state=inference_state_1)
 
-    assert np.array_equal(resp_full, resp2)
+    # don't reset the second time
+    resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), input_data=resp2, inference_state=inference_state_2)
+    resp4, inference_state_4, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp3, inference_state=inference_state_3)
 
-import asyncio
+    assert np.array_equal(resp_full, resp2)
+    assert np.array_equal(next_resp_full, resp4)
 
 asyncio.run(test_inference_engine(
+    MLXDynamicShardInferenceEngine(),
     MLXDynamicShardInferenceEngine(),
     "mlx-community/Meta-Llama-3-8B-Instruct-4bit",
 ))
 
 asyncio.run(test_inference_engine(
+    TinygradDynamicShardInferenceEngine(),
     TinygradDynamicShardInferenceEngine(),
     "/Users/alex/Library/Caches/tinygrad/downloads/llama3-8b-sfr",
-))
+))

+ 6 - 16
exo/inference/tinygrad/inference.py

@@ -124,19 +124,7 @@ TOP_P = 0.9
 ALPHA_F = 0.1
 ALPHA_P = 0.0
 
-last_seen_toks = []
 def prefill(model, toks, start_pos=0):
-  global last_seen_toks
-
-  # we can skip part of the prompt if it is the same as last and start_pos=0
-  if start_pos == 0:
-    for i, (a, b) in enumerate(zip(toks, last_seen_toks)):
-      if a != b: break
-    else: i = min(len(toks), len(last_seen_toks))
-    start_pos += i
-    last_seen_toks = toks
-    toks = toks[i:]
-
   # prefill the model
   for tok in tqdm(toks):
     GlobalCounters.reset()
@@ -155,9 +143,10 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
             return encode_role(role) + self.tokenizer.encode(content.strip()) + [self.tokenizer.special_tokens["<|eot_id|>"]]
 
         await self.ensure_shard(shard)
+        start_pos = json.loads(inference_state)["start_pos"] if inference_state else 0
 
         toks = [self.tokenizer.bos_id] + encode_message("user", prompt) + encode_role("assistant")
-        start_pos = prefill(self.model, toks[:-1])
+        start_pos = prefill(self.model, toks[:-1], start_pos=start_pos)
         last_tok = toks[-1]
 
         output_data = np.array([self.model(Tensor([[last_tok]]), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P).tolist()])
@@ -186,15 +175,16 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
             return
 
         model_path = Path(shard.model_id)
-        models_dir = Path(_cache_dir) / "downloads"
+        models_dir = Path(_cache_dir) / "tinygrad" / "downloads"
         model_path = models_dir / shard.model_id
+        size = "8B"
         if model_path.exists():
             model = model_path
         else:
             from tinygrad.helpers import fetch
 
             if DEBUG >= 2: print(f"Downloading tinygrad model {shard.model_id}...")
-            if shard.model_id == "llama3-8b-sfr":
+            if shard.model_id.lower().find("llama3-8b-sfr") != -1:
                 fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model", "tokenizer.model", subdir=shard.model_id)
                 fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/resolve/main/model-00001-of-00004.safetensors", "model-00001-of-00004.safetensors", subdir=shard.model_id)
                 fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/resolve/main/model-00002-of-00004.safetensors", "model-00002-of-00004.safetensors", subdir=shard.model_id)
@@ -202,7 +192,7 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
                 fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/resolve/main/model-00004-of-00004.safetensors", "model-00004-of-00004.safetensors", subdir=shard.model_id)
                 model = fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/raw/main/model.safetensors.index.json", "model.safetensors.index.json", subdir=shard.model_id)
                 size = "8B"
-            elif shard.model_id == "llama3-70b-sfr":
+            elif shard.model_id.lower().find("llama3-70b-sfr") != -1:
                 raise NotImplementedError("llama3-70b-sfr is not implemented for tinygrad")
                 # fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-70B/resolve/main/original/tokenizer.model", "tokenizer.model", subdir=shard.model_id)
                 # fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R/resolve/main/model-00001-of-00004.safetensors", "model-00001-of-00004.safetensors", subdir=shard.model_id)

+ 2 - 0
exo/inference/tinygrad/models/llama.py

@@ -165,6 +165,8 @@ class Transformer:
 
     for i, layer in enumerate(self.layers):
       h = layer(h, start_pos, freqs_cis, mask)
+      # if i == 0 or i == len(self.layers) - 1:
+      #   print(f"layer {i}: {str(h.numpy())[:60]}")
 
     if self.shard.is_last_layer():
         logits = self.output(self.norm(h)).float()[:, -1, :]

+ 1 - 1
exo/orchestration/standard_node.py

@@ -91,8 +91,8 @@ class StandardNode(Node):
 
             return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
         except Exception as e:
-            import traceback
             print(f"Error processing tensor for shard {shard}: {e}")
+            import traceback
             traceback.print_exc()
             return None
 

+ 1 - 0
setup.py

@@ -5,6 +5,7 @@ import sys
 install_requires = [
     "aiohttp==3.9.5",
     "aiohttp_cors==0.7.0",
+    "blobfile==2.1.1",
     "grpcio==1.64.1",
     "grpcio-tools==1.64.1",
     "huggingface-hub==0.23.4",