Alex Cheema 9 months ago
parent
commit
46d618abed

+ 1 - 1
exo/api/chatgpt_api.py

@@ -175,7 +175,7 @@ class ChatGPTAPI:
             if DEBUG >= 2:
                 import traceback
                 traceback.print_exc()
-            return web.json_response({'detail': f"Error processing prompt (see logs): {str(e)}"}, status=500)
+            return web.json_response({'detail': f"Error processing prompt (see logs with DEBUG>=2): {str(e)}"}, status=500)
 
         try:
             if DEBUG >= 2: print(f"Waiting for response to finish. timeout={self.response_timeout_secs}s")

+ 43 - 0
exo/inference/debug_inference_engine.py

@@ -0,0 +1,43 @@
+from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine
+from exo.inference.inference_engine import InferenceEngine
+from exo.inference.shard import Shard
+from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
+import asyncio
+import numpy as np
+
+# An inference engine should work the same for any number of Shards, as long as the Shards are continuous.
+async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str):
+    from exo.inference.tinygrad.inference import Tokenizer
+    from pathlib import Path
+    _tokenizer = Tokenizer(str(Path(model_id) / "tokenizer.model"))
+
+    prompt = "In a single word only, what is the last name of the president of the United States? "
+    resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+    next_resp_full, next_inference_state_full, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), input_data=resp_full, inference_state=inference_state_full)
+
+    await inference_engine_1.reset_shard(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32))
+    resp1, inference_state_1, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
+
+    await inference_engine_2.reset_shard(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32))
+    resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp1, inference_state=inference_state_1)
+
+    # don't reset the second time
+    resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), input_data=resp2, inference_state=inference_state_2)
+    resp4, inference_state_4, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp3, inference_state=inference_state_3)
+
+    print(f"{resp2=}")
+    print(f"full: {_tokenizer.decode(resp_full)}")
+    print(f"next full: {_tokenizer.decode(next_resp_full)}")
+    print(f"resp2: {_tokenizer.decode(resp2)}")
+    print(f"{resp4=}")
+    print(f"resp4: {_tokenizer.decode(resp4)}")
+
+    assert np.array_equal(resp_full, resp2)
+    assert np.array_equal(next_resp_full, resp4)
+
+
+asyncio.run(test_inference_engine(
+    TinygradDynamicShardInferenceEngine(),
+    TinygradDynamicShardInferenceEngine(),
+    "/Users/alex/Library/Caches/tinygrad/downloads/llama3-8b-sfr",
+))
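
The new test above relies on the stated invariant: an inference engine should produce identical output for any split of the model, as long as the shards form a continuous cover of the layers (here 0-30 followed by 31, out of 32). As an illustration only, a minimal sketch of what "continuous" means for the Shard used in this commit; the shards_are_continuous helper is hypothetical and not part of exo, while the Shard keyword arguments match the test above.

from exo.inference.shard import Shard

# Hypothetical helper, not part of this commit: checks that a set of shards
# covers layers 0..n_layers-1 with no gaps and no overlaps.
def shards_are_continuous(shards):
    ordered = sorted(shards, key=lambda s: s.start_layer)
    if ordered[0].start_layer != 0:
        return False
    for prev, nxt in zip(ordered, ordered[1:]):
        if nxt.start_layer != prev.end_layer + 1:
            return False
    return ordered[-1].end_layer == ordered[-1].n_layers - 1

# The split exercised by the test: layers 0-30 on engine 1, layer 31 on engine 2.
assert shards_are_continuous([
    Shard(model_id="llama3-8b-sfr", start_layer=0, end_layer=30, n_layers=32),
    Shard(model_id="llama3-8b-sfr", start_layer=31, end_layer=31, n_layers=32),
])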

+ 16 - 9
exo/inference/test_inference_engine.py

@@ -2,29 +2,36 @@ from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceE
 from exo.inference.inference_engine import InferenceEngine
 from exo.inference.shard import Shard
 from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
+import asyncio
 import numpy as np
 
 # An inference engine should work the same for any number of Shards, as long as the Shards are continuous.
-async def test_inference_engine(inference_engine: InferenceEngine, model_id: str):
+async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str):
     prompt = "In a single word only, what is the capital of Japan? "
-    resp_full, inference_state_full, _ = await inference_engine.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+    resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+    next_resp_full, next_inference_state_full, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), input_data=resp_full, inference_state=inference_state_full)
 
-    await inference_engine.reset_shard(shard=Shard(model_id=model_id, start_layer=0, end_layer=10, n_layers=32))
-    resp1, inference_state, _ = await inference_engine.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=10, n_layers=32), prompt=prompt)
+    await inference_engine_1.reset_shard(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32))
+    resp1, inference_state_1, _ = await inference_engine_1.infer_prompt(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
 
-    await inference_engine.reset_shard(shard=Shard(model_id=model_id, start_layer=11, end_layer=31, n_layers=32))
-    resp2, _, _ = await inference_engine.infer_tensor(shard=Shard(model_id=model_id, start_layer=11, end_layer=31, n_layers=32), input_data=resp1, inference_state=inference_state)
+    await inference_engine_2.reset_shard(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32))
+    resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp1, inference_state=inference_state_1)
 
-    assert np.array_equal(resp_full, resp2)
+    # don't reset the second time
+    resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), input_data=resp2, inference_state=inference_state_2)
+    resp4, inference_state_4, _ = await inference_engine_2.infer_tensor(shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32), input_data=resp3, inference_state=inference_state_3)
 
-import asyncio
+    assert np.array_equal(resp_full, resp2)
+    assert np.array_equal(next_resp_full, resp4)
 
 asyncio.run(test_inference_engine(
+    MLXDynamicShardInferenceEngine(),
     MLXDynamicShardInferenceEngine(),
     "mlx-community/Meta-Llama-3-8B-Instruct-4bit",
 ))
 
 asyncio.run(test_inference_engine(
+    TinygradDynamicShardInferenceEngine(),
     TinygradDynamicShardInferenceEngine(),
     "/Users/alex/Library/Caches/tinygrad/downloads/llama3-8b-sfr",
-))
+))

+ 6 - 16
exo/inference/tinygrad/inference.py

@@ -124,19 +124,7 @@ TOP_P = 0.9
 ALPHA_F = 0.1
 ALPHA_P = 0.0
 
-last_seen_toks = []
 def prefill(model, toks, start_pos=0):
-  global last_seen_toks
-
-  # we can skip part of the prompt if it is the same as last and start_pos=0
-  if start_pos == 0:
-    for i, (a, b) in enumerate(zip(toks, last_seen_toks)):
-      if a != b: break
-    else: i = min(len(toks), len(last_seen_toks))
-    start_pos += i
-    last_seen_toks = toks
-    toks = toks[i:]
-
   # prefill the model
   for tok in tqdm(toks):
     GlobalCounters.reset()
@@ -155,9 +143,10 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
             return encode_role(role) + self.tokenizer.encode(content.strip()) + [self.tokenizer.special_tokens["<|eot_id|>"]]
 
         await self.ensure_shard(shard)
+        start_pos = json.loads(inference_state)["start_pos"] if inference_state else 0
 
         toks = [self.tokenizer.bos_id] + encode_message("user", prompt) + encode_role("assistant")
-        start_pos = prefill(self.model, toks[:-1])
+        start_pos = prefill(self.model, toks[:-1], start_pos=start_pos)
         last_tok = toks[-1]
 
         output_data = np.array([self.model(Tensor([[last_tok]]), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P).tolist()])
@@ -186,15 +175,16 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
             return
 
         model_path = Path(shard.model_id)
-        models_dir = Path(_cache_dir) / "downloads"
+        models_dir = Path(_cache_dir) / "tinygrad" / "downloads"
         model_path = models_dir / shard.model_id
+        size = "8B"
         if model_path.exists():
             model = model_path
         else:
             from tinygrad.helpers import fetch
 
             if DEBUG >= 2: print(f"Downloading tinygrad model {shard.model_id}...")
-            if shard.model_id == "llama3-8b-sfr":
+            if shard.model_id.lower().find("llama3-8b-sfr") != -1:
                 fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model", "tokenizer.model", subdir=shard.model_id)
                 fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/resolve/main/model-00001-of-00004.safetensors", "model-00001-of-00004.safetensors", subdir=shard.model_id)
                 fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/resolve/main/model-00002-of-00004.safetensors", "model-00002-of-00004.safetensors", subdir=shard.model_id)
@@ -202,7 +192,7 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
                 fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/resolve/main/model-00004-of-00004.safetensors", "model-00004-of-00004.safetensors", subdir=shard.model_id)
                 model = fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R/raw/main/model.safetensors.index.json", "model.safetensors.index.json", subdir=shard.model_id)
                 size = "8B"
-            elif shard.model_id == "llama3-70b-sfr":
+            elif shard.model_id.lower().find("llama3-70b-sfr") != -1:
                 raise NotImplementedError("llama3-70b-sfr is not implemented for tinygrad")
                 # fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-70B/resolve/main/original/tokenizer.model", "tokenizer.model", subdir=shard.model_id)
                 # fetch("https://huggingface.co/TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R/resolve/main/model-00001-of-00004.safetensors", "model-00001-of-00004.safetensors", subdir=shard.model_id)

+ 2 - 0
exo/inference/tinygrad/models/llama.py

@@ -165,6 +165,8 @@ class Transformer:
 
     for i, layer in enumerate(self.layers):
       h = layer(h, start_pos, freqs_cis, mask)
+      # if i == 0 or i == len(self.layers) - 1:
+      #   print(f"layer {i}: {str(h.numpy())[:60]}")
 
     if self.shard.is_last_layer():
         logits = self.output(self.norm(h)).float()[:, -1, :]

+ 1 - 1
exo/orchestration/standard_node.py

@@ -91,8 +91,8 @@ class StandardNode(Node):
 
             return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
         except Exception as e:
-            import traceback
             print(f"Error processing tensor for shard {shard}: {e}")
+            import traceback
             traceback.print_exc()
             return None
 

+ 1 - 0
setup.py

@@ -5,6 +5,7 @@ import sys
 install_requires = [
     "aiohttp==3.9.5",
     "aiohttp_cors==0.7.0",
+    "blobfile==2.1.1",
     "grpcio==1.64.1",
     "grpcio-tools==1.64.1",
     "huggingface-hub==0.23.4",