@@ -22,7 +22,17 @@ TOP_P = 0.9
ALPHA_F = 0.1
ALPHA_P = 0.0
MODEL_PARAMS = {
-  "8B": {"args": {"dim": 4096, "n_heads": 32, "n_kv_heads": 8, "n_layers": 32, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 14336}, "files": 1},
+  "1B": {
+    "args": {
+      "dim": 2048, "n_heads": 32, "n_kv_heads": 8, "n_layers": 16, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 8192,
+      "rope_scaling": {"factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3"}, "tie_word_embeddings": True
+    }, "files": 1
+  }, "3B": {
+    "args": {
+      "dim": 3072, "n_heads": 24, "n_kv_heads": 8, "n_layers": 28, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 8192,
+      "rope_scaling": {"factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3"}, "tie_word_embeddings": True
+    }, "files": 1
+  }, "8B": {"args": {"dim": 4096, "n_heads": 32, "n_kv_heads": 8, "n_layers": 32, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 14336}, "files": 1},
  "70B": {"args": {"dim": 8192, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 28672}, "files": 8}
}
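A note on the new `rope_scaling` blocks: `"rope_type": "llama3"` selects the RoPE frequency rescaling Meta introduced for long-context Llama 3.x checkpoints. As a rough sketch of what those four numbers control (illustrative only; the real implementation lives in the model code this diff builds against):

```python
import math

# Sketch of llama3-style RoPE scaling, using the fields from MODEL_PARAMS above:
# high-frequency components pass through, low-frequency ones are divided by
# `factor`, and the band in between is interpolated smoothly.
def scale_rope_freqs(freqs, factor=32.0, low_freq_factor=1.0,
                     high_freq_factor=4.0, old_context_len=8192):
  low_freq_wavelen = old_context_len / low_freq_factor
  high_freq_wavelen = old_context_len / high_freq_factor
  scaled = []
  for freq in freqs:
    wavelen = 2 * math.pi / freq
    if wavelen < high_freq_wavelen:    # high frequency: keep as-is
      scaled.append(freq)
    elif wavelen > low_freq_wavelen:   # low frequency: divide by factor
      scaled.append(freq / factor)
    else:                              # mid band: interpolate between the two
      smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
      scaled.append((1 - smooth) * freq / factor + smooth * freq)
  return scaled
```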
@@ -55,7 +65,7 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
    self.shard_downloader = shard_downloader
    self.executor = ThreadPoolExecutor(max_workers=1)

-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
+  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> tuple[np.ndarray, str, bool]:
    await self.ensure_shard(shard)
    start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
@@ -72,7 +82,7 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
      n_captured_toks = len(toks)
      return h.numpy(), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), False

-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> tuple[np.ndarray, str, bool]:
    await self.ensure_shard(shard)
    start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
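Why both signature changes: the old `-> (np.ndarray, str, bool)` annotation evaluates to a tuple of type objects rather than a type, so type checkers reject it, while `Tuple` requires a `typing` import that the builtin generic `tuple[...]` (valid on Python 3.9+) avoids. A minimal contrast, assuming Python 3.9+:

```python
import numpy as np

# Invalid as a return type: this annotation is a tuple *value*, not a type.
# def infer(...) -> (np.ndarray, str, bool): ...

# What the diff switches to: the builtin generic, no typing.Tuple import needed.
def infer_stub() -> tuple[np.ndarray, str, bool]:
  return np.zeros((1, 1)), "{}", False
```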
@@ -94,7 +104,8 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
    model_path = await self.shard_downloader.ensure_shard(shard)

    if self.shard != shard:
-      self.model = await asyncio.get_event_loop().run_in_executor(self.executor, build_transformer, model_path, shard, "8B" if "8b" in shard.model_id.lower() else "70B")
+      parameters = "1B" if "1b" in shard.model_id.lower() else "3B" if "3b" in shard.model_id.lower() else "8B" if "8b" in shard.model_id.lower() else "70B"
+      self.model = await asyncio.get_event_loop().run_in_executor(self.executor, build_transformer, model_path, shard, parameters)
      tokenizer_path = str((model_path if model_path.is_dir() else model_path.parent))
      self.tokenizer = await resolve_tokenizer(tokenizer_path)
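The chained conditional that picks `parameters` is a first-match scan over size tags with `"70B"` as the fallback; an equivalent standalone sketch (`pick_parameters` is hypothetical, not part of the patch):

```python
def pick_parameters(model_id: str) -> str:
  # First size tag found in the model id wins; 70B is the fallback,
  # matching the ternary chain in ensure_shard above.
  lowered = model_id.lower()
  for size in ("1b", "3b", "8b"):
    if size in lowered:
      return size.upper()
  return "70B"

assert pick_parameters("Llama-3.2-1B-Instruct") == "1B"
assert pick_parameters("Llama-3.1-70B") == "70B"
```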