|
@@ -65,24 +65,24 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
|
|
|
self.shard_downloader = shard_downloader
|
|
|
self.executor = ThreadPoolExecutor(max_workers=1)
|
|
|
|
|
|
async def sample(self, x: np.ndarray) -> np.ndarray:
    """Sample from the logits of the final sequence position.

    Args:
        x: Logits array. It is indexed as ``x[:, -1, :]``, so it must be
            3-D (presumably (batch, seq, vocab) -- TODO confirm with the
            caller).

    Returns:
        The realized result of ``sample_logits`` converted to a NumPy array.
    """
    # Keep only the last position along axis 1; the other positions must
    # not take part in sampling.
    logits = x[:, -1, :]

    def sample_wrapper():
        # Sample from ``logits``, not from the whole of ``x``: flattening
        # ``x`` mixes every sequence position into one distribution and can
        # yield an index beyond the size of the last axis.
        return sample_logits(Tensor(logits).flatten(), TEMPERATURE, 0, 0.8, 0.0, 0.0).realize()

    # Run the sampling on the engine's executor instead of on the event loop.
    out = await asyncio.get_running_loop().run_in_executor(self.executor, sample_wrapper)
    return out.numpy()
|
|
|
|
|
|
async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
    """Tokenize ``prompt`` and return the tokens as a NumPy array.

    ``ensure_shard(shard)`` is awaited first, then ``self.tokenizer.encode``
    is run on ``self.executor`` and its result is wrapped with ``np.array``.
    """
    await self.ensure_shard(shard)
    loop = asyncio.get_running_loop()
    encoded = await loop.run_in_executor(self.executor, self.tokenizer.encode, prompt)
    return np.array(encoded)
|
|
|
|
|
|
async def decode(self, shard: Shard, tokens) -> str:
    """Turn ``tokens`` back into text with the tokenizer.

    ``ensure_shard(shard)`` is awaited first, then ``self.tokenizer.decode``
    is run on ``self.executor`` and its result is returned unchanged.
    """
    await self.ensure_shard(shard)
    loop = asyncio.get_running_loop()
    decoded = await loop.run_in_executor(self.executor, self.tokenizer.decode, tokens)
    return decoded
|
|
|
|
|
|
- async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> tuple[np.ndarray, str, bool]:
|
|
|
+ async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> np.ndarray:
|
|
|
await self.ensure_shard(shard)
|
|
|
start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
|
|
|
output_data = await asyncio.get_running_loop().run_in_executor(self.executor, self.model, Tensor(input_data), start_pos)
|