@@ -7,7 +7,6 @@ from exo.inference.tokenizers import resolve_tokenizer
 from tinygrad.nn.state import load_state_dict
 from tinygrad import Tensor, nn, Context
 from exo.inference.inference_engine import InferenceEngine
-from typing import Optional, Tuple
 import numpy as np
 from exo.inference.tinygrad.tinygrad_helpers import concat_weights, load
 from exo.download.shard_download import ShardDownloader
@@ -68,24 +67,21 @@ class TinygradDynamicShardInferenceEngine(InferenceEngine):
   async def sample(self, x: np.ndarray, temp=TEMPERATURE, top_p: float = 0.0) -> np.ndarray:
     logits = x[:, -1, :]
     def sample_wrapper():
-      return sample_logits(Tensor(logits).flatten(), temp, 0, 0.8, top_p, 0.0).realize()
-    out = await asyncio.get_running_loop().run_in_executor(self.executor, sample_wrapper)
-    return out.numpy().astype(int)
+      return sample_logits(Tensor(logits).flatten(), temp, 0, 0.8, top_p, 0.0).realize().numpy().astype(int)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, sample_wrapper)
 
   async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
     await self.ensure_shard(shard)
     tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
-    return np.array(tokens)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, np.array, tokens)
 
   async def decode(self, shard: Shard, tokens) -> str:
     await self.ensure_shard(shard)
-    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
-    return tokens
+    return await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
 
   async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray) -> np.ndarray:
     await self.ensure_shard(shard)
-    output_data = await asyncio.get_running_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), request_id).realize())
-    return output_data.numpy()
+    return await asyncio.get_running_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), request_id).realize().numpy())
 
   async def ensure_shard(self, shard: Shard):
     if self.shard == shard: