sharded_inference_engine.py

import numpy as np
import mlx.core as mx
from typing import Optional, Tuple

from exo.download.shard_download import ShardDownloader

from ..inference_engine import InferenceEngine
from ..shard import Shard
from .sharded_model import StatefulShardedModel
from .sharded_utils import load_shard, get_image_from_str


class MLXDynamicShardInferenceEngine(InferenceEngine):
  """Inference engine that downloads and loads a model shard on demand and runs it with MLX."""

  def __init__(self, shard_downloader: ShardDownloader):
    self.shard = None
    self.shard_downloader = shard_downloader

  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
    # Make sure the requested shard is downloaded and loaded before running inference.
    await self.ensure_shard(shard)
    if image_str:
      # Multimodal path: tokenize the prompt together with the decoded image.
      image = await get_image_from_str(image_str)
      inputs = self.tokenizer(prompt, image, return_tensors="np")
      pixel_values = mx.array(inputs["pixel_values"])
      input_ids = mx.array(inputs["input_ids"])
      output_data: np.ndarray = np.array(self.stateful_sharded_model.step(request_id, input_ids, pixel_values))
    else:
      # Text-only path: encode the prompt and run one step of the sharded model.
      output_data: np.ndarray = np.array(self.stateful_sharded_model.step(request_id, mx.array(self.tokenizer.encode(prompt))))
    # The final element flags completion: a single sampled token equal to EOS.
    return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id

  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
    await self.ensure_shard(shard)
    # Continue inference from a tensor handed over by a previous shard or step.
    output_data: np.ndarray = np.array(self.stateful_sharded_model.step(request_id, mx.array(input_data)))
    return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id

  async def ensure_shard(self, shard: Shard):
    # Skip the (potentially expensive) download and load if this shard is already active.
    if self.shard == shard:
      return
    model_path = await self.shard_downloader.ensure_shard(shard)
    model_shard, self.tokenizer = await load_shard(model_path, shard)
    self.stateful_sharded_model = StatefulShardedModel(shard, model_shard)
    self.shard = shard
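
For context, a minimal single-node driver could look like the sketch below. It is an illustration, not part of the original file: the HFShardDownloader import path, the Shard field names, and the model id are assumptions about the surrounding exo codebase, so adjust them to the actual definitions.

# Minimal usage sketch (assumed names marked in comments).
import asyncio

from exo.download.hf.hf_shard_download import HFShardDownloader  # assumed import path
from exo.inference.shard import Shard  # assumed import path
from exo.inference.mlx.sharded_inference_engine import MLXDynamicShardInferenceEngine

async def main():
  engine = MLXDynamicShardInferenceEngine(HFShardDownloader())
  # A shard spanning all layers of a hypothetical 32-layer model, so this one
  # node performs the whole forward pass. Field names are assumptions.
  shard = Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=31, n_layers=32)

  tokens, _, is_finished = await engine.infer_prompt("request-1", shard, "Hello, world!")
  while not is_finished:
    # Feed the sampled token back in until the engine reports EOS.
    tokens, _, is_finished = await engine.infer_tensor("request-1", shard, tokens)

asyncio.run(main())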