7 months ago · 65fdc99ccc
--- a/exo/inference/tinygrad/models/llama.py
+++ b/exo/inference/tinygrad/models/llama.py
@@ -225,7 +225,7 @@ class Transformer:
 
				       h = inputs
			
 
				     return h
			
 
				 
			
 
				-  def __call__(self, tokens: Tensor, start_pos: Variable, request_id: str, cache: Optional[List[Tensor]] = None):
			
 
				+  def __call__(self, tokens: Tensor, start_pos: Variable, cache: Optional[List[Tensor]] = None):
			
 
				     # TODO: better way to handle the first call v.s. the rest?
			
 
				     h = self.embed(x)
			
 
				     if tokens.shape[0:2] == (1, 1) and self.forward_jit is not None: