1 year ago · dd09c59719
--- a/exo/api/chatgpt_api.py
+++ b/exo/api/chatgpt_api.py
@@ -4,7 +4,7 @@ import asyncio
 
															 import json
														
 
															 from pathlib import Path
														
 
															 from transformers import AutoTokenizer
														
 
															-from typing import List, Literal, Union
														
 
															+from typing import List, Literal, Union, Dict
														
 
															 from aiohttp import web
														
 
															 import aiohttp_cors
														
 
															 from exo import DEBUG, VERSION
														
@@ -122,6 +122,8 @@ class ChatGPTAPI:
 
															         self.inference_engine_classname = inference_engine_classname
														
 
															         self.response_timeout_secs = 90
														
 
															         self.app = web.Application()
														
 
															+        self.prev_token_lens: Dict[str, int] = {}
														
 
															+        self.stream_tasks: Dict[str, asyncio.Task] = {}
														
 
															         cors = aiohttp_cors.setup(self.app)
														
 
															         cors_options = aiohttp_cors.ResourceOptions(
														
 
															             allow_credentials=True,
														
@@ -191,12 +193,9 @@ class ChatGPTAPI:
 
															                 )
														
 
															                 await response.prepare(request)
														
 
															-                stream_task = None
														
 
															-                last_tokens_len = 0
														
 
															                 async def stream_result(request_id: str, tokens: List[int], is_finished: bool):
														
 
															-                    nonlocal last_tokens_len
														
 
															-                    prev_last_tokens_len = last_tokens_len
														
 
															-                    last_tokens_len = len(tokens)
														
 
															+                    prev_last_tokens_len = self.prev_token_lens.get(request_id, 0)
														
 
															+                    self.prev_token_lens[request_id] = max(prev_last_tokens_len, len(tokens))
														
 
															                     new_tokens = tokens[prev_last_tokens_len:]
														
 
															                     finish_reason = None
														
 
															                     eos_token_id = tokenizer.special_tokens_map.get("eos_token_id") if isinstance(tokenizer._tokenizer, AutoTokenizer) else tokenizer.eos_token_id
														
@@ -211,15 +210,14 @@ class ChatGPTAPI:
 
															                     if DEBUG >= 2: print(f"Streaming completion: {completion}")
														
 
															                     await response.write(f"data: {json.dumps(completion)}\n\n".encode())
														
 
															                 def on_result(_request_id: str, tokens: List[int], is_finished: bool):
														
 
															-                    nonlocal stream_task
														
 
															-                    stream_task = asyncio.create_task(stream_result(request_id, tokens, is_finished))
														
 
															+                    self.stream_tasks[request_id] = asyncio.create_task(stream_result(request_id, tokens, is_finished))
														
 
															                     return _request_id == request_id and is_finished
														
 
															                 _, tokens, _ = await callback.wait(on_result, timeout=self.response_timeout_secs)
														
 
															-                if stream_task: # in case there is still a stream task running, wait for it to complete
														
 
															+                if request_id in self.stream_tasks: # in case there is still a stream task running, wait for it to complete
														
 
															                     if DEBUG >= 2: print(f"Pending stream task. Waiting for stream task to complete.")
														
 
															                     try:
														
 
															-                        await asyncio.wait_for(stream_task, timeout=30)
														
 
															+                        await asyncio.wait_for(self.stream_tasks[request_id], timeout=30)
														
 
															                     except asyncio.TimeoutError:
														
 
															                         print("WARNING: Stream task timed out. This should not happen.")
														
 
															                 await response.write_eof()