|
@@ -19,88 +19,76 @@ async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
|
|
|
Returns:
|
|
|
Dict[str, Any]: A dictionary containing performance metrics or error information.
|
|
|
"""
|
|
|
- model = os.environ.get('model')
|
|
|
- results: Dict[str, Any] = {'model': model, 'run_id': os.environ.get('GITHUB_RUN_ID')}
|
|
|
- results['configuration'] = json.loads(os.environ.get('HARDWARE_CONFIG'))
|
|
|
-
|
|
|
- # Get prompt length in tokens
|
|
|
- async with aiohttp.ClientSession() as session:
|
|
|
- try:
|
|
|
- request_payload = {
|
|
|
+ model = os.environ.get('model', 'llama-3.2-1b')
|
|
|
+
|
|
|
+ results = {
|
|
|
+ 'model': model,
|
|
|
+ 'run_id': os.environ.get('GITHUB_RUN_ID', 'unknown'),
|
|
|
+ 'configuration': json.loads(os.environ.get('HARDWARE_CONFIG', '{}'))
|
|
|
+ }
|
|
|
+
|
|
|
+ # Get token count
|
|
|
+ session = aiohttp.ClientSession()
|
|
|
+ try:
|
|
|
+ response = await session.post(
|
|
|
+ "http://localhost:52415/v1/chat/token/encode",
|
|
|
+ json={
|
|
|
"model": model,
|
|
|
"messages": [{"role": "user", "content": prompt}]
|
|
|
}
|
|
|
- async with session.post(
|
|
|
- "http://localhost:52415/v1/chat/token/encode",
|
|
|
- json=request_payload
|
|
|
- ) as response:
|
|
|
- token_data = await response.json()
|
|
|
- prompt_tokens = token_data.get('num_tokens', 0)
|
|
|
- print(f"Prompt length: {prompt_tokens} tokens", flush=True)
|
|
|
- except Exception as e:
|
|
|
- print(f"Failed to get prompt length: {e}", flush=True)
|
|
|
- prompt_tokens = 0
|
|
|
- results['prompt_len'] = prompt_tokens
|
|
|
-
|
|
|
- request_payload = {
|
|
|
- "model": model,
|
|
|
- "messages": [{"role": "user", "content": prompt}],
|
|
|
- "temperature": 0,
|
|
|
- "stream": True
|
|
|
- }
|
|
|
+ )
|
|
|
+ response.raise_for_status()
|
|
|
+ token_data = await response.json()
|
|
|
+ results['prompt_len'] = token_data['num_tokens']
|
|
|
+ except Exception as e:
|
|
|
+ await session.close()
|
|
|
+ raise RuntimeError(f"Failed to get token count: {str(e)}")
|
|
|
|
|
|
- async with aiohttp.ClientSession() as session:
|
|
|
- try:
|
|
|
- start_time = time.time()
|
|
|
- first_token_time = None
|
|
|
- total_tokens = 0
|
|
|
-
|
|
|
- async with session.post(api_endpoint, json=request_payload) as response:
|
|
|
- if response.status != 200:
|
|
|
- results["error"] = f"HTTP {response.status}: {response.reason}"
|
|
|
- return results
|
|
|
-
|
|
|
- async for raw_line in response.content:
|
|
|
- line = raw_line.decode('utf-8').strip()
|
|
|
- if not line or not line.startswith('data: '):
|
|
|
- continue
|
|
|
-
|
|
|
- line_content = line[6:] # Remove 'data: ' prefix
|
|
|
- if line_content == '[DONE]':
|
|
|
- break
|
|
|
-
|
|
|
- try:
|
|
|
- chunk = json.loads(line_content)
|
|
|
- choice = chunk.get('choices', [{}])[0]
|
|
|
- content = choice.get('delta', {}).get('content')
|
|
|
-
|
|
|
- if content:
|
|
|
- if first_token_time is None:
|
|
|
- first_token_time = time.time()
|
|
|
- results['ttft'] = first_token_time - start_time
|
|
|
- results['prompt_tps'] = prompt_tokens/results['ttft']
|
|
|
-
|
|
|
- total_tokens += 1
|
|
|
- except json.JSONDecodeError:
|
|
|
- # Log or handle malformed JSON if necessary
|
|
|
- continue
|
|
|
-
|
|
|
- end_time = time.time()
|
|
|
- total_time = end_time - start_time
|
|
|
-
|
|
|
- if total_tokens > 0:
|
|
|
- results.update({
|
|
|
- "generation_tps": total_tokens / total_time,
|
|
|
- "response_len": total_tokens,
|
|
|
- "total_time": total_time
|
|
|
- })
|
|
|
- else:
|
|
|
- results["error"] = "No tokens were generated"
|
|
|
-
|
|
|
- except aiohttp.ClientError as e:
|
|
|
- results["error"] = f"Client error: {e}"
|
|
|
- except Exception as e:
|
|
|
- results["error"] = f"Unexpected error: {e}"
|
|
|
+ # Measure completion performance
|
|
|
+ try:
|
|
|
+ start_time = time.time()
|
|
|
+ response = await session.post(
|
|
|
+ api_endpoint,
|
|
|
+ json={
|
|
|
+ "model": model,
|
|
|
+ "messages": [{"role": "user", "content": prompt}],
|
|
|
+ "temperature": 0,
|
|
|
+ "stream": True
|
|
|
+ }
|
|
|
+ )
|
|
|
+ response.raise_for_status()
|
|
|
+
|
|
|
+ first_token_time = None
|
|
|
+ total_tokens = 0
|
|
|
+
|
|
|
+ async for line in response.content.iter_chunks():
|
|
|
+ line = line[0].decode('utf-8').strip()
|
|
|
+ if not line.startswith('data: '):
|
|
|
+ continue
|
|
|
+
|
|
|
+ data = json.loads(line[6:]) # Skip 'data: ' prefix
|
|
|
+ if content := data.get('choices', [{}])[0].get('delta', {}).get('content'):
|
|
|
+ print(f"Received content: {content}", flush=True)
|
|
|
+ if first_token_time is None:
|
|
|
+ first_token_time = time.time()
|
|
|
+ ttft = first_token_time - start_time
|
|
|
+ results.update({
|
|
|
+ 'ttft': ttft,
|
|
|
+ 'prompt_tps': results['prompt_len'] / ttft
|
|
|
+ })
|
|
|
+ total_tokens += 1
|
|
|
+
|
|
|
+ total_time = time.time() - start_time
|
|
|
+ results.update({
|
|
|
+ 'generation_tps': total_tokens / total_time,
|
|
|
+ 'response_len': total_tokens,
|
|
|
+ 'total_time': total_time
|
|
|
+ })
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ raise RuntimeError(f"Performance measurement failed: {str(e)}")
|
|
|
+ finally:
|
|
|
+ await session.close()
|
|
|
|
|
|
return results
|
|
|
|
|
@@ -122,13 +110,13 @@ async def main() -> None:
|
|
|
aws_secret_access_key=os.environ.get('aws_secret_key')
|
|
|
)
|
|
|
job_name = os.environ.get('GITHUB_JOB')
|
|
|
-
|
|
|
+
|
|
|
# Create S3 key with timestamp and commit info
|
|
|
now = datetime.utcnow()
|
|
|
timestamp = now.strftime('%H-%M-%S')
|
|
|
commit_sha = os.environ.get('GITHUB_SHA', 'unknown')[:7]
|
|
|
s3_key = f"{job_name}/{now.year}/{now.month}/{now.day}/{timestamp}_{commit_sha}.json"
|
|
|
-
|
|
|
+
|
|
|
# Upload to S3
|
|
|
s3_client.put_object(
|
|
|
Bucket='exo-benchmarks',
|
|
@@ -146,4 +134,4 @@ async def main() -> None:
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- asyncio.run(main())
|
|
|
+ asyncio.run(main())
|