# bench.py — exo benchmark harness: system diagnostics + streaming-completion timing.
import asyncio
import json
import os
import platform
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict

import aiohttp
import boto3
import psutil
  13. def check_system_state():
  14. print("\n=== System State Check ===", flush=True)
  15. # Add macOS-specific checks
  16. try:
  17. # Check powermetrics
  18. try:
  19. power_metrics = subprocess.run(['powermetrics', '-n', '1', '-i', '1000', '--show-process-energy'],
  20. capture_output=True, text=True)
  21. except:
  22. # Try with sudo if direct access fails
  23. power_metrics = subprocess.run(['sudo', 'powermetrics', '-n', '1', '-i', '1000', '--show-process-energy'],
  24. capture_output=True, text=True)
  25. print("\nPower Metrics:", power_metrics.stdout, flush=True)
  26. # Check thermal state
  27. thermal_state = subprocess.run(['pmset', '-g', 'therm'], capture_output=True, text=True)
  28. print("\nThermal State:", thermal_state.stdout, flush=True)
  29. # Check if running under Rosetta
  30. arch = subprocess.run(['arch'], capture_output=True, text=True)
  31. print("\nArchitecture:", arch.stdout, flush=True)
  32. # Check MLX compilation mode
  33. import mlx.core as mx
  34. print("\nMLX Build Info:", mx.build_info(), flush=True)
  35. except Exception as e:
  36. print(f"Error in macOS checks: {e}", flush=True)
  37. # CPU Info
  38. print("\nCPU Information:", flush=True)
  39. try:
  40. cpu_freq = psutil.cpu_freq()
  41. print(f"CPU Frequency - Current: {cpu_freq.current:.2f}MHz, Min: {cpu_freq.min:.2f}MHz, Max: {cpu_freq.max:.2f}MHz", flush=True)
  42. print(f"CPU Usage per Core: {psutil.cpu_percent(percpu=True)}%", flush=True)
  43. # Check if running in low power mode
  44. power_mode = subprocess.run(['pmset', '-g'], capture_output=True, text=True)
  45. print("Power Settings:", power_mode.stdout, flush=True)
  46. except Exception as e:
  47. print(f"Error getting CPU info: {e}", flush=True)
  48. # Memory Info
  49. print("\nMemory Information:", flush=True)
  50. try:
  51. mem = psutil.virtual_memory()
  52. print(f"Total: {mem.total/1024/1024/1024:.2f}GB", flush=True)
  53. print(f"Available: {mem.available/1024/1024/1024:.2f}GB", flush=True)
  54. print(f"Used: {mem.used/1024/1024/1024:.2f}GB ({mem.percent}%)", flush=True)
  55. # Check swap
  56. swap = psutil.swap_memory()
  57. print(f"Swap Used: {swap.used/1024/1024/1024:.2f}GB of {swap.total/1024/1024/1024:.2f}GB", flush=True)
  58. except Exception as e:
  59. print(f"Error getting memory info: {e}", flush=True)
  60. # GPU Info
  61. print("\nGPU Information:", flush=True)
  62. try:
  63. # Check MLX GPU settings
  64. print("MLX Environment Variables:", flush=True)
  65. mlx_vars = {k: v for k, v in os.environ.items() if k.startswith('MLX')}
  66. print(json.dumps(mlx_vars, indent=2), flush=True)
  67. # Check Metal GPU memory allocation
  68. gpu_mem = subprocess.run(['sysctl', 'iogpu'], capture_output=True, text=True)
  69. print("GPU Memory Settings:", gpu_mem.stdout, flush=True)
  70. except Exception as e:
  71. print(f"Error getting GPU info: {e}", flush=True)
  72. # Process Priority
  73. print("\nProcess Priority Information:", flush=True)
  74. try:
  75. current_process = psutil.Process()
  76. print(f"Process Nice Value: {current_process.nice()}", flush=True)
  77. print(f"Process IO Nice Value: {current_process.ionice()}", flush=True)
  78. print(f"Process CPU Affinity: {current_process.cpu_affinity()}", flush=True)
  79. except Exception as e:
  80. print(f"Error getting process priority info: {e}", flush=True)
  81. # System Load
  82. print("\nSystem Load:", flush=True)
  83. try:
  84. print(f"Load Average: {psutil.getloadavg()}", flush=True)
  85. # Get top processes by CPU and Memory
  86. print("\nTop Processes:", flush=True)
  87. processes = []
  88. for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
  89. try:
  90. processes.append(proc.info)
  91. except (psutil.NoSuchProcess, psutil.AccessDenied):
  92. pass
  93. sorted_by_cpu = sorted(processes, key=lambda x: x['cpu_percent'], reverse=True)[:5]
  94. print("Top 5 CPU-consuming processes:", json.dumps(sorted_by_cpu, indent=2), flush=True)
  95. except Exception as e:
  96. print(f"Error getting system load info: {e}", flush=True)
  97. print("\n=== End System State Check ===\n", flush=True)
  98. def check_gpu_access():
  99. try:
  100. # Check if MLX can see the GPU
  101. import mlx.core as mx
  102. print("MLX device info:", mx.default_device())
  103. # Check Metal device availability
  104. result = subprocess.run(['system_profiler', 'SPDisplaysDataType'], capture_output=True, text=True)
  105. print("GPU Info:", result.stdout)
  106. except Exception as e:
  107. print(f"Failed to check GPU access: {e}")
  108. async def measure_performance(api_endpoint: str, prompt: str, model: str) -> Dict[str, Any]:
  109. """
  110. Measures the performance of an API endpoint by sending a prompt and recording metrics.
  111. Args:
  112. api_endpoint (str): The API endpoint URL.
  113. prompt (str): The prompt to send to the API.
  114. Returns:
  115. Dict[str, Any]: A dictionary containing performance metrics or error information.
  116. """
  117. results = {
  118. 'model': model,
  119. 'run_id': os.environ.get('GITHUB_RUN_ID', 'unknown'),
  120. 'branch': os.environ.get('GITHUB_REF_NAME', 'unknown'),
  121. 'configuration': json.loads(os.environ.get('HARDWARE_CONFIG', '{}'))
  122. }
  123. # Get token count
  124. session = aiohttp.ClientSession()
  125. try:
  126. response = await session.post(
  127. "http://localhost:52415/v1/chat/token/encode",
  128. json={
  129. "model": model,
  130. "messages": [{"role": "user", "content": prompt}]
  131. }
  132. )
  133. response.raise_for_status()
  134. token_data = await response.json()
  135. results['prompt_len'] = token_data['num_tokens']
  136. except Exception as e:
  137. await session.close()
  138. raise RuntimeError(f"Failed to get token count: {str(e)}")
  139. # Measure completion performance
  140. try:
  141. start_time = time.time()
  142. response = await session.post(
  143. api_endpoint,
  144. json={
  145. "model": model,
  146. "messages": [{"role": "user", "content": prompt}],
  147. "temperature": 0,
  148. "stream": True
  149. }
  150. )
  151. response.raise_for_status()
  152. first_token_time = None
  153. total_tokens = 0
  154. async for line in response.content.iter_chunks():
  155. line = line[0].decode('utf-8').strip()
  156. if not line.startswith('data: '):
  157. continue
  158. data = json.loads(line[6:]) # Skip 'data: ' prefix
  159. if content := data.get('choices', [{}])[0].get('delta', {}).get('content'):
  160. print(f"Received content: {content}", flush=True)
  161. if first_token_time is None:
  162. first_token_time = time.time()
  163. ttft = first_token_time - start_time
  164. results.update({
  165. 'ttft': ttft,
  166. 'prompt_tps': results['prompt_len'] / ttft
  167. })
  168. total_tokens += 1
  169. total_time = time.time() - start_time
  170. results.update({
  171. 'generation_tps': total_tokens / total_time,
  172. 'response_len': total_tokens,
  173. 'total_time': total_time
  174. })
  175. except Exception as e:
  176. raise RuntimeError(f"Performance measurement failed: {str(e)}")
  177. finally:
  178. await session.close()
  179. return results
  180. async def main() -> None:
  181. api_endpoint = "http://localhost:52415/v1/chat/completions"
  182. # Define prompts
  183. prompt_warmup = "what is the capital of France?"
  184. prompt_essay = "write an essay about cats"
  185. model = os.environ.get('model', 'llama-3.2-1b')
  186. # Warmup request
  187. print("\nPerforming warmup request...", flush=True)
  188. try:
  189. warmup_results = await measure_performance(api_endpoint, prompt_warmup, model)
  190. print("Warmup completed successfully", flush=True)
  191. except Exception as e:
  192. print(f"Warmup request failed: {e}", flush=True)
  193. # Measure performance for the essay prompt
  194. print("\nMeasuring performance for the essay prompt...", flush=True)
  195. results = await measure_performance(api_endpoint, prompt_essay, model)
  196. try:
  197. s3_client = boto3.client(
  198. 's3',
  199. aws_access_key_id=os.environ.get('aws_access_key_id'),
  200. aws_secret_access_key=os.environ.get('aws_secret_key')
  201. )
  202. job_name = os.environ.get('GITHUB_JOB')
  203. # Create S3 key with timestamp and commit info
  204. now = datetime.utcnow()
  205. timestamp = now.strftime('%H-%M-%S')
  206. commit_sha = os.environ.get('GITHUB_SHA', 'unknown')[:7]
  207. s3_key = f"{job_name}/{model}/{now.year}/{now.month}/{now.day}/{timestamp}_{commit_sha}.json"
  208. # Upload to S3
  209. s3_client.put_object(
  210. Bucket='exo-benchmarks',
  211. Key=s3_key,
  212. Body=json.dumps(results),
  213. ContentType='application/json'
  214. )
  215. print(f"Performance metrics uploaded to S3: s3://exo-benchmarks/{s3_key}", flush=True)
  216. except Exception as e:
  217. print(f"Failed to upload metrics to S3: {e}", flush=True)
  218. # Optionally print the metrics for visibility
  219. print("Performance metrics:", flush=True)
  220. print(json.dumps(results, indent=4), flush=True)
if __name__ == "__main__":
    # Log host diagnostics first so benchmark numbers can be interpreted
    # against the machine state they were collected on.
    check_system_state()
    check_gpu_access()
    asyncio.run(main())