# bench.py — benchmark harness: system diagnostics plus streaming-completion
# performance measurement against a local inference server.
import asyncio
import json
import os
import platform
import subprocess
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict

import aiohttp
import boto3
import psutil
  13. def check_system_state():
  14. print("\n=== System State Check ===", flush=True)
  15. # Add macOS-specific checks
  16. try:
  17. # Check powermetrics with sudo
  18. try:
  19. power_metrics = subprocess.run(
  20. ['sudo', 'powermetrics', '-n', '1', '-i', '1000', '--samplers', 'cpu_power'],
  21. capture_output=True, text=True
  22. )
  23. print("\nPower Metrics:", power_metrics.stdout, flush=True)
  24. except Exception as e:
  25. print(f"Error getting power metrics: {e}", flush=True)
  26. # Check thermal state
  27. thermal_state = subprocess.run(['pmset', '-g', 'therm'], capture_output=True, text=True)
  28. print("\nThermal State:", thermal_state.stdout, flush=True)
  29. # Check if running under Rosetta
  30. arch = subprocess.run(['arch'], capture_output=True, text=True)
  31. print("\nArchitecture:", arch.stdout, flush=True)
  32. # Check MLX compilation mode - only if mlx is available
  33. try:
  34. import mlx.core as mx
  35. if hasattr(mx, 'build_info'):
  36. print("\nMLX Build Info:", mx.build_info(), flush=True)
  37. else:
  38. print("\nMLX Build Info: Not available in this version", flush=True)
  39. except ImportError:
  40. print("\nMLX: Not installed", flush=True)
  41. except Exception as e:
  42. print(f"\nError checking MLX: {e}", flush=True)
  43. except Exception as e:
  44. print(f"Error in macOS checks: {e}", flush=True)
  45. # CPU Info
  46. print("\nCPU Information:", flush=True)
  47. try:
  48. if platform.system() == 'Darwin' and platform.processor() == 'arm':
  49. # Use sysctl for Apple Silicon Macs
  50. cpu_info = subprocess.run(['sysctl', 'machdep.cpu'], capture_output=True, text=True)
  51. if cpu_info.returncode == 0:
  52. print(f"CPU Info (Apple Silicon):", cpu_info.stdout, flush=True)
  53. # Parse powermetrics output for clearer CPU frequency display
  54. try:
  55. power_metrics = subprocess.run(
  56. ['sudo', 'powermetrics', '-n', '1', '-i', '100', '--samplers', 'cpu_power'],
  57. capture_output=True, text=True
  58. )
  59. if power_metrics.returncode == 0:
  60. output = power_metrics.stdout
  61. print("\nDetailed CPU Frequency Information:")
  62. # Extract cluster frequencies and max frequencies
  63. current_cluster = None
  64. max_freqs = {'E': 0, 'P0': 0, 'P1': 0}
  65. for line in output.split('\n'):
  66. # Track which cluster we're processing
  67. if "E-Cluster" in line:
  68. current_cluster = 'E'
  69. elif "P0-Cluster" in line:
  70. current_cluster = 'P0'
  71. elif "P1-Cluster" in line:
  72. current_cluster = 'P1'
  73. # Get current frequencies
  74. if "HW active frequency:" in line:
  75. freq = line.split(':')[1].strip()
  76. if freq != "0 MHz":
  77. print(f"Current {current_cluster}-Cluster Frequency: {freq}")
  78. # Get max frequencies from residency lines
  79. if current_cluster and "active residency:" in line and "MHz:" in line:
  80. try:
  81. # Extract all frequency values
  82. freqs = []
  83. parts = line.split('MHz:')[:-1] # Skip last part as it's not a frequency
  84. for part in parts:
  85. freq_str = part.split()[-1]
  86. try:
  87. freq = float(freq_str)
  88. freqs.append(freq)
  89. except ValueError:
  90. continue
  91. if freqs:
  92. max_freqs[current_cluster] = max(max_freqs[current_cluster], max(freqs))
  93. except Exception:
  94. continue
  95. # Print max frequencies
  96. print("\nMaximum Available Frequencies:")
  97. for cluster, max_freq in max_freqs.items():
  98. if max_freq > 0:
  99. print(f"{cluster}-Cluster Max: {max_freq:.0f} MHz")
  100. except Exception as e:
  101. print(f"Error parsing powermetrics: {e}", flush=True)
  102. else:
  103. # Use psutil for other systems
  104. cpu_freq = psutil.cpu_freq()
  105. print(f"CPU Frequency - Current: {cpu_freq.current:.2f}MHz, Min: {cpu_freq.min:.2f}MHz, Max: {cpu_freq.max:.2f}MHz", flush=True)
  106. print(f"\nCPU Usage per Core: {psutil.cpu_percent(percpu=True)}%", flush=True)
  107. # Check if running in low power mode
  108. power_mode = subprocess.run(['pmset', '-g'], capture_output=True, text=True)
  109. print("\nPower Settings:", power_mode.stdout, flush=True)
  110. except Exception as e:
  111. print(f"Error getting CPU info: {e}", flush=True)
  112. # Memory Info
  113. print("\nMemory Information:", flush=True)
  114. try:
  115. mem = psutil.virtual_memory()
  116. print(f"Total: {mem.total/1024/1024/1024:.2f}GB", flush=True)
  117. print(f"Available: {mem.available/1024/1024/1024:.2f}GB", flush=True)
  118. print(f"Used: {mem.used/1024/1024/1024:.2f}GB ({mem.percent}%)", flush=True)
  119. # Check swap
  120. swap = psutil.swap_memory()
  121. print(f"Swap Used: {swap.used/1024/1024/1024:.2f}GB of {swap.total/1024/1024/1024:.2f}GB", flush=True)
  122. except Exception as e:
  123. print(f"Error getting memory info: {e}", flush=True)
  124. # GPU Info
  125. print("\nGPU Information:", flush=True)
  126. try:
  127. # Check MLX GPU settings
  128. print("MLX Environment Variables:", flush=True)
  129. mlx_vars = {k: v for k, v in os.environ.items() if k.startswith('MLX')}
  130. print(json.dumps(mlx_vars, indent=2), flush=True)
  131. # Check Metal GPU memory allocation
  132. gpu_mem = subprocess.run(['sysctl', 'iogpu'], capture_output=True, text=True)
  133. print("GPU Memory Settings:", gpu_mem.stdout, flush=True)
  134. except Exception as e:
  135. print(f"Error getting GPU info: {e}", flush=True)
  136. # Process Priority
  137. print("\nProcess Priority Information:", flush=True)
  138. try:
  139. current_process = psutil.Process()
  140. print(f"Process Nice Value: {current_process.nice()}", flush=True)
  141. # Only try to get ionice if the platform supports it
  142. if hasattr(current_process, 'ionice'):
  143. print(f"Process IO Nice Value: {current_process.ionice()}", flush=True)
  144. except Exception as e:
  145. print(f"Error getting process priority info: {e}", flush=True)
  146. # System Load
  147. print("\nSystem Load:", flush=True)
  148. try:
  149. load_avg = psutil.getloadavg()
  150. print(f"Load Average: {load_avg}", flush=True)
  151. # Get top processes by CPU and Memory
  152. print("\nTop Processes by CPU Usage:", flush=True)
  153. processes = []
  154. for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
  155. try:
  156. pinfo = proc.info
  157. if pinfo['cpu_percent'] is not None and pinfo['memory_percent'] is not None:
  158. processes.append(pinfo)
  159. except (psutil.NoSuchProcess, psutil.AccessDenied):
  160. continue
  161. # Sort and display top 5 CPU-consuming processes
  162. sorted_by_cpu = sorted(processes, key=lambda x: x['cpu_percent'] or 0, reverse=True)[:5]
  163. for proc in sorted_by_cpu:
  164. print(f"PID: {proc['pid']}, Name: {proc['name']}, CPU: {proc['cpu_percent']}%, Memory: {proc['memory_percent']:.1f}%")
  165. except Exception as e:
  166. print(f"Error getting system load info: {e}", flush=True)
  167. print("\n=== End System State Check ===\n", flush=True)
  168. def check_gpu_access():
  169. try:
  170. # Check if MLX can see the GPU
  171. import mlx.core as mx
  172. print("MLX device info:", mx.default_device())
  173. # Check Metal device availability
  174. result = subprocess.run(['system_profiler', 'SPDisplaysDataType'], capture_output=True, text=True)
  175. print("GPU Info:", result.stdout)
  176. except Exception as e:
  177. print(f"Failed to check GPU access: {e}")
  178. async def measure_performance(api_endpoint: str, prompt: str, model: str) -> Dict[str, Any]:
  179. """
  180. Measures the performance of an API endpoint by sending a prompt and recording metrics.
  181. Args:
  182. api_endpoint (str): The API endpoint URL.
  183. prompt (str): The prompt to send to the API.
  184. Returns:
  185. Dict[str, Any]: A dictionary containing performance metrics or error information.
  186. """
  187. results = {
  188. 'model': model,
  189. 'run_id': os.environ.get('GITHUB_RUN_ID', 'unknown'),
  190. 'branch': os.environ.get('GITHUB_REF_NAME', 'unknown'),
  191. 'commit': os.environ.get('GITHUB_SHA', 'unknown'),
  192. 'configuration': json.loads(os.environ.get('HARDWARE_CONFIG', '{}'))
  193. }
  194. # Get token count
  195. session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=600, connect=10, sock_read=600, sock_connect=10))
  196. try:
  197. response = await session.post(
  198. "http://localhost:52415/v1/chat/token/encode",
  199. json={
  200. "model": model,
  201. "messages": [{"role": "user", "content": prompt}]
  202. }
  203. )
  204. response.raise_for_status()
  205. token_data = await response.json()
  206. results['prompt_len'] = token_data['num_tokens']
  207. except Exception as e:
  208. await session.close()
  209. raise RuntimeError(f"Failed to get token count: {str(e)}")
  210. # Measure completion performance
  211. try:
  212. start_time = time.time()
  213. response = await session.post(
  214. api_endpoint,
  215. json={
  216. "model": model,
  217. "messages": [{"role": "user", "content": prompt}],
  218. "temperature": 0,
  219. "stream": True
  220. }
  221. )
  222. response.raise_for_status()
  223. first_token_time = None
  224. total_tokens = 0
  225. async for line in response.content.iter_chunks():
  226. line = line[0].decode('utf-8').strip()
  227. if not line.startswith('data: '):
  228. continue
  229. data = json.loads(line[6:]) # Skip 'data: ' prefix
  230. if content := data.get('choices', [{}])[0].get('delta', {}).get('content'):
  231. print(f"Received content: {content}", flush=True)
  232. if first_token_time is None:
  233. first_token_time = time.time()
  234. ttft = first_token_time - start_time
  235. results.update({
  236. 'ttft': ttft,
  237. 'prompt_tps': results['prompt_len'] / ttft
  238. })
  239. total_tokens += 1
  240. total_time = time.time() - start_time
  241. results.update({
  242. 'generation_tps': total_tokens / total_time,
  243. 'response_len': total_tokens,
  244. 'total_time': total_time
  245. })
  246. except Exception as e:
  247. raise RuntimeError(f"Performance measurement failed: {str(e)}")
  248. finally:
  249. await session.close()
  250. return results
  251. async def main() -> None:
  252. api_endpoint = "http://localhost:52415/v1/chat/completions"
  253. # Define prompts
  254. prompt_warmup = "what is the capital of France?"
  255. prompt_essay = "write an essay about cats"
  256. model = os.environ.get('model', 'llama-3.2-1b')
  257. # Warmup request
  258. print("\nPerforming warmup request...", flush=True)
  259. try:
  260. warmup_results = await measure_performance(api_endpoint, prompt_warmup, model)
  261. print("Warmup completed successfully", flush=True)
  262. except Exception as e:
  263. print(f"Warmup request failed: {e}", flush=True)
  264. # Measure performance for the essay prompt
  265. print("\nMeasuring performance for the essay prompt...", flush=True)
  266. results = await measure_performance(api_endpoint, prompt_essay, model)
  267. try:
  268. s3_client = boto3.client(
  269. 's3',
  270. aws_access_key_id=os.environ.get('aws_access_key_id'),
  271. aws_secret_access_key=os.environ.get('aws_secret_key')
  272. )
  273. job_name = os.environ.get('GITHUB_JOB')
  274. # Create S3 key with timestamp and commit info
  275. now = datetime.utcnow()
  276. timestamp = now.strftime('%H-%M-%S')
  277. commit_sha = os.environ.get('GITHUB_SHA', 'unknown')[:7]
  278. s3_key = f"{job_name}/{model}/{now.year}/{now.month}/{now.day}/{timestamp}_{commit_sha}.json"
  279. # Upload to S3
  280. s3_client.put_object(
  281. Bucket='exo-benchmarks',
  282. Key=s3_key,
  283. Body=json.dumps(results),
  284. ContentType='application/json'
  285. )
  286. print(f"Performance metrics uploaded to S3: s3://exo-benchmarks/{s3_key}", flush=True)
  287. except Exception as e:
  288. print(f"Failed to upload metrics to S3: {e}", flush=True)
  289. # Optionally print the metrics for visibility
  290. print("Performance metrics:", flush=True)
  291. print(json.dumps(results, indent=4), flush=True)
  292. def optimize_system_performance():
  293. """Set optimal system performance settings before running benchmark."""
  294. try:
  295. # Try to set high performance power mode
  296. subprocess.run(['sudo', 'pmset', '-a', 'powermode', '2'], check=False)
  297. # Ensure MLX uses performance cores and GPU
  298. os.environ['MLX_FORCE_P_CORES'] = '1'
  299. os.environ['MLX_METAL_PREWARM'] = '1'
  300. os.environ['MLX_USE_GPU'] = '1'
  301. # Set process priority
  302. current_process = psutil.Process()
  303. try:
  304. # Set highest priority
  305. subprocess.run(['sudo', 'renice', '-n', '-20', '-p', str(current_process.pid)], check=False)
  306. # Print current process state
  307. print("\nProcess State Before Benchmark:", flush=True)
  308. proc_info = subprocess.run(
  309. ['ps', '-o', 'pid,ppid,user,%cpu,%mem,nice,stat,pri,command', '-p', str(current_process.pid)],
  310. capture_output=True, text=True
  311. )
  312. print(proc_info.stdout, flush=True)
  313. # Verify power mode
  314. power_info = subprocess.run(['pmset', '-g'], capture_output=True, text=True)
  315. if 'powermode 0' in power_info.stdout:
  316. print("\nWarning: System still in normal power mode. Trying to set high performance mode again...", flush=True)
  317. subprocess.run(['sudo', 'pmset', '-a', 'powermode', '2'], check=False)
  318. except Exception as e:
  319. print(f"Warning: Could not set process priority: {e}", flush=True)
  320. except Exception as e:
  321. print(f"Warning: Could not optimize system performance: {e}", flush=True)
  322. # Print optimization status
  323. print("\nOptimization Settings:", flush=True)
  324. print("MLX Environment Variables:", flush=True)
  325. for var in ['MLX_FORCE_P_CORES', 'MLX_METAL_PREWARM', 'MLX_USE_GPU']:
  326. print(f"{var}: {os.environ.get(var, 'Not set')}", flush=True)
  327. try:
  328. nice_value = psutil.Process().nice()
  329. print(f"Process Nice Value: {nice_value}", flush=True)
  330. if nice_value != -20:
  331. print("Warning: Process not running at highest priority", flush=True)
  332. except Exception:
  333. pass
  334. if __name__ == "__main__":
  335. check_system_state()
  336. check_gpu_access()
  337. optimize_system_performance()
  338. asyncio.run(main())