@@ -3,7 +3,9 @@ import asyncio
 import time
 import json
 import os
+import boto3
 from typing import Dict, Any
+from datetime import datetime
 
 
 async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
@@ -17,9 +19,33 @@ async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
   Returns:
     Dict[str, Any]: A dictionary containing performance metrics or error information.
   """
-  results: Dict[str, Any] = {}
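+  # Record which model was benchmarked and which GitHub Actions run produced it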
+  model = os.environ.get('model')
+  results: Dict[str, Any] = {'model': model, 'run_id': os.environ.get('GITHUB_RUN_ID')}
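+  # Hardware configuration of the benchmark cluster (hard-coded until it comes from the CI matrix)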
+  results['configuration'] = {
+    'M4': 2  # TODO: get this through env vars from the CI matrix definition
+  }
+
+  # Get prompt length in tokens
+  async with aiohttp.ClientSession() as session:
+    try:
+      request_payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}]
+      }
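+      # Ask the local exo tokenizer endpoint for the prompt's length in tokens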
+      async with session.post(
+        "http://localhost:52415/v1/chat/token/encode",
+        json=request_payload
+      ) as response:
+        token_data = await response.json()
+        prompt_tokens = token_data.get('length', 0)
+        print(f"Prompt length: {prompt_tokens} tokens", flush=True)
+    except Exception as e:
+      print(f"Failed to get prompt length: {e}", flush=True)
+      prompt_tokens = 0
+  results['prompt_len'] = prompt_tokens
+
   request_payload = {
-    "model": "llama-3.2-1b",
+    "model": model,
     "messages": [{"role": "user", "content": prompt}],
     "temperature": 0,
     "stream": True
@@ -53,7 +79,8 @@ async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
             if content:
               if first_token_time is None:
                 first_token_time = time.time()
-                results["time_to_first_token"] = first_token_time - start_time
+                results['ttft'] = first_token_time - start_time
+                results['prompt_tps'] = prompt_tokens / results['ttft']
 
               total_tokens += 1
         except json.JSONDecodeError:
@@ -65,8 +92,8 @@ async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
 
       if total_tokens > 0:
         results.update({
-          "tokens_per_second": total_tokens / total_time,
-          "total_tokens": total_tokens,
+          "generation_tps": total_tokens / total_time,
+          "response_len": total_tokens,
           "total_time": total_time
         })
       else:
@@ -84,28 +111,36 @@ async def main() -> None:
   api_endpoint = "http://localhost:52415/v1/chat/completions"
 
   # Define prompts
-  prompt_basic = "this is a ping"
   prompt_essay = "write an essay about cats"
 
-  # Measure performance for the basic prompt
-  print("Measuring performance for the basic prompt...", flush=True)
-  results_basic = await measure_performance(api_endpoint, prompt_basic)
-  print("Basic prompt performance metrics:", flush=True)
-  print(json.dumps(results_basic, indent=4))
-
-  # Measure performance for the essay prompt, which depends on the first measurement
+  # Measure performance for the essay prompt
   print("\nMeasuring performance for the essay prompt...", flush=True)
   results = await measure_performance(api_endpoint, prompt_essay)
 
-  # Save metrics from the "universe and everything" prompt
-  metrics_file = os.path.join("artifacts", "benchmark.json")
-  os.makedirs(os.path.dirname(metrics_file), exist_ok=True)
   try:
-    with open(metrics_file, "w", encoding="utf-8") as f:
-      json.dump(results, f, indent=4)
-    print(f"Performance metrics saved to {metrics_file}", flush=True)
-  except IOError as e:
-    print(f"Failed to save metrics: {e}", flush=True)
+    s3_client = boto3.client(
+      's3',
+      aws_access_key_id=os.environ.get('aws_access_key_id'),
+      aws_secret_access_key=os.environ.get('aws_secret_key')
+    )
+    job_name = os.environ.get('GITHUB_JOB')
+
+    # Create S3 key with timestamp and commit info
+    now = datetime.utcnow()
+    timestamp = now.strftime('%H-%M-%S')
+    commit_sha = os.environ.get('GITHUB_SHA', 'unknown')[:7]
+    s3_key = f"{job_name}/{now.year}/{now.month}/{now.day}/{timestamp}_{commit_sha}.json"
+
+    # Upload to S3
+    s3_client.put_object(
+      Bucket='exo-benchmarks',
+      Key=s3_key,
+      Body=json.dumps(results),
+      ContentType='application/json'
+    )
+    print(f"Performance metrics uploaded to S3: s3://exo-benchmarks/{s3_key}", flush=True)
+  except Exception as e:
+    print(f"Failed to upload metrics to S3: {e}", flush=True)
 
   # Optionally print the metrics for visibility
   print("Performance metrics:", flush=True)