Glen committed 7 months ago
commit 785710355f
2 changed files with 69 additions and 25 deletions
  1. .github/bench.py (+55 -20)
  2. .github/workflows/build_and_test.yml (+14 -5)

.github/bench.py (+55 -20)

@@ -3,7 +3,9 @@ import asyncio
 import time
 import json
 import os
+import boto3
 from typing import Dict, Any
+from datetime import datetime
 
 
 async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
@@ -17,9 +19,33 @@ async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
     Returns:
         Dict[str, Any]: A dictionary containing performance metrics or error information.
     """
-    results: Dict[str, Any] = {}
+    model = os.environ.get('model')
+    results: Dict[str, Any] = {'model': model, 'run_id': os.environ.get('GITHUB_RUN_ID')}
+    results['configuration'] = {
+        'M4': 2 # TODO get this through env vars from the matrix def
+    }
+
+    # Get prompt length in tokens
+    async with aiohttp.ClientSession() as session:
+        try:
+            request_payload = {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}]
+            }
+            async with session.post(
+                "http://localhost:52415/v1/chat/token/encode",
+                json=request_payload
+            ) as response:
+                token_data = await response.json()
+                prompt_tokens = token_data.get('length', 0)
+                print(f"Prompt length: {prompt_tokens} tokens", flush=True)
+        except Exception as e:
+            print(f"Failed to get prompt length: {e}", flush=True)
+            prompt_tokens = 0
+    results['prompt_len'] = prompt_tokens
+
     request_payload = {
-        "model": "llama-3.2-1b",
+        "model": model,
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0,
         "stream": True
@@ -53,7 +79,8 @@ async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
                         if content:
                             if first_token_time is None:
                                 first_token_time = time.time()
-                                results["time_to_first_token"] = first_token_time - start_time
+                                results['ttft'] = first_token_time - start_time
+                                results['prompt_tps'] = prompt_tokens/results['ttft']
 
                             total_tokens += 1
                     except json.JSONDecodeError:
@@ -65,8 +92,8 @@ async def measure_performance(api_endpoint: str, prompt: str) -> Dict[str, Any]:
 
             if total_tokens > 0:
                 results.update({
-                    "tokens_per_second": total_tokens / total_time,
-                    "total_tokens": total_tokens,
+                    "generation_tps": total_tokens / total_time,
+                    "response_len": total_tokens,
                     "total_time": total_time
                 })
             else:
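
For reference, a record assembled by the updated script should look roughly like the sketch below; the keys match the fields set in this file, and every value is illustrative only.

# Illustrative sketch only: values are made up, keys mirror what bench.py now emits.
example_results = {
    "model": "llama-3.2-1b",        # from the `model` env var
    "run_id": "12345678901",        # GITHUB_RUN_ID
    "configuration": {"M4": 2},     # hard-coded for now (see TODO above)
    "prompt_len": 7,                # tokens, from /v1/chat/token/encode
    "ttft": 0.42,                   # seconds until the first streamed token
    "prompt_tps": 16.7,             # prompt_len / ttft
    "generation_tps": 25.3,         # response_len / total_time
    "response_len": 512,            # generated tokens
    "total_time": 20.2,             # seconds for the full response
}
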
@@ -84,28 +111,36 @@ async def main() -> None:
     api_endpoint = "http://localhost:52415/v1/chat/completions"
 
     # Define prompts
-    prompt_basic = "this is a ping"
     prompt_essay = "write an essay about cats"
 
-    # Measure performance for the basic prompt
-    print("Measuring performance for the basic prompt...", flush=True)
-    results_basic = await measure_performance(api_endpoint, prompt_basic)
-    print("Basic prompt performance metrics:", flush=True)
-    print(json.dumps(results_basic, indent=4))
-
-    # Measure performance for the essay prompt, which depends on the first measurement
+    # Measure performance for the essay prompt
     print("\nMeasuring performance for the essay prompt...", flush=True)
     results = await measure_performance(api_endpoint, prompt_essay)
 
-    # Save metrics from the "universe and everything" prompt
-    metrics_file = os.path.join("artifacts", "benchmark.json")
-    os.makedirs(os.path.dirname(metrics_file), exist_ok=True)
     try:
-        with open(metrics_file, "w", encoding="utf-8") as f:
-            json.dump(results, f, indent=4)
-        print(f"Performance metrics saved to {metrics_file}", flush=True)
-    except IOError as e:
-        print(f"Failed to save metrics: {e}", flush=True)
+        s3_client = boto3.client(
+            's3',
+            aws_access_key_id=os.environ.get('aws_access_key_id'),
+            aws_secret_access_key=os.environ.get('aws_secret_key')
+        )
+        job_name = os.environ.get('GITHUB_JOB')
+        
+        # Create S3 key with timestamp and commit info
+        now = datetime.utcnow()
+        timestamp = now.strftime('%H-%M-%S')
+        commit_sha = os.environ.get('GITHUB_SHA', 'unknown')[:7]
+        s3_key = f"{job_name}/{now.year}/{now.month}/{now.day}/{timestamp}_{commit_sha}.json"
+        
+        # Upload to S3
+        s3_client.put_object(
+            Bucket='exo-benchmarks',
+            Key=s3_key,
+            Body=json.dumps(results),
+            ContentType='application/json'
+        )
+        print(f"Performance metrics uploaded to S3: s3://exo-benchmarks/{s3_key}", flush=True)
+    except Exception as e:
+        print(f"Failed to upload metrics to S3: {e}", flush=True)
 
     # Optionally print the metrics for visibility
     print("Performance metrics:", flush=True)
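
The upload side pairs naturally with a small reader. Here is a minimal sketch of pulling one day's records back out of the exo-benchmarks bucket with boto3; the bucket name and key layout come from the code above, while the credentials lookup and the "benchmark" prefix are assumptions (the real prefix is whatever GITHUB_JOB resolves to).

import json
from datetime import datetime, timezone

import boto3

# Minimal sketch: list and load today's benchmark records.
# Assumes boto3 can find credentials via its default lookup and that the
# job-name prefix is "benchmark" (a placeholder for the actual GITHUB_JOB).
s3 = boto3.client("s3")
now = datetime.now(timezone.utc)
prefix = f"benchmark/{now.year}/{now.month}/{now.day}/"

resp = s3.list_objects_v2(Bucket="exo-benchmarks", Prefix=prefix)
for obj in resp.get("Contents", []):
    body = s3.get_object(Bucket="exo-benchmarks", Key=obj["Key"])["Body"].read()
    record = json.loads(body)
    print(obj["Key"], record.get("generation_tps"))
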

.github/workflows/build_and_test.yml (+14 -5)

@@ -17,8 +17,6 @@ jobs:
     strategy:
       matrix:
         cpu: ['M4', 'M4']
-        include:
-          - benchmark_timeout: 500
     runs-on: ['self-hosted', 'macOS', '${{ matrix.cpu }}']
     steps:
       - uses: actions/checkout@v4
@@ -38,8 +36,13 @@ jobs:
           source env/bin/activate
           pip install --upgrade pip
           pip install .
+          pip install boto3==1.35.76
 
       - name: Run exo
+        env:
+          aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
+          aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
+          model: llama-3.2-1b
         run: |
           ALL_NODE_IDS=$(for i in $(seq ${{ strategy.job-total }} -1 0); do echo -n "${GITHUB_JOB}_${i},"; done | sed 's/,$//')
           MY_NODE_ID="${GITHUB_JOB}_${{ strategy.job-index }}"
@@ -52,6 +55,7 @@ jobs:
           TAIL1=$!
 
           trap 'kill $TAIL1' EXIT
+          trap 'kill $TAIL1 $PID1' EXIT  # a second trap on EXIT replaces the first, so kill both processes here
 
           for i in {1..100}; do
             nodes=$(curl http://localhost:52415/topology | jq ".nodes | length")
@@ -70,10 +74,15 @@ jobs:
           if [ "${{ strategy.job-index }}" -eq "0" ]; then
             python .github/bench.py
           else
-            sleep ${{ matrix.benchmark_timeout }}
+            sleep 10
+            while true; do  # wait for the benchmark node (job-index 0) to finish and leave the topology
+              nodes=$(curl http://localhost:52415/topology | jq ".nodes | length")
+              if [ "$nodes" -lt "${{ strategy.job-total }}" ]; then
+                break
+              fi
+              sleep 5
+            done
           fi
-
-          kill $PID1
       - name: Test
         run: |
           echo "GITHUB_JOB: ${GITHUB_JOB}, GITHUB_RUN_ID: ${GITHUB_RUN_ID}, GITHUB_RUN_NUMBER: ${GITHUB_RUN_NUMBER}, GITHUB_WORKFLOW: ${GITHUB_WORKFLOW}"
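
The new shutdown logic replaces the fixed benchmark_timeout sleep: non-benchmark nodes poll the topology endpoint until the benchmark node exits and the node count drops below the job total. The same wait is sketched in Python below for clarity; the workflow itself uses curl and jq, and the expected node count of 2 is an assumption based on the two-entry M4 matrix.

import json
import time
import urllib.request

# Sketch of the workflow's shutdown wait: keep polling the local exo topology
# endpoint until the benchmark node has exited, i.e. the node count drops
# below the expected total.
EXPECTED_NODES = 2  # assumption: mirrors strategy.job-total for the M4/M4 matrix

while True:
    with urllib.request.urlopen("http://localhost:52415/topology") as resp:
        topology = json.load(resp)
    if len(topology.get("nodes", [])) < EXPECTED_NODES:
        break  # the benchmark node finished and shut down
    time.sleep(5)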