Alex Cheema 4 月之前
父节点
当前提交
cdae702673
共有 3 个文件被更改,包括 186 次插入和 145 次删除
  1. 45 35
      .github/bootstrap.sh
  2. 97 0
      .github/optimize_performance.sh
  3. 44 110
      .github/workflows/bench_job.yml

+ 45 - 35
.github/bootstrap.sh

@@ -216,41 +216,6 @@ defaults write com.apple.Metal ForceMaximumPerformance -bool true
 sudo mkdir -p /tmp/mps_cache
 sudo chmod 777 /tmp/mps_cache
 
-# Create CPU affinity configuration for performance cores
-sudo tee /Library/LaunchDaemons/com.github.runner.cpuaffinity.plist << EOF
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-    <key>Label</key>
-    <string>com.github.runner.cpuaffinity</string>
-    <key>ProgramArguments</key>
-    <array>
-        <string>/usr/bin/taskpolicy</string>
-        <string>-p</string>
-        <string>PERFORMANCE</string>
-        <string>-b</string>
-        <string>PERFORMANCE</string>
-        <string>-t</string>
-        <string>PERFORMANCE</string>
-        <string>--cpu-qos</string>
-        <string>USER_INTERACTIVE</string>
-        <string>--gpu-qos</string>
-        <string>USER_INTERACTIVE</string>
-        <string>--io-qos</string>
-        <string>USER_INTERACTIVE</string>
-        <string>--affinity-tag</string>
-        <string>com.github.runner</string>
-        <string>${RUNNER_DIR}/run.sh</string>
-    </array>
-    <key>RunAtLoad</key>
-    <true/>
-    <key>KeepAlive</key>
-    <true/>
-</dict>
-</plist>
-EOF
-
 # Create and load launch daemon
 log "Creating LaunchDaemon service..."
 sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
@@ -266,6 +231,21 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
         <string>${RUNNER_DIR}</string>
         <key>ProgramArguments</key>
         <array>
+            <string>/usr/bin/taskpolicy</string>
+            <string>-b</string>
+            <string>PERFORMANCE</string>
+            <string>-p</string>
+            <string>PERFORMANCE</string>
+            <string>-t</string>
+            <string>PERFORMANCE</string>
+            <string>--cpu-qos</string>
+            <string>USER_INTERACTIVE</string>
+            <string>--gpu-qos</string>
+            <string>USER_INTERACTIVE</string>
+            <string>--io-qos</string>
+            <string>USER_INTERACTIVE</string>
+            <string>--affinity-tag</string>
+            <string>com.github.runner</string>
             <string>/usr/bin/nice</string>
             <string>-n</string>
             <string>-20</string>
@@ -299,6 +279,8 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
             <key>MLX_METAL_PREWARM</key>
             <string>1</string>
             <!-- Metal Settings -->
+            <key>MTL_DEBUG_LAYER</key>
+            <string>0</string>
             <key>METAL_DEBUG_ERROR_MODE</key>
             <string>0</string>
             <key>METAL_DEVICE_WRAPPER_TYPE</key>
@@ -335,6 +317,13 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
             <string>1</string>
             <key>PERFORMANCE_MODE</key>
             <string>1</string>
+            <!-- Python Settings -->
+            <key>PYTHONOPTIMIZE</key>
+            <string>2</string>
+            <key>PYTHONUNBUFFERED</key>
+            <string>1</string>
+            <key>PYTHONHASHSEED</key>
+            <string>0</string>
         </dict>
         <key>RunAtLoad</key>
         <true/>
@@ -346,10 +335,31 @@ sudo tee /Library/LaunchDaemons/com.github.runner.plist > /dev/null << EOF
         <false/>
         <key>AbandonProcessGroup</key>
         <false/>
+        <key>Nice</key>
+        <integer>-20</integer>
+        <key>ThrottleInterval</key>
+        <integer>0</integer>
+        <key>EnableTransactions</key>
+        <true/>
+        <key>EnablePressuredExit</key>
+        <false/>
+        <key>HardResourceLimits</key>
+        <dict>
+            <key>NumberOfFiles</key>
+            <integer>524288</integer>
+        </dict>
+        <key>SoftResourceLimits</key>
+        <dict>
+            <key>NumberOfFiles</key>
+            <integer>524288</integer>
+        </dict>
     </dict>
 </plist>
 EOF
 
+# Remove the separate CPU affinity configuration since it's now integrated
+sudo rm -f /Library/LaunchDaemons/com.github.runner.cpuaffinity.plist
+
 # Set proper permissions for the LaunchDaemon
 sudo chown root:wheel /Library/LaunchDaemons/com.github.runner.plist
 sudo chmod 644 /Library/LaunchDaemons/com.github.runner.plist

+ 97 - 0
.github/optimize_performance.sh

@@ -0,0 +1,97 @@
+#!/bin/bash
+set -e
+
+# Print the message in $1 prefixed with a [YYYY-MM-DD HH:MM:SS] timestamp
+log() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+}
+
+log "Applying comprehensive performance optimizations..."
+
+# System-wide power management
+log "Configuring power management..."
+sudo pmset -a lessbright 0
+sudo pmset -a disablesleep 1
+sudo pmset -a sleep 0
+sudo pmset -a hibernatemode 0
+sudo pmset -a autopoweroff 0
+sudo pmset -a standby 0
+sudo pmset -a powernap 0
+sudo pmset -a proximitywake 0
+sudo pmset -a tcpkeepalive 1
+sudo pmset -a powermode 1
+sudo pmset -a gpuswitch 2
+sudo pmset -a displaysleep 0
+sudo pmset -a disksleep 0
+
+# Memory and kernel optimizations
+log "Configuring memory and kernel settings..."
+sudo sysctl -w kern.memorystatus_purge_on_warning=0
+sudo sysctl -w kern.memorystatus_purge_on_critical=0
+sudo sysctl -w kern.timer.coalescing_enabled=0
+sudo sysctl -w kern.iogpu.dynamic_memory_management=0
+sudo sysctl -w kern.iogpu.dynamic_memory_management_debug=0
+
+# Metal and GPU optimizations
+log "Configuring Metal and GPU settings..."
+defaults write com.apple.CoreML MPSEnableGPUValidation -bool false
+defaults write com.apple.CoreML MPSEnableMetalValidation -bool false
+defaults write com.apple.CoreML MPSEnableGPUDebug -bool false
+defaults write com.apple.Metal GPUDebug -bool false
+defaults write com.apple.Metal GPUValidation -bool false
+defaults write com.apple.Metal MetalValidation -bool false
+defaults write com.apple.Metal MetalCaptureEnabled -bool false
+defaults write com.apple.Metal MTLValidationBehavior -string "Disabled"
+defaults write com.apple.Metal EnableMTLDebugLayer -bool false
+defaults write com.apple.Metal MTLDebugLevel -int 0
+defaults write com.apple.Metal PreferIntegratedGPU -bool false
+defaults write com.apple.Metal ForceMaximumPerformance -bool true
+defaults write com.apple.Metal MTLPreferredDeviceGPUFrame -bool true
+
+# Create MPS cache directory with proper permissions
+sudo mkdir -p /tmp/mps_cache
+sudo chmod 777 /tmp/mps_cache
+
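+# Process and resource limits (NOTE: ulimit is a shell builtin, so the ulimit calls below affect only this script's shell and its children; a caller that executes rather than sources this script does not inherit them)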
+log "Configuring process limits..."
+sudo launchctl limit maxfiles 524288 524288
+ulimit -n 524288
+ulimit -c 0
+ulimit -l unlimited
+
+# Export performance-related environment variables
+cat << 'EOF' > /tmp/performance_env.sh
+# Metal optimizations
+export MTL_DEBUG_LAYER=0
+export METAL_DEVICE_WRAPPER_TYPE=1
+export METAL_DEBUG_ERROR_MODE=0
+export METAL_FORCE_PERFORMANCE_MODE=1
+export METAL_DEVICE_PRIORITY=high
+export METAL_MAX_COMMAND_QUEUES=1024
+export METAL_LOAD_LIMIT=0
+export METAL_VALIDATION_ENABLED=0
+export METAL_ENABLE_VALIDATION_LAYER=0
+export OBJC_DEBUG_MISSING_POOLS=NO
+export MPS_CACHEDIR=/tmp/mps_cache
+
+# MLX optimizations
+export MLX_USE_GPU=1
+export MLX_METAL_COMPILE_ASYNC=1
+export MLX_METAL_PREALLOCATE=1
+export MLX_METAL_MEMORY_GUARD=0
+export MLX_METAL_CACHE_KERNELS=1
+export MLX_PLACEMENT_POLICY=metal
+export MLX_METAL_VALIDATION=0
+export MLX_METAL_DEBUG=0
+export MLX_FORCE_P_CORES=1
+export MLX_METAL_MEMORY_BUDGET=0
+export MLX_METAL_PREWARM=1
+
+# Python optimizations
+export PYTHONUNBUFFERED=1
+export PYTHONOPTIMIZE=2
+export PYTHONHASHSEED=0
+export PYTHONDONTWRITEBYTECODE=1
+EOF
+
+log "Performance optimizations completed. Environment variables written to /tmp/performance_env.sh" 

+ 44 - 110
.github/workflows/bench_job.yml

@@ -10,7 +10,7 @@ on:
       model:
         required: true
         type: string
-      calling_job_name:  # New input parameter
+      calling_job_name:
         required: true
         type: string
 jobs:
@@ -34,20 +34,26 @@ jobs:
     env:
       HARDWARE_CONFIG: ${{ inputs.config }}
       model: ${{ inputs.model }}
+      # Add performance-related environment variables
+      MTL_DEBUG_LAYER: 0
+      METAL_VALIDATION_ENABLED: 0
+      MLX_METAL_VALIDATION: 0
+      MLX_METAL_DEBUG: 0
+      MLX_FORCE_P_CORES: 1
+      MLX_METAL_PREWARM: 1
+      PYTHONOPTIMIZE: 2
     steps:
       - name: Cleanup workspace
         run: |
           sudo rm -rf "$GITHUB_WORKSPACE"
           sudo mkdir -p "$GITHUB_WORKSPACE"
           sudo chown -R $(whoami):$(id -g) "$GITHUB_WORKSPACE"
+
       - uses: actions/checkout@v4
+
       - name: Install dependencies
         run: |
-          # First, find where python3.12 is installed
-          which python3.12 || echo "python3.12 not in PATH"
-          # Add common Python installation locations to PATH
           export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
-          # Now try to create the venv with explicit python3.12
           python3.12 -m venv .venv || {
             echo "Failed to find python3.12. Checking installation locations:"
             ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
@@ -57,143 +63,70 @@ jobs:
           pip install --upgrade pip
           pip install -e .
           pip install boto3==1.35.76
-      - name: Configure system
-        run: |
-          # Disable all power management and performance throttling
-          sudo pmset -a lessbright 0
-          sudo pmset -a disablesleep 1
-          sudo pmset -a sleep 0
-          sudo pmset -a hibernatemode 0
-          sudo pmset -a autopoweroff 0
-          sudo pmset -a standby 0
-          sudo pmset -a powernap 0
-          sudo pmset -a proximitywake 0
-          sudo pmset -a tcpkeepalive 1
-          sudo pmset -a powermode 1
-          sudo pmset -a gpuswitch 2
-          
-          # Optimize GPU memory allocation
-          sudo sysctl -w kern.memorystatus_purge_on_warning=0
-          sudo sysctl -w kern.memorystatus_purge_on_critical=0
-          
-          # Additional performance optimizations
-          sudo sysctl -w kern.timer.coalescing_enabled=0
-          
-          # Optimize Metal performance
-          defaults write com.apple.CoreML MPSEnableGPUValidation -bool false
-          defaults write com.apple.CoreML MPSEnableMetalValidation -bool false
-          defaults write com.apple.CoreML MPSEnableGPUDebug -bool false
-          defaults write com.apple.Metal GPUDebug -bool false
-          defaults write com.apple.Metal GPUValidation -bool false
-          defaults write com.apple.Metal MetalValidation -bool false
-          defaults write com.apple.Metal MetalCaptureEnabled -bool false
-          defaults write com.apple.Metal MTLValidationBehavior -string "Disabled"
-          defaults write com.apple.Metal EnableMTLDebugLayer -bool false
-          defaults write com.apple.Metal MTLDebugLevel -int 0
-          defaults write com.apple.Metal PreferIntegratedGPU -bool false
-          defaults write com.apple.Metal ForceMaximumPerformance -bool true
-
-          ./configure_mlx.sh
 
-          # Check final state
-          echo "Final system state:"
-          pmset -g
-          sysctl iogpu
-          sysctl kern.memorystatus_purge_on_warning
-          sysctl kern.memorystatus_purge_on_critical
-      - name: Configure process limits
+      - name: Apply Performance Optimizations
         run: |
-          # Increase resource limits
-          sudo launchctl limit maxfiles 524288 524288
-          ulimit -n 524288
-          
-          # Disable core dumps
-          ulimit -c 0
+          # Make the script executable and run it
+          chmod +x .github/optimize_performance.sh
+          ./.github/optimize_performance.sh
           
-          # Set max locked memory to unlimited
-          ulimit -l unlimited
+          # Source the performance environment variables
+          source /tmp/performance_env.sh
           
-          # Set process priority using macOS commands
-          CURRENT_PID=$PPID
-          sudo renice -n -20 $CURRENT_PID || true
+          # Additional runtime optimizations
+          sudo sysctl -w kern.iogpu.dynamic_memory_management=0
           
-          # Set high performance I/O policy
-          sudo taskpolicy -d 0 -p $CURRENT_PID || true
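+          # Set process scheduling. NOTE(review): per taskpolicy(8), -b takes no argument and applies background (PRIO_DARWIN_BG) priority, so "PERFORMANCE" is parsed as the program to run - confirm intent.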
+          sudo taskpolicy -b PERFORMANCE
           
-          # Set Metal environment variables
-          export METAL_DEVICE_WRAPPER_TYPE=1
-          export METAL_DEBUG_ERROR_MODE=0
-          export METAL_FORCE_PERFORMANCE_MODE=1
-          export METAL_DEVICE_PRIORITY=high
-          export METAL_MAX_COMMAND_QUEUES=1024
-          export METAL_LOAD_LIMIT=0
-          export METAL_VALIDATION_ENABLED=0
-          export METAL_ENABLE_VALIDATION_LAYER=0
-          export OBJC_DEBUG_MISSING_POOLS=NO
-          
-          # MLX optimizations
-          export MLX_USE_GPU=1
-          export MLX_METAL_COMPILE_ASYNC=1
-          export MLX_METAL_PREALLOCATE=1
-          export MLX_METAL_MEMORY_GUARD=0
-          export MLX_METAL_CACHE_KERNELS=1
-          export MLX_PLACEMENT_POLICY=metal
-          export MLX_METAL_VALIDATION=0
-          export MLX_METAL_DEBUG=0
-          export MLX_FORCE_P_CORES=1
-          export MLX_METAL_MEMORY_BUDGET=0
-          export MLX_METAL_PREWARM=1
+          # Verify optimizations
+          echo "Verifying performance settings..."
+          pmset -g
+          sysctl iogpu
+          env | grep -E "MLX_|METAL_|MTL_"
+
       - name: Run exo
         env:
           aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
           aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
         run: |
+          # Source performance environment variables
+          source /tmp/performance_env.sh
+          
           # Debug information
           echo "Current commit SHA: $GITHUB_SHA"
           git rev-parse HEAD
           git status
           
-          # List existing exo processes
-          echo "Existing exo processes:"
-          ps aux | grep exo || true
-
           CALLING_JOB="${{ inputs.calling_job_name }}"
           UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
           ALL_NODE_IDS=$(for i in $(seq ${{ strategy.job-total }} -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
           MY_NODE_ID="${UNIQUE_JOB_ID}_${{ strategy.job-index }}"
-          echo "Starting exo process with:"
-          echo "MY_NODE_ID: ${MY_NODE_ID}"
-          echo "ALL_NODE_IDS: ${ALL_NODE_IDS}"
-          echo "Total expected nodes: ${{ strategy.job-total }}"
-
+          
           source .venv/bin/activate
           export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
           
-          # Check installed exo version
-          pip show exo
-          which .venv/bin/exo
-
           echo "Starting exo daemon..."
-          # Set process scheduling priority
+          # Set high priority and performance mode
           sudo renice -n -20 $$ || true
-          sudo taskpolicy -d 0 $$ || true
+          sudo taskpolicy -b PERFORMANCE $$ || true
           
-          # Start exo with inherited priority and performance settings
-          sudo taskpolicy -d 0 .venv/bin/exo \
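+          # Start exo with performance optimizations. NOTE(review): per taskpolicy(8), "-b PERFORMANCE" makes PERFORMANCE the program and .venv/bin/exo its argument - verify exo actually launches.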
+          sudo taskpolicy -b PERFORMANCE .venv/bin/exo \
             --node-id="${MY_NODE_ID}" \
             --node-id-filter="${ALL_NODE_IDS}" \
             --interface-type-filter="Ethernet" \
             --chatgpt-api-port 52415 > output1.log 2>&1 &
           PID1=$!
           
-          # Set process priority using macOS-specific commands
+          # Set process and thread priorities
           sudo renice -n -20 -p $PID1 || true
-          sudo taskpolicy -d 0 -p $PID1 || true
+          sudo taskpolicy -b PERFORMANCE -p $PID1 || true
           
           # Set thread priority for all Python threads
           for tid in $(ps -M $PID1 | grep Python | awk '{print $2}'); do
             sudo renice -n -20 -p $tid || true
-            sudo taskpolicy -d 0 -p $tid || true
+            sudo taskpolicy -b PERFORMANCE -p $tid || true
           done
 
           echo "Exo process started with PID: $PID1"
@@ -240,13 +173,14 @@ jobs:
             done
           fi
 
-      # Add system state check
-      - name: Check System State
+      - name: Check Final System State
+        if: always()
         run: |
+          echo "=== Final System State ==="
           sudo pmset -g
-          sudo powermetrics -n 1 -i 1000 --show-process-energy
+          sudo powermetrics -n 1 -i 1000 --show-process-energy || true
           system_profiler SPDisplaysDataType
           sysctl iogpu
           ps -eo pid,ppid,user,%cpu,%mem,nice,state,command | grep -i python
-          sudo launchctl list | grep github
-          env | grep -E "MLX|METAL"
+          env | grep -E "MLX_|METAL_|MTL_"
+          echo "=== End Final System State ==="