---
# Reusable workflow: runs one job per requested runner, starts an exo node on
# each of them, and runs the benchmark script from the node with index 0.
name: Distributed Job Runner
on:
  workflow_call:
    inputs:
      # JSON object mapping a runner label to a count. Each label is repeated
      # `count` times in the job matrix and used as a `runs-on` label.
      config:
        required: true
        type: string
      # Exposed to the distributed job as the `model` environment variable
      # and embedded in every node id.
      model:
        required: true
        type: string
      # Name of the calling job. Used as the prefix of every node id and
      # passed to .github/bench.py as GITHUB_JOB.
      calling_job_name:
        required: true
        type: string
jobs:
  # Turns the `config` input into the matrix consumed by run-distributed-job.
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - id: set-matrix
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          # Build {"cpu": [...]} where every key of CONFIG is repeated
          # `value` times, e.g. {"a": 2, "b": 1} -> {"cpu":["a","a","b"]}.
          # CONFIG is quoted so the JSON is not word-split or glob-expanded.
          MATRIX=$(printf '%s' "$CONFIG" | jq -c '{cpu: [to_entries | .[] | .key as $k | range(.value) | $k]}')
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  # One job per matrix entry. Every job starts an exo node, waits until all
  # nodes see each other, then either runs the benchmark (index 0) or waits
  # for the cluster to shrink (all other indices).
  run-distributed-job:
    needs: generate-matrix
    strategy:
      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    runs-on: ['self-hosted', 'macOS', '${{ matrix.cpu }}']
    env:
      HARDWARE_CONFIG: ${{ inputs.config }}
      model: ${{ inputs.model }}
    steps:
      - uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # First, find where python3.12 is installed
          which python3.12 || echo "python3.12 not in PATH"
          # Add common Python installation locations to PATH
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
          # Now try to create the venv with explicit python3.12
          python3.12 -m venv .venv || {
            echo "Failed to find python3.12. Checking installation locations:"
            ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
            exit 1
          }
          source .venv/bin/activate
          pip install --upgrade pip
          pip install -e .
          pip install boto3==1.35.76

      - name: Configure system
        run: |
          # Check initial CPU state (diagnostics only, failures are ignored)
          echo "Initial CPU state:"
          sysctl machdep.cpu || true
          sysctl hw.cpufamily || true
          sysctl hw.cputype || true
          sysctl hw.logicalcpu || true
          sysctl hw.physicalcpu || true

          # Apply MLX optimizations
          ./configure_mlx.sh

          # Best-effort power settings; a failure only prints a message.
          # NOTE(review): confirm that `powermode 0` / `gpuswitch 2` are the
          # intended values for disabling throttling on these runners.
          sudo pmset -a powermode 0 2>/dev/null || echo "Failed to set power mode"
          sudo pmset -a gpuswitch 2 2>/dev/null || echo "Failed to set GPU mode"

          # Check final state
          echo "Final CPU state:"
          sysctl machdep.cpu || true
          pmset -g thermals || true
          pmset -g || true
          # Unlike the checks above, this one is allowed to fail the step.
          sysctl iogpu

      - name: Run exo
        env:
          # NOTE(review): presumably read by .github/bench.py -- confirm.
          aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
          aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
          # Expressions are passed through the environment instead of being
          # interpolated into the script text, so their values can never be
          # parsed as shell code.
          CALLING_JOB: ${{ inputs.calling_job_name }}
          JOB_TOTAL: ${{ strategy.job-total }}
          JOB_INDEX: ${{ strategy.job-index }}
        run: |
          # Debug information
          echo "Current commit SHA: $GITHUB_SHA"
          git rev-parse HEAD
          git status

          # List existing exo processes
          echo "Existing exo processes:"
          ps aux | grep exo || true

          # Node ids are "<calling job>_<model>_<run id>_<job index>".
          UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
          # Job indices run from 0 to JOB_TOTAL-1, so the list starts at
          # JOB_TOTAL-1 (not JOB_TOTAL) and counts down to 0.
          ALL_NODE_IDS=$(for i in $(seq $((JOB_TOTAL - 1)) -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
          MY_NODE_ID="${UNIQUE_JOB_ID}_${JOB_INDEX}"
          echo "Starting exo process with:"
          echo "MY_NODE_ID: ${MY_NODE_ID}"
          echo "ALL_NODE_IDS: ${ALL_NODE_IDS}"
          echo "Total expected nodes: ${JOB_TOTAL}"
          source .venv/bin/activate
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

          # Check installed exo version
          pip show exo
          which .venv/bin/exo
          echo "Starting exo daemon..."
          DEBUG=6 DEBUG_DISCOVERY=6 .venv/bin/exo \
            --node-id="${MY_NODE_ID}" \
            --node-id-filter="${ALL_NODE_IDS}" \
            --interface-type-filter="Ethernet" \
            --chatgpt-api-port 52415 > output1.log 2>&1 &
          PID1=$!
          echo "Exo process started with PID: $PID1"
          tail -f output1.log &
          TAIL1=$!
          # A second `trap ... EXIT` would replace the first one, so both
          # background processes are cleaned up by a single handler.
          trap 'kill "$PID1" "$TAIL1" 2>/dev/null || true' EXIT

          # Prints the number of nodes in the local topology, or 0 when the
          # API is unreachable or does not return a plain number.
          count_nodes() {
            local count
            count=$(curl -s http://localhost:52415/topology | jq ".nodes | length" 2>/dev/null || true)
            case "$count" in
              '' | *[!0-9]*) count=0 ;;
            esac
            echo "$count"
          }

          # Dumps the log and fails the step if the exo process has exited.
          fail_if_exo_died() {
            if ! kill -0 "$PID1" 2>/dev/null; then
              echo "ERROR: Instance (PID $PID1) died unexpectedly. Full log output:"
              cat output1.log
              exit 1
            fi
          }

          echo "Waiting for all nodes to connect..."
          connected=false
          for i in {1..100}; do
            # Fail fast instead of polling a dead process for the full wait.
            fail_if_exo_died
            echo "Attempt $i: Checking node count..."
            nodes=$(count_nodes)
            echo "Current node count: $nodes"
            if [ "$nodes" -eq "$JOB_TOTAL" ]; then
              echo "All nodes connected successfully!"
              connected=true
              break
            fi
            sleep 5
          done
          fail_if_exo_died
          if [ "$connected" != true ]; then
            # Do not benchmark a partial cluster.
            echo "ERROR: Timed out waiting for ${JOB_TOTAL} nodes (last count: ${nodes})."
            exit 1
          fi

          if [ "$JOB_INDEX" -eq 0 ]; then
            sleep 10
            echo "This is the primary node (index 0). Running benchmark..."
            GITHUB_JOB="$CALLING_JOB" python .github/bench.py
          else
            echo "This is a secondary node (index ${JOB_INDEX}). Waiting for completion..."
            sleep 10
            # Stay alive until the cluster shrinks. An unreachable local API
            # counts as 0 nodes, so this loop cannot spin forever.
            while true; do
              echo "Checking if primary node is still running..."
              nodes=$(count_nodes)
              echo "Current node count: $nodes"
              if [ "$nodes" -lt "$JOB_TOTAL" ]; then
                echo "Primary node completed, exiting..."
                break
              fi
              sleep 5
            done
          fi