# Reusable workflow: starts one `exo` node per matrix entry on self-hosted
# macOS runners, waits until the nodes see each other, then runs the
# benchmark script from the node with job index 0.
name: Distributed Job Runner

on:
  workflow_call:
    inputs:
      # JSON object mapping a runner label to the number of jobs to start on
      # runners carrying that label, e.g. {"A": 2, "B": 1} yields the matrix
      # {"cpu": ["A", "A", "B"]}.
      config:
        required: true
        type: string
      # Exported to every step of run-distributed-job as the `model`
      # environment variable (presumably read by .github/bench.py - confirm).
      model:
        required: true
        type: string
      # Name of the calling job; used as the prefix of the exo node IDs and
      # passed to the benchmark script as GITHUB_JOB.
      calling_job_name:
        required: true
        type: string

jobs:
  # Expands the `config` input into a job matrix with one entry per node.
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - id: set-matrix
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          # printf with a quoted variable keeps the JSON intact; an unquoted
          # `echo $CONFIG` would be word-split and glob-expanded by the shell.
          MATRIX=$(printf '%s' "$CONFIG" | jq -c '{cpu: [to_entries | .[] | .key as $k | range(.value) | $k]}')
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  # Runs one exo node per matrix entry; the runner is selected by the
  # label stored in matrix.cpu.
  run-distributed-job:
    needs: generate-matrix
    strategy:
      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    runs-on: ['self-hosted', 'macOS', '${{ matrix.cpu }}']
    env:
      HARDWARE_CONFIG: ${{ inputs.config }}
      model: ${{ inputs.model }}
    steps:
      - uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # First, find where python3.12 is installed
          which python3.12 || echo "python3.12 not in PATH"
          # Add common Python installation locations to PATH
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
          # Now try to create the venv with explicit python3.12
          python3.12 -m venv env || {
            echo "Failed to find python3.12. Checking installation locations:"
            ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
            exit 1
          }
          source env/bin/activate
          pip install --upgrade pip
          pip install .
          pip install boto3==1.35.76

      - name: Run exo
        env:
          # AWS credentials (presumably consumed by .github/bench.py - confirm).
          aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
          aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
          # Workflow expressions are handed to the script through the
          # environment instead of being pasted into the shell source, so
          # their values can never be interpreted as shell code.
          CALLING_JOB: ${{ inputs.calling_job_name }}
          JOB_TOTAL: ${{ strategy.job-total }}
          JOB_INDEX: ${{ strategy.job-index }}
        run: |
          UNIQUE_JOB_ID="${CALLING_JOB}_${GITHUB_RUN_ID}"
          # Job indexes run from 0 to JOB_TOTAL-1, so exactly JOB_TOTAL node
          # IDs are listed in the filter.
          ALL_NODE_IDS=$(for i in $(seq $((JOB_TOTAL - 1)) -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
          MY_NODE_ID="${UNIQUE_JOB_ID}_${JOB_INDEX}"
          source env/bin/activate
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

          # Start exo in the background and stream its log into the job output.
          exo --node-id="${MY_NODE_ID}" --node-id-filter="${ALL_NODE_IDS}" --chatgpt-api-port 52415 > output1.log 2>&1 &
          PID1=$!
          tail -f output1.log &
          TAIL1=$!
          # A single EXIT trap stops both background processes; registering
          # a second EXIT trap would replace the first one.
          trap 'kill "$TAIL1" "$PID1" 2>/dev/null || true' EXIT

          # Wait (up to 100 polls, 5 s apart) until the topology reported by
          # the local exo API contains every node of this run.
          nodes=0
          for i in {1..100}; do
            nodes=$(curl -sS http://localhost:52415/topology | jq ".nodes | length")
            # An empty reply (API not up yet) counts as zero nodes.
            nodes=${nodes:-0}
            if [ "$nodes" -eq "$JOB_TOTAL" ]; then
              break
            fi
            sleep 5
          done
          if [ "$nodes" -ne "$JOB_TOTAL" ]; then
            echo "Warning: topology reports $nodes of $JOB_TOTAL nodes after waiting; continuing anyway."
          fi

          if ! kill -0 $PID1 2>/dev/null; then
            echo "Instance (PID $PID1) died unexpectedly. Log output:"
            cat output1.log
            exit 1
          fi

          if [ "$JOB_INDEX" -eq "0" ]; then
            # Only the first node drives the benchmark.
            GITHUB_JOB=$CALLING_JOB python .github/bench.py
          else
            # Every other node stays up until the topology shrinks, i.e.
            # until some node (normally node 0, once it is done) has left.
            sleep 10
            while true; do
              nodes=$(curl -sS http://localhost:52415/topology | jq ".nodes | length")
              # Treat an unreachable local API as zero nodes so this loop
              # ends instead of spinning until the job timeout.
              if [ "${nodes:-0}" -lt "$JOB_TOTAL" ]; then
                break
              fi
              sleep 5
            done
          fi