# bench_job.yml
  1. # This is the reusable workflow file
  2. name: Distributed Job Runner
  3. on:
  4. workflow_call:
  5. inputs:
  6. config:
  7. required: true
  8. type: string
  9. model:
  10. required: true
  11. type: string
  12. jobs:
  13. generate-matrix:
  14. runs-on: ubuntu-latest
  15. outputs:
  16. matrix: ${{ steps.set-matrix.outputs.matrix }}
  17. steps:
  18. - id: set-matrix
  19. env:
  20. CONFIG: ${{ inputs.config }}
  21. run: |
  22. MATRIX=$(echo $CONFIG | jq -c '{cpu: [to_entries | .[] | .key as $k | range(.value) | $k]}')
  23. echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
  24. run-distributed-job:
  25. needs: generate-matrix
  26. strategy:
  27. matrix: ${{fromJson(needs.generate-matrix.outputs.matrix)}}
  28. runs-on: ['self-hosted', 'macOS', '${{ matrix.cpu }}']
  29. env:
  30. HARDWARE_CONFIG: ${{ inputs.config }}
  31. model: ${{ inputs.model }}
  32. steps:
  33. - uses: actions/checkout@v4
  34. - name: Install dependencies
  35. run: |
  36. # First, find where python3.12 is installed
  37. which python3.12 || echo "python3.12 not in PATH"
  38. # Add common Python installation locations to PATH
  39. export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
  40. # Now try to create the venv with explicit python3.12
  41. python3.12 -m venv env || {
  42. echo "Failed to find python3.12. Checking installation locations:"
  43. ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
  44. exit 1
  45. }
  46. source env/bin/activate
  47. pip install --upgrade pip
  48. pip install .
  49. pip install boto3==1.35.76
  50. - name: Run exo
  51. env:
  52. aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
  53. aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
  54. run: |
  55. GITHUB_JOB=$(echo "${{ github.workflow_ref }}" | awk -F'/' '{print $NF}')
  56. UNIQUE_JOB_ID="${GITHUB_JOB}_${GITHUB_RUN_ID}"
  57. ALL_NODE_IDS=$(for i in $(seq ${{ strategy.job-total }} -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
  58. MY_NODE_ID="${UNIQUE_JOB_ID}_${{ strategy.job-index }}"
  59. source env/bin/activate
  60. export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
  61. exo --node-id="${MY_NODE_ID}" --node-id-filter="${ALL_NODE_IDS}" --chatgpt-api-port 52415 --disable-tui > output1.log 2>&1 &
  62. PID1=$!
  63. tail -f output1.log &
  64. TAIL1=$!
  65. trap 'kill $TAIL1' EXIT
  66. trap 'kill $PID1' EXIT
  67. for i in {1..100}; do
  68. nodes=$(curl http://localhost:52415/topology | jq ".nodes | length")
  69. if [ "$nodes" -eq "${{ strategy.job-total }}" ]; then
  70. break
  71. fi
  72. sleep 5
  73. done
  74. if ! kill -0 $PID1 2>/dev/null; then
  75. echo "Instance (PID $PID1) died unexpectedly. Log output:"
  76. cat output1.log
  77. exit 1
  78. fi
  79. if [ "${{ strategy.job-index }}" -eq "0" ]; then
  80. python .github/bench.py
  81. else
  82. sleep 10
  83. while true; do
  84. nodes=$(curl http://localhost:52415/topology | jq ".nodes | length")
  85. if [ "$nodes" -lt "${{ strategy.job-total }}" ]; then
  86. break
  87. fi
  88. sleep 5
  89. done
  90. fi