# bench_job.yml
  1. # This is the reusable workflow file
  2. name: Distributed Job Runner
  3. on:
  4. workflow_call:
  5. inputs:
  6. config:
  7. required: true
  8. type: string
  9. model:
  10. required: true
  11. type: string
  12. calling_job_name: # New input parameter
  13. required: true
  14. type: string
  15. jobs:
  # Expands the `config` input into the matrix used by `run-distributed-job`.
  # jq repeats every key of the config object once per unit of its value,
  # e.g. {"a": 2, "b": 1} -> {"cpu": ["a", "a", "b"]}.
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - id: set-matrix
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          # Quote the expansion so the JSON reaches jq byte-for-byte; an
          # unquoted $CONFIG is word-split and glob-expanded by the shell.
          MATRIX=$(printf '%s' "$CONFIG" | jq -c '{cpu: [to_entries | .[] | .key as $k | range(.value) | $k]}')
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
  # Starts one exo node per matrix entry on self-hosted macOS runners.
  # Every node launches exo and waits until the local topology reports
  # `strategy.job-total` nodes. The node with job index 0 then runs the
  # benchmark; the other nodes stay up until the node count drops again.
  run-distributed-job:
    needs: generate-matrix
    strategy:
      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    runs-on: ['self-hosted', 'macOS', '${{ matrix.cpu }}']
    env:
      HARDWARE_CONFIG: ${{ inputs.config }}
      model: ${{ inputs.model }}
    steps:
      - uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # First, find where python3.12 is installed
          which python3.12 || echo "python3.12 not in PATH"
          # Add common Python installation locations to PATH
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
          # Now try to create the venv with explicit python3.12
          python3.12 -m venv .venv || {
            echo "Failed to find python3.12. Checking installation locations:"
            ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
            exit 1
          }
          source .venv/bin/activate
          pip install --upgrade pip
          pip install -e .
          pip install boto3==1.35.76

      - name: Configure system
        run: |
          # Disable all power management and performance throttling
          sudo pmset -a lessbright 0
          sudo pmset -a disablesleep 1
          sudo pmset -a sleep 0
          sudo pmset -a hibernatemode 0
          sudo pmset -a autopoweroff 0
          sudo pmset -a standby 0
          sudo pmset -a powernap 0
          sudo pmset -a proximitywake 0
          sudo pmset -a tcpkeepalive 1
          sudo pmset -a powermode 0
          sudo pmset -a gpuswitch 2
          # Optimize GPU memory allocation
          sudo sysctl -w kern.memorystatus_purge_on_warning=0
          sudo sysctl -w kern.memorystatus_purge_on_critical=0
          ./configure_mlx.sh
          # Check final state
          echo "Final system state:"
          pmset -g
          sysctl iogpu
          sysctl kern.memorystatus_purge_on_warning
          sysctl kern.memorystatus_purge_on_critical

      - name: Run exo
        env:
          aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
          aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
          # Passed through the environment instead of being interpolated
          # into the script text, so the values cannot alter the shell code.
          CALLING_JOB: ${{ inputs.calling_job_name }}
          JOB_TOTAL: ${{ strategy.job-total }}
          JOB_INDEX: ${{ strategy.job-index }}
        run: |
          # Debug information
          echo "Current commit SHA: $GITHUB_SHA"
          git rev-parse HEAD
          git status
          # List existing exo processes
          echo "Existing exo processes:"
          ps aux | grep exo || true

          UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
          # Job indexes run from 0 to JOB_TOTAL-1, so the filter lists
          # exactly one ID per node in this matrix.
          ALL_NODE_IDS=$(for i in $(seq $((JOB_TOTAL - 1)) -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
          MY_NODE_ID="${UNIQUE_JOB_ID}_${JOB_INDEX}"
          echo "Starting exo process with:"
          echo "MY_NODE_ID: ${MY_NODE_ID}"
          echo "ALL_NODE_IDS: ${ALL_NODE_IDS}"
          echo "Total expected nodes: ${JOB_TOTAL}"

          source .venv/bin/activate
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
          # Check installed exo version
          pip show exo
          which .venv/bin/exo

          echo "Starting exo daemon..."
          # Start exo with high priority
          sudo nice -n -20 .venv/bin/exo \
            --node-id="${MY_NODE_ID}" \
            --node-id-filter="${ALL_NODE_IDS}" \
            --interface-type-filter="Ethernet" \
            --chatgpt-api-port 52415 > output1.log 2>&1 &
          PID1=$!
          echo "Exo process started with PID: $PID1"
          tail -f output1.log &
          TAIL1=$!
          # One handler for both background processes: a second `trap ... EXIT`
          # would replace the first and leave one of them running.
          trap 'kill $TAIL1 $PID1 2>/dev/null || true' EXIT

          # Prints the node count reported by the local exo API. Returns
          # non-zero (printing nothing) when the API is unreachable or the
          # answer is not a plain non-negative integer.
          node_count() {
            local count
            count=$(curl -s http://localhost:52415/topology | jq ".nodes | length" 2>/dev/null) || return 1
            case "$count" in
              '' | *[!0-9]*) return 1 ;;
            esac
            echo "$count"
          }

          echo "Waiting for all nodes to connect..."
          connected=false
          for i in {1..100}; do
            echo "Attempt $i: Checking node count..."
            if nodes=$(node_count); then
              echo "Current node count: $nodes"
              if [ "$nodes" -eq "$JOB_TOTAL" ]; then
                echo "All nodes connected successfully!"
                connected=true
                break
              fi
            else
              echo "Current node count: unavailable"
            fi
            sleep 5
          done

          if ! kill -0 $PID1 2>/dev/null; then
            echo "ERROR: Instance (PID $PID1) died unexpectedly. Full log output:"
            cat output1.log
            exit 1
          fi

          # Do not benchmark (or wait on) a cluster that never fully formed.
          if [ "$connected" != "true" ]; then
            echo "ERROR: Timed out waiting for ${JOB_TOTAL} nodes to connect."
            exit 1
          fi

          if [ "$JOB_INDEX" -eq "0" ]; then
            sleep 10
            echo "This is the primary node (index 0). Running benchmark..."
            GITHUB_JOB=$CALLING_JOB python .github/bench.py
          else
            echo "This is a secondary node (index ${JOB_INDEX}). Waiting for completion..."
            sleep 10
            while true; do
              echo "Checking if primary node is still running..."
              # Without this check a dead local exo would keep the loop
              # polling an unreachable API until the job times out.
              if ! kill -0 $PID1 2>/dev/null; then
                echo "ERROR: Instance (PID $PID1) died unexpectedly. Full log output:"
                cat output1.log
                exit 1
              fi
              if nodes=$(node_count); then
                echo "Current node count: $nodes"
                # A drop in node count is taken as the primary having finished.
                if [ "$nodes" -lt "$JOB_TOTAL" ]; then
                  echo "Primary node completed, exiting..."
                  break
                fi
              else
                echo "Current node count: unavailable"
              fi
              sleep 5
            done
          fi