# bench_job.yml
  1. # This is the reusable workflow file
  2. name: Distributed Job Runner
  3. on:
  4. workflow_call:
  5. inputs:
  6. config:
  7. required: true
  8. type: string
  9. model:
  10. required: true
  11. type: string
  12. calling_job_name:
  13. required: true
  14. type: string
  15. network_interface:
  16. required: true
  17. type: string
  18. jobs:
  19. generate-matrix:
  20. runs-on: ubuntu-latest
  21. outputs:
  22. matrix: ${{ steps.set-matrix.outputs.matrix }}
  23. steps:
  24. - id: set-matrix
  25. env:
  26. CONFIG: ${{ inputs.config }}
  27. run: |
  28. MATRIX=$(echo $CONFIG | jq -c '{cpu: [to_entries | .[] | .key as $k | range(.value) | $k]}')
  29. echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
  # Runs once per matrix entry; each instance becomes one node of the cluster.
  run-distributed-job:
    needs: generate-matrix
    strategy:
      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    # The matrix value is used as a runner label, so each instance lands on a
    # self-hosted macOS machine carrying that label.
    runs-on:
      - self-hosted
      - macOS
      - '${{ matrix.cpu }}'
    env:
      HARDWARE_CONFIG: ${{ inputs.config }}
      model: ${{ inputs.model }}
      # Performance-related settings, visible to every step of this job.
      MTL_DEBUG_LAYER: '0'
      METAL_VALIDATION_ENABLED: '0'
      MLX_METAL_VALIDATION: '0'
      MLX_METAL_DEBUG: '0'
      MLX_FORCE_P_CORES: '1'
      MLX_METAL_PREWARM: '1'
      PYTHONOPTIMIZE: '2'
    steps:
  47. - name: Cleanup workspace
  48. run: |
  49. sudo rm -rf "$GITHUB_WORKSPACE"
  50. sudo mkdir -p "$GITHUB_WORKSPACE"
  51. sudo chown -R $(whoami):$(id -g) "$GITHUB_WORKSPACE"
  52. - uses: actions/checkout@v4
  53. - name: Install dependencies
  54. run: |
  55. export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
  56. python3.12 -m venv .venv || {
  57. echo "Failed to find python3.12. Checking installation locations:"
  58. ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
  59. exit 1
  60. }
  61. source .venv/bin/activate
  62. pip install --upgrade pip
  63. pip install -e .
  64. pip install boto3==1.35.76
  65. - name: Apply Performance Optimizations
  66. run: |
  67. # Export performance-related environment variables
  68. cat << 'EOF' > /tmp/performance_env.sh
  69. # MLX and Metal optimizations
  70. export MTL_DEBUG_LAYER=0
  71. export METAL_VALIDATION_ENABLED=0
  72. export MLX_METAL_VALIDATION=0
  73. export MLX_METAL_DEBUG=0
  74. export MLX_FORCE_P_CORES=1
  75. export MLX_METAL_PREWARM=1
  76. export PYTHONOPTIMIZE=2
  77. EOF
  78. # Source the performance environment variables
  79. source /tmp/performance_env.sh
  80. # MLX Memory Settings
  81. ./configure_mlx.sh
  82. # Verify optimizations
  83. echo "Verifying performance settings..."
  84. env | grep -E "MLX_|METAL_|MTL_"
  85. - name: Run exo
  86. env:
  87. aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
  88. aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
  89. run: |
  90. # Source performance environment variables
  91. source /tmp/performance_env.sh
  92. # Debug information
  93. echo "Current commit SHA: $GITHUB_SHA"
  94. git rev-parse HEAD
  95. git status
  96. CALLING_JOB="${{ inputs.calling_job_name }}"
  97. UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
  98. ALL_NODE_IDS=$(for i in $(seq ${{ strategy.job-total }} -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
  99. MY_NODE_ID="${UNIQUE_JOB_ID}_${{ strategy.job-index }}"
  100. source .venv/bin/activate
  101. export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
  102. echo "=== Before starting exo ==="
  103. ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | head -1
  104. ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | grep -i python
  105. echo "Starting exo daemon..."
  106. echo "Power mode settings:"
  107. sudo pmset -g
  108. # Start exo with explicit process control
  109. sudo taskpolicy -d default -g default -a -t 0 -l 0 .venv/bin/exo \
  110. --node-id="${MY_NODE_ID}" \
  111. --node-id-filter="${ALL_NODE_IDS}" \
  112. --interface-type-filter="${{ inputs.network_interface }}" \
  113. --disable-tui \
  114. --max-generate-tokens 250 \
  115. --chatgpt-api-response-timeout 900 \
  116. --chatgpt-api-port 52415 > output1.log 2>&1 &
  117. PID1=$!
  118. echo "Exo process started with PID: $PID1"
  119. tail -f output1.log &
  120. TAIL1=$!
  121. # Give process time to start
  122. sleep 2
  123. # Set additional process priorities
  124. sudo renice -n -20 -p $PID1
  125. sudo taskpolicy -t 4 -p $PID1
  126. echo "=== After starting exo ==="
  127. ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | head -1
  128. ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | grep $PID1
  129. echo "Additional process details:"
  130. sudo powermetrics -n 1 -i 1000 --show-process-energy | grep -A 5 $PID1 || true
  131. trap 'kill $TAIL1' EXIT
  132. trap 'kill $PID1' EXIT
  133. echo "Waiting for all nodes to connect..."
  134. for i in {1..20}; do
  135. echo "Attempt $i: Checking node count..."
  136. nodes=$(curl -s http://localhost:52415/topology | jq ".nodes | length")
  137. echo "Current node count: $nodes"
  138. if [ "$nodes" -eq "${{ strategy.job-total }}" ]; then
  139. echo "All nodes connected successfully!"
  140. break
  141. fi
  142. if [ $i -eq 20 ]; then
  143. echo "ERROR: Failed to connect all nodes after 20 attempts. Expected ${{ strategy.job-total }} nodes, but got $nodes"
  144. exit 1
  145. fi
  146. sleep 5
  147. done
  148. if ! kill -0 $PID1 2>/dev/null; then
  149. echo "ERROR: Instance (PID $PID1) died unexpectedly. Full log output:"
  150. cat output1.log
  151. exit 1
  152. fi
  153. if [ "${{ strategy.job-index }}" -eq "0" ]; then
  154. sleep 10
  155. echo "This is the primary node (index 0). Running benchmark..."
  156. GITHUB_JOB=$CALLING_JOB python .github/bench.py
  157. else
  158. echo "This is a secondary node (index ${{ strategy.job-index }}). Waiting for completion..."
  159. sleep 10
  160. while true; do
  161. echo "Checking if primary node is still running..."
  162. nodes=$(curl -s http://localhost:52415/topology | jq ".nodes | length")
  163. echo "Current node count: $nodes"
  164. if [ "$nodes" -lt "${{ strategy.job-total }}" ]; then
  165. echo "Primary node completed, exiting..."
  166. break
  167. fi
  168. sleep 5
  169. done
  170. fi
  171. - name: Check Final System State
  172. if: always()
  173. run: |
  174. echo "=== Final System State ==="
  175. sudo pmset -g
  176. sudo powermetrics -n 1 -i 1000 --show-process-energy || true
  177. system_profiler SPDisplaysDataType
  178. sysctl iogpu
  179. ps -eo pid,ppid,user,%cpu,%mem,nice,state,command | grep -i python
  180. env | grep -E "MLX_|METAL_|MTL_"
  181. echo "=== End Final System State ==="