lqb
/
exo
spiegel van https://github.com/exo-explore/exo


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
# Reusable workflow: other workflows invoke it through `workflow_call`.
name: Distributed Job Runner

on:
  workflow_call:
    inputs:
      # JSON object fed to jq in `generate-matrix`: each key is repeated
      # <value> times in the matrix and is then used as a runner label.
      config:
        type: string
        required: true
      # Exposed to the job as the `model` environment variable and embedded
      # in the per-run node IDs.
      model:
        type: string
        required: true
      # Name of the calling job; embedded in the node IDs and handed to
      # .github/bench.py as GITHUB_JOB.
      calling_job_name:
        type: string
        required: true
      # Value passed to exo's --interface-type-filter option.
      network_interface:
        type: string
        required: true
jobs:
  # Turns the `config` input into the matrix consumed by run-distributed-job.
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - id: set-matrix
        env:
          CONFIG: ${{ inputs.config }}
        run: |
          # Repeat every key of CONFIG <value> times under "cpu", e.g.
          #   {"a": 2, "b": 1}  ->  {"cpu":["a","a","b"]}
          # CONFIG is quoted so the JSON reaches jq verbatim instead of being
          # word-split and glob-expanded by the shell.
          MATRIX=$(jq -c '{cpu: [to_entries | .[] | .key as $k | range(.value) | $k]}' <<< "$CONFIG")
          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

  # Runs once per entry of the generated matrix; each instance is one exo node.
  run-distributed-job:
    needs: generate-matrix
    strategy:
      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    # The matrix value selects the runner through its label.
    runs-on:
      - 'self-hosted'
      - 'macOS'
      - '${{ matrix.cpu }}'
    env:
      HARDWARE_CONFIG: ${{ inputs.config }}
      model: ${{ inputs.model }}
      # Performance-related environment variables (written as strings, which
      # is how environment variables are delivered to the steps anyway).
      MTL_DEBUG_LAYER: '0'
      METAL_VALIDATION_ENABLED: '0'
      MLX_METAL_VALIDATION: '0'
      MLX_METAL_DEBUG: '0'
      MLX_FORCE_P_CORES: '1'
      MLX_METAL_PREWARM: '1'
      PYTHONOPTIMIZE: '2'
    steps:
      # Wipe and recreate the workspace, then hand ownership of the
      # sudo-created directory back to the current user and group.
      - name: Cleanup workspace
        run: |
          owner="$(whoami):$(id -g)"
          sudo rm -rf "$GITHUB_WORKSPACE"
          sudo mkdir -p "$GITHUB_WORKSPACE"
          sudo chown -R "$owner" "$GITHUB_WORKSPACE"

      # Check out the repository using the action's default settings.
      - uses: actions/checkout@v4

      # Creates a Python 3.12 virtualenv in .venv and installs the project
      # (editable) plus a pinned boto3 into it.
      - name: Install dependencies
        run: |
          # Look in /usr/local/bin and /opt/homebrew/bin first when resolving python3.12.
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

          if ! python3.12 -m venv .venv; then
            echo "Failed to find python3.12. Checking installation locations:"
            # Best-effort listing to help debugging; never masks the real failure.
            ls -l /usr/local/bin/python* /opt/homebrew/bin/python* 2>/dev/null || true
            exit 1
          fi

          . .venv/bin/activate
          pip install --upgrade pip
          pip install -e .
          pip install boto3==1.35.76

      # Writes the performance settings to a file under /tmp (the "Run exo"
      # step sources the same file), runs configure_mlx.sh and prints the
      # resulting MLX/Metal environment.
      - name: Apply Performance Optimizations
        run: |
          perf_env=/tmp/performance_env.sh

          # Export performance-related environment variables
          cat > "$perf_env" << 'EOF'
          # MLX and Metal optimizations
          export MTL_DEBUG_LAYER=0
          export METAL_VALIDATION_ENABLED=0
          export MLX_METAL_VALIDATION=0
          export MLX_METAL_DEBUG=0
          export MLX_FORCE_P_CORES=1
          export MLX_METAL_PREWARM=1
          export PYTHONOPTIMIZE=2
          EOF

          # Load the settings into this step's shell
          . "$perf_env"

          # MLX Memory Settings
          ./configure_mlx.sh

          # Verify optimizations
          echo "Verifying performance settings..."
          env | grep -E "MLX_|METAL_|MTL_"

      # Starts one exo node for this matrix entry, waits until every node of
      # the run has joined the topology, then either runs the benchmark
      # (matrix index 0) or waits for the cluster to shrink (other indices).
      - name: Run exo
        env:
          aws_access_key_id: ${{ secrets.S3_EXO_BENCHMARKS_AWS_ACCESS_KEY_ID }}
          aws_secret_key: ${{ secrets.S3_EXO_BENCHMARKS_AWS_SECRET_ACCESS_KEY }}
          # Workflow inputs are passed through the environment instead of being
          # interpolated into the script text, so their contents can never be
          # parsed as shell code.
          CALLING_JOB: ${{ inputs.calling_job_name }}
          NETWORK_INTERFACE: ${{ inputs.network_interface }}
        run: |
          # Source performance environment variables
          source /tmp/performance_env.sh

          # Debug information
          echo "Current commit SHA: $GITHUB_SHA"
          git rev-parse HEAD
          git status

          # Node IDs have the form <calling job>_<model>_<run id>_<matrix index>.
          UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
          # Matrix indices run from 0 to job-total - 1, so the allow-list stops
          # at job-total - 1 (listed highest first, comma separated).
          ALL_NODE_IDS=$(for i in $(seq $(( ${{ strategy.job-total }} - 1 )) -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
          MY_NODE_ID="${UNIQUE_JOB_ID}_${{ strategy.job-index }}"

          # Print the node count reported by the local topology endpoint, or 0
          # when the API is unreachable or answers with something that is not a
          # number. This keeps the numeric comparisons below well-defined.
          node_count() {
            local count
            count=$(curl -s http://localhost:52415/topology | jq ".nodes | length" || true)
            case "$count" in
              ''|*[!0-9]*) count=0 ;;
            esac
            echo "$count"
          }

          # Stop both background processes when the script exits. A single
          # handler is used because a second `trap ... EXIT` would replace the
          # first one.
          # NOTE(review): exo is launched through sudo, so a plain kill may be
          # refused; fall back to sudo in that case - confirm on the runners.
          cleanup() {
            kill "$TAIL1" 2>/dev/null || true
            kill "$PID1" 2>/dev/null || sudo kill "$PID1" 2>/dev/null || true
          }

          source .venv/bin/activate
          export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"

          echo "=== Before starting exo ==="
          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | head -1
          # Diagnostic only: "no match" must not abort the step.
          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | grep -i python || true

          echo "Starting exo daemon..."

          echo "Power mode settings:"
          sudo pmset -g

          # Start exo with explicit process control
          sudo taskpolicy -d default -g default -a -t 0 -l 0 .venv/bin/exo \
            --node-id="${MY_NODE_ID}" \
            --node-id-filter="${ALL_NODE_IDS}" \
            --interface-type-filter="${NETWORK_INTERFACE}" \
            --disable-tui \
            --max-generate-tokens 250 \
            --chatgpt-api-response-timeout 900 \
            --chatgpt-api-port 52415 > output1.log 2>&1 &
          PID1=$!

          echo "Exo process started with PID: $PID1"
          tail -f output1.log &
          TAIL1=$!

          # Installed as soon as both PIDs are known so that any later failure
          # still cleans up.
          trap cleanup EXIT

          # Give process time to start
          sleep 2

          # Set additional process priorities
          sudo renice -n -20 -p $PID1
          sudo taskpolicy -t 4 -p $PID1

          echo "=== After starting exo ==="
          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | head -1
          ps -eo pid,ppid,user,%cpu,%mem,nice,state,pri,command | grep "$PID1" || true

          echo "Additional process details:"
          sudo powermetrics -n 1 -i 1000 --show-process-energy | grep -A 5 $PID1 || true

          echo "Waiting for all nodes to connect..."
          for i in {1..20}; do
            echo "Attempt $i: Checking node count..."
            nodes=$(node_count)
            echo "Current node count: $nodes"
            if [ "$nodes" -eq "${{ strategy.job-total }}" ]; then
              echo "All nodes connected successfully!"
              break
            fi
            if [ $i -eq 20 ]; then
              echo "ERROR: Failed to connect all nodes after 20 attempts. Expected ${{ strategy.job-total }} nodes, but got $nodes"
              exit 1
            fi
            sleep 5
          done

          # Liveness probe; the sudo fallback covers the case where signalling
          # the sudo-started process is not permitted for the runner user.
          if ! kill -0 "$PID1" 2>/dev/null && ! sudo kill -0 "$PID1" 2>/dev/null; then
              echo "ERROR: Instance (PID $PID1) died unexpectedly. Full log output:"
              cat output1.log
              exit 1
          fi

          if [ "${{ strategy.job-index }}" -eq "0" ]; then
            sleep 10
            echo "This is the primary node (index 0). Running benchmark..."
            GITHUB_JOB="$CALLING_JOB" python .github/bench.py
          else
            echo "This is a secondary node (index ${{ strategy.job-index }}). Waiting for completion..."
            sleep 10
            # A shrinking topology is taken as the signal that the primary has
            # finished. An unreachable local API counts as 0 nodes, so this loop
            # also ends when the local instance goes away.
            while true; do
              echo "Checking if primary node is still running..."
              nodes=$(node_count)
              echo "Current node count: $nodes"
              if [ "$nodes" -lt "${{ strategy.job-total }}" ]; then
                echo "Primary node completed, exiting..."
                break
              fi
              sleep 5
            done
          fi

      # Always-run diagnostics: power settings, process energy, GPU/display
      # info, iogpu sysctls, remaining Python processes and the MLX/Metal
      # environment.
      - name: Check Final System State
        if: always()
        run: |
          echo "=== Final System State ==="
          sudo pmset -g
          sudo powermetrics -n 1 -i 1000 --show-process-energy || true
          system_profiler SPDisplaysDataType
          sysctl iogpu
          # grep exits non-zero when nothing matches; these listings are
          # informational only, so an empty result must not fail the step.
          ps -eo pid,ppid,user,%cpu,%mem,nice,state,command | grep -i python || true
          env | grep -E "MLX_|METAL_|MTL_" || true
          echo "=== End Final System State ==="