Browse Source

bench logs

Alex Cheema 7 months ago
parent
commit
3687ba18df
1 changed file with 20 additions and 3 deletions
  1. 20 3
      .github/workflows/bench_job.yml

+ 20 - 3
.github/workflows/bench_job.yml

@@ -65,37 +65,54 @@ jobs:
           UNIQUE_JOB_ID="${CALLING_JOB}_${model}_${GITHUB_RUN_ID}"
           ALL_NODE_IDS=$(for i in $(seq ${{ strategy.job-total }} -1 0); do echo -n "${UNIQUE_JOB_ID}_${i},"; done | sed 's/,$//')
           MY_NODE_ID="${UNIQUE_JOB_ID}_${{ strategy.job-index }}"
+          echo "Starting exo process with:"
+          echo "MY_NODE_ID: ${MY_NODE_ID}"
+          echo "ALL_NODE_IDS: ${ALL_NODE_IDS}"
+          echo "Total expected nodes: ${{ strategy.job-total }}"
+
           source env/bin/activate
           export PATH="/usr/local/bin:/opt/homebrew/bin:$PATH"
+
+          echo "Starting exo daemon..."
           DEBUG=6 exo --node-id="${MY_NODE_ID}" --node-id-filter="${ALL_NODE_IDS}" --chatgpt-api-port 52415 > output1.log 2>&1 &
           PID1=$!
+          echo "Exo process started with PID: $PID1"
           tail -f output1.log &
           TAIL1=$!
 
           trap 'kill $TAIL1' EXIT
           trap 'kill $PID1' EXIT
 
+          echo "Waiting for all nodes to connect..."
           for i in {1..100}; do
-            nodes=$(curl http://localhost:52415/topology | jq ".nodes | length")
+            echo "Attempt $i: Checking node count..."
+            nodes=$(curl -s http://localhost:52415/topology | jq ".nodes | length")
+            echo "Current node count: $nodes"
             if [ "$nodes" -eq "${{ strategy.job-total }}" ]; then
+              echo "All nodes connected successfully!"
               break
             fi
             sleep 5
           done
 
           if ! kill -0 $PID1 2>/dev/null; then
-              echo "Instance (PID $PID1) died unexpectedly. Log output:"
+              echo "ERROR: Instance (PID $PID1) died unexpectedly. Full log output:"
               cat output1.log
               exit 1
           fi
 
           if [ "${{ strategy.job-index }}" -eq "0" ]; then
+            echo "This is the primary node (index 0). Running benchmark..."
             GITHUB_JOB=$CALLING_JOB python .github/bench.py
           else
+            echo "This is a secondary node (index ${{ strategy.job-index }}). Waiting for completion..."
             sleep 10
             while true; do
-              nodes=$(curl http://localhost:52415/topology | jq ".nodes | length")
+              echo "Checking if primary node is still running..."
+              nodes=$(curl -s http://localhost:52415/topology | jq ".nodes | length")
+              echo "Current node count: $nodes"
               if [ "$nodes" -lt "${{ strategy.job-total }}" ]; then
+                echo "Primary node completed, exiting..."
                 break
               fi
               sleep 5