
Merge pull request #114 from exo-explore/circleci-project-setup

Circleci project setup
Alex Cheema · 9 months ago
parent commit e3a524fd89
3 changed files with 182 additions and 202 deletions
  1. .circleci/config.yml  (+180 −0)
  2. .github/workflows/test.yml  (+0 −200)
  3. exo/api/chatgpt_api.py  (+2 −2)

+ 180 - 0
.circleci/config.yml

@@ -0,0 +1,180 @@
+version: 2.1
+
+orbs:
+  python: circleci/python@2
+
+jobs:
+  unit_test:
+    macos:
+      xcode: "15.2.0"
+    resource_class: macos.m1.medium.gen1
+    steps:
+      - checkout
+      - run:
+          name: Set up Python
+          command: |
+            brew install python@3.12
+            python3.12 -m venv env
+            source env/bin/activate
+      - run:
+          name: Install dependencies
+          command: |
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install .
+      - run:
+          name: Run tests
+          command: |
+            source env/bin/activate
+            METAL_XCODE=1 python3 -m exo.inference.test_inference_engine
+
+  discovery_integration_test:
+    macos:
+      xcode: "15.2.0"
+    steps:
+      - checkout
+      - run:
+          name: Set up Python
+          command: |
+            brew install python@3.12
+            python3.12 -m venv env
+            source env/bin/activate
+      - run:
+          name: Install dependencies
+          command: |
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install .
+      - run:
+          name: Run discovery integration test
+          command: |
+            source env/bin/activate
+            DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
+            PID1=$!
+            DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
+            PID2=$!
+            sleep 10
+            kill $PID1 $PID2
+            if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
+              echo "Test passed: Both instances discovered each other"
+              exit 0
+            else
+              echo "Test failed: Devices did not discover each other"
+              echo "Output of first instance:"
+              cat output1.log
+              echo "Output of second instance:"
+              cat output2.log
+              exit 1
+            fi
+
+  chatgpt_api_integration_test:
+    macos:
+      xcode: "15.2.0"
+    steps:
+      - checkout
+      - run:
+          name: Set up Python
+          command: |
+            brew install python@3.12
+            python3.12 -m venv env
+            source env/bin/activate
+      - run:
+          name: Install dependencies
+          command: |
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install .
+      - run:
+          name: Run chatgpt api integration test
+          command: |
+            source env/bin/activate
+
+            # Start first instance
+            DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --inference-engine mlx --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout-secs 900 > output1.log 2>&1 &
+            PID1=$!
+
+            # Start second instance
+            DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --inference-engine mlx --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout-secs 900 > output2.log 2>&1 &
+            PID2=$!
+
+            # Wait for discovery
+            sleep 10
+
+            # Function to check if processes are still running
+            check_processes() {
+              if ! kill -0 $PID1 2>/dev/null; then
+                echo "First instance (PID $PID1) died unexpectedly. Log output:"
+                cat output1.log
+                exit 1
+              fi
+              if ! kill -0 $PID2 2>/dev/null; then
+                echo "Second instance (PID $PID2) died unexpectedly. Log output:"
+                cat output2.log
+                exit 1
+              fi
+            }
+
+            # Check processes before proceeding
+            check_processes
+
+            # First request: placeholder prompt to trigger model load
+            curl -s http://localhost:8000/v1/chat/completions \
+                -H "Content-Type: application/json" \
+                -d '{
+                  "model": "llama-3.1-8b",
+                  "messages": [{"role": "user", "content": "Keep responses concise. Placeholder to load model..."}],
+                  "temperature": 0.7
+                }'
+
+            # Check processes after model load
+            check_processes
+
+            response_1=$(curl -s http://localhost:8000/v1/chat/completions \
+              -H "Content-Type: application/json" \
+              -d '{
+                "model": "llama-3.1-8b",
+                "messages": [{"role": "user", "content": "Keep responses concise. Who was the king of pop?"}],
+                "temperature": 0.7
+              }')
+            echo "Response 1: $response_1"
+
+            # Check processes after first response
+            check_processes
+
+            response_2=$(curl -s http://localhost:8000/v1/chat/completions \
+              -H "Content-Type: application/json" \
+              -d '{
+                "model": "llama-3.1-8b",
+                "messages": [{"role": "user", "content": "Keep responses concise. Who was the king of pop?"}],
+                "temperature": 0.7
+              }')
+            echo "Response 2: $response_2"
+
+            # Check processes after second response
+            check_processes
+
+            # Stop both instances
+            kill $PID1 $PID2
+
+            echo ""
+            if ! echo "$response_1" | grep -q "Michael Jackson" || ! echo "$response_2" | grep -q "Michael Jackson"; then
+              echo "Test failed: Response does not contain 'Michael Jackson'"
+              echo "Response 1: $response_1"
+              echo ""
+              echo "Response 2: $response_2"
+              echo "Output of first instance:"
+              cat output1.log
+              echo "Output of second instance:"
+              cat output2.log
+              exit 1
+            else
+              echo "Test passed: Response from both nodes contains 'Michael Jackson'"
+            fi
+
+workflows:
+  version: 2
+  build_and_test:
+    jobs:
+      - unit_test
+      - discovery_integration_test
+      - chatgpt_api_integration_test
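
The new config can be sanity-checked before pushing. A minimal local check, assuming the CircleCI CLI is installed on the development machine (it is not part of this commit):

    circleci config validate .circleci/config.yml

This catches YAML and schema errors in the 2.1 config; the macOS jobs themselves still only run on CircleCI's hosted Apple Silicon resource class.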

+ 0 - 200
.github/workflows/test.yml

@@ -1,200 +0,0 @@
-name: Python Tests on M1 Mac
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  unit_test:
-    runs-on: macos-14
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.12'
-
-    - name: Cache huggingface hub models
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/huggingface/hub
-        key: ${{ runner.os }}-huggingface-hub-${{ hashFiles('~/.cache/huggingface/hub/**/*') }}-${{ github.job }}
-
-    - name: Install dependencies
-      run: |
-        python3 -m pip install --upgrade pip
-        pip install .
-
-    - name: Run tests
-      run: |
-        # Check if cached files are present
-        ls ~/.cache/huggingface/hub/models--mlx-community--Meta-Llama-3-8B-Instruct-4bit/**/* || true
-
-        # Run unit tests
-        METAL_XCODE=1 python3 -m exo.inference.test_inference_engine
-
-  discovery_integration_test:
-    runs-on: macos-latest
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.x'
-
-    - name: Install dependencies
-      run: |
-        python3 -m pip install --upgrade pip
-        pip install .
-
-    - name: Run discovery integration test
-      run: |
-        # Start first instance
-        DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
-        PID1=$!
-
-        # Start second instance
-        DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
-        PID2=$!
-
-        # Wait for discovery
-        sleep 10
-
-        # Stop both instances
-        kill $PID1 $PID2
-
-        # Check outputs
-        if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
-          echo "Test passed: Both instances discovered each other"
-          exit 0
-        else
-          echo "Test failed: Devices did not discover each other"
-          echo "Output of first instance:"
-          cat output1.log
-          echo "Output of second instance:"
-          cat output2.log
-          exit 1
-        fi
-
-  chatgpt_api_integration_test:
-    runs-on: macos-latest
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.x'
-
-    - name: Cache huggingface hub models
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/huggingface/hub
-        key: ${{ runner.os }}-huggingface-hub-${{ hashFiles('~/.cache/huggingface/hub/**/*') }}-${{ github.job }}
-        restore-keys: |
-          ${{ runner.os }}-huggingface-hub-
-
-    - name: Cache tinygrad downloaded models
-      uses: actions/cache@v3
-      with:
-        path: ~/Library/Caches/tinygrad/downloads
-        key: ${{ runner.os }}-tinygrad-downloads-${{ hashFiles('~/Library/Caches/tinygrad/downloads/**/*') }}-${{ github.job }}
-        restore-keys: |
-          ${{ runner.os }}-tinygrad-downloads-
-
-    - name: Install dependencies
-      run: |
-        python3 -m pip install --upgrade pip
-        pip install .
-
-    - name: Run chatgpt api integration test
-      run: |
-        exit 0 # TODO
-        # Check if cached files are present
-        ls ~/.cache/huggingface/hub/models--mlx-community--Meta-Llama-3-8B-Instruct-4bit/**/* || true
-
-        # Start first instance
-        DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --inference-engine mlx --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout-secs 900 > output1.log 2>&1 &
-        PID1=$!
-
-        # Start second instance
-        DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --inference-engine mlx --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout-secs 900 > output2.log 2>&1 &
-        PID2=$!
-
-        # Wait for discovery
-        sleep 10
-
-        # Function to check if processes are still running
-        check_processes() {
-          if ! kill -0 $PID1 2>/dev/null; then
-            echo "First instance (PID $PID1) died unexpectedly. Log output:"
-            cat output1.log
-            exit 1
-          fi
-          if ! kill -0 $PID2 2>/dev/null; then
-            echo "Second instance (PID $PID2) died unexpectedly. Log output:"
-            cat output2.log
-            exit 1
-          fi
-        }
-
-        # Check processes before proceeding
-        check_processes
-
-        # first one to load the model
-        curl -s http://localhost:8000/v1/chat/completions \
-            -H "Content-Type: application/json" \
-            -d '{
-              "model": "llama-3-8b",
-              "messages": [{"role": "user", "content": "Keep responses concise. Placeholder to load model..."}],
-              "temperature": 0.7
-            }'
-
-        # Check processes after model load
-        check_processes
-
-        response_1=$(curl -s http://localhost:8000/v1/chat/completions \
-          -H "Content-Type: application/json" \
-          -d '{
-            "model": "llama-3-8b",
-            "messages": [{"role": "user", "content": "Keep responses concise. Who was the king of pop?"}],
-            "temperature": 0.7
-          }')
-        echo "Response 1: $response_1"
-
-        # Check processes after first response
-        check_processes
-
-        response_2=$(curl -s http://localhost:8000/v1/chat/completions \
-          -H "Content-Type: application/json" \
-          -d '{
-            "model": "llama-3-8b",
-            "messages": [{"role": "user", "content": "Keep responses concise. Who was the king of pop?"}],
-            "temperature": 0.7
-          }')
-        echo "Response 2: $response_2"
-
-        # Check processes after second response
-        check_processes
-
-        # Stop both instances
-        kill $PID1 $PID2
-
-        echo ""
-        if ! echo "$response_1" | grep -q "Michael Jackson" || ! echo "$response_2" | grep -q "Michael Jackson"; then
-          echo "Test failed: Response does not contain 'Michael Jackson'"
-          echo "Response 1: $response_1"
-          echo ""
-          echo "Response 2: $response_2"
-          echo "Output of first instance:"
-          cat output1.log
-          echo "Output of second instance:"
-          cat output2.log
-          exit 1
-        else
-          echo "Test passed: Response from both nodes contains 'Michael Jackson'"
-        fi
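
One behavioral difference worth noting: the deleted GitHub workflow cached the Hugging Face hub directory between runs, while the new CircleCI config re-downloads models every time. If equivalent caching is wanted, CircleCI's restore_cache/save_cache steps are the usual mechanism; a hedged sketch (the cache key name is illustrative, not from this commit):

      - restore_cache:
          keys:
            - huggingface-hub-v1
      # ... run the test steps ...
      - save_cache:
          key: huggingface-hub-v1
          paths:
            - ~/.cache/huggingface/hub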

+ 2 - 2
exo/api/chatgpt_api.py

@@ -307,9 +307,9 @@ class ChatGPTAPI:
     prompt, image_str = build_prompt(tokenizer, chat_request.messages)
     request_id = None
     match = self.prompts.find_longest_prefix(prompt)
-    if match:
+    if match and len(prompt) > len(match[1].prompt):
         if DEBUG >= 2:
-            print(f"Prompt for request starts with previous prompt {len(match[1].prompt)} of {len(prompt)}: {match[1].prompt}")
+          print(f"Prompt for request starts with previous prompt {len(match[1].prompt)} of {len(prompt)}: {match[1].prompt}")
         request_id = match[1].request_id
         self.prompts.add(prompt, PromptSession(request_id=request_id, timestamp=int(time.time()), prompt=prompt))
         # remove the matching prefix from the prompt
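
The added guard, len(prompt) > len(match[1].prompt), makes session reuse apply only when the previously cached prompt is a strict prefix of the new one. Without it, a repeated identical prompt would match itself, and removing the "prefix" would leave an empty prompt to run inference on. A minimal sketch of the intended behavior, with simplified stand-ins for exo's PromptSession and prefix store (all names here are illustrative):

    from dataclasses import dataclass

    @dataclass
    class PromptSession:  # simplified stand-in, not the real exo class
        request_id: str
        prompt: str

    def resume_if_strict_prefix(prompt: str, cached: PromptSession | None):
        # Old check was just `if cached:`; the stricter guard treats a prompt
        # identical to the cached one as a fresh request instead of reducing
        # it to an empty remainder.
        if cached and len(prompt) > len(cached.prompt) and prompt.startswith(cached.prompt):
            return cached.request_id, prompt[len(cached.prompt):]  # only the new suffix runs
        return None, prompt

    session = PromptSession(request_id="req-1", prompt="Hello")
    assert resume_if_strict_prefix("Hello world", session) == ("req-1", " world")
    assert resume_if_strict_prefix("Hello", session) == (None, "Hello")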