@@ -1,88 +0,0 @@
-# In this example, a user is running a home cluster with 3 shards.
-# They are prompting the cluster to generate a response to a question.
-# The cluster is given the question, and the user is given the response.
-
-from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
-from exo.inference.shard import Shard
-from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
-from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
-import asyncio
-import argparse
-import time
-import uuid
-
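-# Each supported model is described by a Shard: its model_id, the layer range
-# assigned to this node (start_layer..end_layer), and the model's total n_layers.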
-models = {
-    "mlx-community/Meta-Llama-3-8B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
-    "mlx-community/Meta-Llama-3-70B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
-}
-
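-# Resolve the model path (fetching from Hugging Face if needed) and load its tokenizer.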
-path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
-model_path = get_model_path(path_or_hf_repo)
-tokenizer_config = {}
-tokenizer = load_tokenizer(model_path, tokenizer_config)
-
-# We intentionally leave out peer1 to demonstrate the equality of nodes in exo.
-# There is no "master" node in exo; all nodes are equal and can take on any role.
-# peer1 = GRPCPeerHandle(
-#     "node1",
-#     "localhost:8080",
-#     DeviceCapabilities(model="placeholder", chip="placeholder", memory=0, flops=DeviceFlops(fp32=0, fp16=0, int8=0))
-# )
-# Connect to a single known peer; since all nodes are equal, any of them can accept the prompt.
-peer2 = GRPCPeerHandle("node2", "localhost:8081", DeviceCapabilities(model="placeholder", chip="placeholder", memory=0, flops=DeviceFlops(fp32=0, fp16=0, int8=0)))
-shard = models[path_or_hf_repo]
-request_id = str(uuid.uuid4())
-
-
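-# Send the prompt to a peer and stream the generated tokens back as they arrive.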
-async def run_prompt(prompt: str):
-    # Apply the model's chat template to the raw prompt when one is available.
-    if tokenizer.chat_template is None:
-        tokenizer.chat_template = tokenizer.default_chat_template
-    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
-        messages = [{"role": "user", "content": prompt}]
-        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-    await peer2.connect()
-
-    try:
-        await peer2.send_prompt(shard, prompt, request_id)
-    except Exception as e:
-        print(f"Failed to send prompt: {e}")
-        return
-
-    # Poll for the result ten times per second. Generation is usually faster than
-    # this, but polling more often gains nothing for the user.
-    previous_length = 0
-    n_tokens = 0
-    start_time = time.perf_counter()
-    while True:
-        await asyncio.sleep(0.1)
-        try:
-            result, is_finished = await peer2.get_inference_result(request_id)
-        except Exception:
-            # The result may not be available yet; retry on the next tick.
-            continue
-
-        # Print only the newly generated portion of the decoded string.
-        updated_string = tokenizer.decode(result)
-        n_tokens = len(result)
-        print(updated_string[previous_length:], end='', flush=True)
-        previous_length = len(updated_string)
-
-        if is_finished:
-            break
-    end_time = time.perf_counter()
-    print(f"\nDone. Processed {n_tokens} tokens in {end_time - start_time:.2f} seconds ({n_tokens / (end_time - start_time):.2f} tokens/second)")
-
-
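-# Example usage (assuming this file is saved as llama3_distributed.py):
-#   python llama3_distributed.py --prompt "What is the meaning of exo?"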
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run a prompt against the cluster")
-    parser.add_argument("--prompt", type=str, required=True, help="The prompt to run")
-    args = parser.parse_args()
-
-    asyncio.run(run_prompt(args.prompt))