# llama3_distributed.py (3.3 KB)
# In this example, a user is running a home cluster with 3 shards.
# The user prompts the cluster to generate a response to a question:
# the cluster is given the question, and the response is returned to the user.
import argparse
import asyncio
import time
import uuid
from typing import List

from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
from exo.inference.shard import Shard
from exo.networking.peer_handle import PeerHandle
from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
from exo.topology.device_capabilities import DeviceCapabilities
  13. models = {
  14. "mlx-community/Meta-Llama-3-8B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
  15. "mlx-community/Meta-Llama-3-70B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80)
  16. }
  17. path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
  18. model_path = get_model_path(path_or_hf_repo)
  19. tokenizer_config = {}
  20. tokenizer = load_tokenizer(model_path, tokenizer_config)
  21. # we intentionally leave out peer1 to demonstrate equality of nodes in exo.
  22. # there is no "master" node in exo, all nodes are equal and can take on any role.
  23. # peer1 = GRPCPeerHandle(
  24. # "node1",
  25. # "localhost:8080",
  26. # DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
  27. # )
  28. peer2 = GRPCPeerHandle(
  29. "node2",
  30. "localhost:8081",
  31. DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
  32. )
  33. shard = models[path_or_hf_repo]
  34. request_id = str(uuid.uuid4())
  35. async def run_prompt(prompt: str):
  36. if tokenizer.chat_template is None:
  37. tokenizer.chat_template = tokenizer.default_chat_template
  38. if (
  39. hasattr(tokenizer, "apply_chat_template")
  40. and tokenizer.chat_template is not None
  41. ):
  42. messages = [{"role": "user", "content": prompt}]
  43. prompt = tokenizer.apply_chat_template(
  44. messages, tokenize=False, add_generation_prompt=True
  45. )
  46. await peer2.connect()
  47. await peer2.global_reset(shard, set(), 2)
  48. try:
  49. await peer2.send_prompt(shard, prompt, request_id)
  50. except Exception as e:
  51. print(e)
  52. import time
  53. # poll 10 times per second for result (even though generation is faster, any more than this it's not nice for the user)
  54. previous_length = 0
  55. n_tokens = 0
  56. start_time = time.perf_counter()
  57. while True:
  58. try:
  59. result, is_finished = await peer2.get_inference_result(request_id)
  60. except Exception as e:
  61. continue
  62. await asyncio.sleep(0.1)
  63. # Print the updated string in place
  64. updated_string = tokenizer.decode(result)
  65. n_tokens = len(result)
  66. print(updated_string[previous_length:], end='', flush=True)
  67. previous_length = len(updated_string)
  68. if is_finished:
  69. print("\nDone")
  70. break
  71. end_time = time.perf_counter()
  72. print(f"\nDone. Processed {n_tokens} tokens in {end_time - start_time:.2f} seconds ({n_tokens / (end_time - start_time):.2f} tokens/second)")
  73. if __name__ == "__main__":
  74. parser = argparse.ArgumentParser(description="Run prompt")
  75. parser.add_argument("--prompt", type=str, help="The prompt to run")
  76. args = parser.parse_args()
  77. asyncio.run(run_prompt(args.prompt))