@@ -16,6 +16,8 @@ from exo.viz.topology_viz import TopologyViz
 from exo.download.hf.hf_helpers import RepoProgressEvent
 from exo.inference.inference_engine import get_inference_engine, InferenceEngine
 from exo.download.hf.hf_shard_download import HFShardDownloader
+from exo.orchestration.tracing import tracer, TraceContext
+from opentelemetry.trace import Status, StatusCode  # used by the span error paths below

 class Node:
   def __init__(
@@ -111,44 +113,80 @@ class Node:
   def get_topology_inference_engines(self) -> List[List[str]]:
     return self.topology_inference_engines_pool

-  token_count = 0
-  first_token_time = 0
   async def process_inference_result(
     self,
     shard,
     result: np.ndarray,
     request_id: Optional[str] = None,
   ):
-    if request_id not in self.buffered_token_output:
-      self.buffered_token_output[request_id] = ([], False)
-    is_finished = len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
-
-    if shard.is_last_layer() and not is_finished:
-      self.token_count += 1
-      if self.token_count == 1:
-        self.first_token_time = time.perf_counter_ns()
-      if self.token_count % 20 == 0:
-        print(f"[{request_id}] TPS: {self.token_count / ((time.perf_counter_ns() - self.first_token_time) / 1e9)}")
-
-      token = await self.inference_engine.sample(result, temp=self.default_sample_temperature)
-      await self.inference_engine.ensure_shard(shard)
-      self.buffered_token_output[request_id][0].append(token.item())
-      is_finished = token.item() == self.inference_engine.tokenizer.eos_token_id or is_finished or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
-      if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
-      forward = token.reshape(1, -1)
-      self.trigger_on_token_callbacks(request_id, token.item(), is_finished)
-      asyncio.create_task(self.broadcast_new_token(request_id, token.item(), is_finished))
-    else:
-      forward = result
-
-    if is_finished:
-      self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
-      self.outstanding_requests.pop(request_id)
-    else:
-      self.outstanding_requests[request_id] = "waiting"
-      asyncio.create_task(self.forward_tensor(shard, forward, request_id, self.get_partition_index(offset = 1)))
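+    # Look up the trace context propagated with this request, creating one
+    # lazily on nodes that see the request for the first time.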
+    context = tracer.get_context(request_id)
+    if not context:
+      context = TraceContext(request_id=request_id or str(uuid.uuid4()), sequence_number=0)
+      tracer.set_context(request_id, context)

-    return np.array(self.buffered_token_output[request_id][0])
+    try:
+      with tracer.start_span(
+        f"process_inference_result.{self.get_partition_index()}",
+        context,
+        extra_attributes={
+          "partition_index": self.get_partition_index(),
+          "node_id": self.id,
+          "start_layer": shard.start_layer,
+          "end_layer": shard.end_layer
+        }
+      ):
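+        # Tokens accumulate per request in buffered_token_output; a request is
+        # finished at max_generate_tokens or on an EOS token sampled below.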
+        if request_id not in self.buffered_token_output:
+          self.buffered_token_output[request_id] = ([], False)
+        is_finished = len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
+
+        if shard.is_last_layer() and not is_finished:
+          token = await self.inference_engine.sample(result, temp=self.default_sample_temperature)
+          forward = token.reshape(1, -1)
+
+          # Increment sequence number for next forward pass
+          next_sequence = context.sequence_number + 1
+          # Create new context but preserve request span
+          next_context = TraceContext(
+            request_id=context.request_id,
+            sequence_number=next_sequence,
+            request_span=context.request_span  # Preserve request span
+          )
+          tracer.set_context(request_id, next_context)
+
+          self.buffered_token_output[request_id][0].append(token.item())
+          is_finished = token.item() == self.inference_engine.tokenizer.eos_token_id or is_finished
+          self.trigger_on_token_callbacks(request_id, token.item(), is_finished)
+          await self.broadcast_new_token(request_id, token.item(), is_finished)
+        else:
+          forward = result
+
+        if not is_finished:
+          self.outstanding_requests[request_id] = "waiting"
+          asyncio.create_task(self.forward_tensor(shard, forward, request_id, self.get_partition_index(offset = 1)))
+
+        if is_finished:
+          # End the request span when generation is complete
+          if context.request_span:
+            context.request_span.set_attribute("total_tokens", len(self.buffered_token_output[request_id][0]))
+            context.request_span.end()
+            context.request_span = None
+          self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
+          self.outstanding_requests.pop(request_id)
+
+        return np.array(self.buffered_token_output[request_id][0])
+    except Exception as e:
+      if request_id in self.outstanding_requests:
+        self.outstanding_requests.pop(request_id)
+      # End request span on error
+      if context and context.request_span:
+        context.request_span.set_status(Status(StatusCode.ERROR, str(e)))
+        context.request_span.end()
+        context.request_span = None
+      raise

   async def process_prompt(
     self,
@@ -195,18 +233,52 @@ class Node:
   async def _process_prompt(self, base_shard: Shard, prompt: str, request_id: Optional[str] = None) -> Optional[np.ndarray]:
     if request_id is None:
       request_id = str(uuid.uuid4())
+
+    # Create or get trace context
+    context = tracer.get_context(request_id)
+    if not context:
+      # Create new context with request span
+      request_span = tracer.tracer.start_span(
+        "request",
+        attributes={
+          "request_id": request_id,
+          "prompt": prompt,
+          "node_id": self.id,
+          "request_type": "process_prompt"
+        }
+      )
+      context = TraceContext(
+        request_id=request_id,
+        sequence_number=0,
+        request_span=request_span,
+        current_span=request_span,
+        trace_parent=tracer.inject_context(request_span)
+      )
+      tracer.set_context(request_id, context)
+
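+    # The request span stays open across every forward pass for this request;
+    # it is closed in process_inference_result on completion or error.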
     shard = self.get_current_shard(base_shard)
     if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=}")

-    if not shard.is_first_layer():
-      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=}")
-      self.outstanding_requests[request_id] = "waiting"
-      await self.forward_prompt(shard, prompt, request_id, 0)
-      return None
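+    # Only the node holding the first layer runs the prompt locally; any other
+    # node forwards it to partition 0 and returns immediately.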
+    try:
+      if not shard.is_first_layer():
+        if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=}")
+        self.outstanding_requests[request_id] = "waiting"
+        await self.forward_prompt(shard, prompt, request_id, 0)
+        return None

+      self.outstanding_requests[request_id] = "processing"
+      result = await self.inference_engine.infer_prompt(request_id, shard, prompt)
+      await self.process_inference_result(shard, result, request_id)
+    except Exception as e:
+      if request_id in self.outstanding_requests:
+        self.outstanding_requests.pop(request_id)
+      if context.request_span:
+        context.request_span.set_status(Status(StatusCode.ERROR, str(e)))
+      raise

   async def enqueue_example(
     self,
@@ -350,33 +422,39 @@ class Node:
     base_shard: Shard,
     tensor: np.ndarray,
     request_id: Optional[str] = None,
-  ) -> None:
-    shard = self.get_current_shard(base_shard)
-    start_time = time.perf_counter_ns()
-    await self._process_tensor(shard, tensor, request_id)
-    end_time = time.perf_counter_ns()
-    elapsed_time_ns = end_time - start_time
-    if DEBUG >= 2: print(f"[{request_id}] process_tensor: {base_shard=} {shard=} {tensor.size=} {tensor.shape=} {elapsed_time_ns=}")
-
-  async def _process_tensor(
-    self,
-    base_shard: Shard,
-    tensor: np.ndarray,
-    request_id: Optional[str] = None,
-  ) -> None:
-    if request_id is None:
-      request_id = str(uuid.uuid4())
-    shard = self.get_current_shard(base_shard)
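+    # Reuse the trace context propagated from the sending peer, if any;
+    # otherwise create a fresh one for this request.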
+  ):
+    context = tracer.get_context(request_id)
+    if not context:
+      context = TraceContext(request_id=request_id or str(uuid.uuid4()), sequence_number=0)
+      tracer.set_context(request_id, context)

     try:
       self.outstanding_requests[request_id] = "processing"
-      result = await self.inference_engine.infer_tensor(request_id, shard, tensor)
-      await self.process_inference_result(shard, result, request_id)
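+      # Run local inference under a span named for this node's partition.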
+      with tracer.start_span(
+        f"process_tensor.{self.get_partition_index()}",
+        context,
+        extra_attributes={
+          "partition_index": self.get_partition_index(),
+          "node_id": self.id,
+          "start_layer": base_shard.start_layer,
+          "end_layer": base_shard.end_layer,
+          "tensor_shape": str(tensor.shape)
+        }
+      ):
+        result = await self.inference_engine.infer_tensor(request_id, base_shard, tensor)
+        await self.process_inference_result(base_shard, result, request_id)
     except Exception as e:
-      self.outstanding_requests.pop(request_id)
-      print(f"Error processing tensor for shard {shard}: {e}")
+      if request_id in self.outstanding_requests:
+        self.outstanding_requests.pop(request_id)
+      if context and context.request_span:
+        context.request_span.set_status(Status(StatusCode.ERROR, str(e)))
+      print(f"Error processing tensor for shard {base_shard}: {e}")
       traceback.print_exc()
-
+      raise
+
   async def forward_example(
     self,
     base_shard: Shard,
@@ -405,18 +483,40 @@ class Node:
     request_id: str,
     target_index: int,
   ) -> None:
-    if DEBUG >= 1: print(f"target partition index: {target_index}")
-    target_id = self.partitioning_strategy.partition(self.topology)[target_index].node_id
-    next_shard = self.get_current_shard(base_shard, target_index)
-    if DEBUG >= 2: print(f"Computed target from: {base_shard} {target_index}, {self.topology}. next shard: {next_shard}")
-    if target_id == self.id:
-      await self.process_prompt(next_shard, prompt, request_id)
-    else:
-      target_peer = next((p for p in self.peers if p.id() == target_id), None)
-      if not target_peer:
-        raise ValueError(f"Peer for {target_index} not found")
-      if DEBUG >= 1: print(f"Sending prompt to {target_peer.id()}: {prompt}")
-      await target_peer.send_prompt(next_shard, prompt, request_id=request_id)
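+    # Each hop runs under its own span so cross-node latency shows up in the trace.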
+    context = tracer.get_context(request_id)
+    if not context:
+      context = TraceContext(request_id=request_id, sequence_number=0)
+      tracer.set_context(request_id, context)
+
+    with tracer.start_span(
+      "forward_prompt",
+      context,
+      extra_attributes={
+        "source_node": self.id,
+        "target_index": target_index,
+        "prompt": prompt
+      }
+    ) as span:
+      if DEBUG >= 1: print(f"target partition index: {target_index}")
+      target_id = self.partitioning_strategy.partition(self.topology)[target_index].node_id
+      next_shard = self.get_current_shard(base_shard, target_index)
+      span.set_attribute("target_node", target_id)
+
+      # Get trace context for propagation
+      trace_parent = tracer.inject_context(span)
+
+      if DEBUG >= 2: print(f"Computed target from: {base_shard} {target_index}, {self.topology}. next shard: {next_shard}")
+      if target_id == self.id:
+        # Update local context with trace parent
+        context.trace_parent = trace_parent
+        await self.process_prompt(next_shard, prompt, request_id)
+      else:
+        target_peer = next((p for p in self.peers if p.id() == target_id), None)
+        if not target_peer:
+          raise ValueError(f"Peer for {target_index} not found")
+        if DEBUG >= 1: print(f"Sending prompt to {target_peer.id()}: {prompt}")
+        await target_peer.send_prompt(next_shard, prompt, request_id=request_id, sequence_number=context.sequence_number, trace_parent=trace_parent)

   async def forward_tensor(
     self,
@@ -424,19 +524,41 @@ class Node:
     tensor: np.ndarray,
     request_id: str,
     target_index: int,
-  ) -> None:
-    if DEBUG >= 1: print(f"target partition index: {target_index}")
-    target_id = self.partitioning_strategy.partition(self.topology)[target_index].node_id
-    next_shard = self.get_current_shard(base_shard, target_index)
-    if DEBUG >= 2: print(f"Computed target from: {base_shard} {target_index}, {self.topology}. target shard: {next_shard}")
-    if target_id == self.id:
-      await self.process_tensor(next_shard, tensor, request_id)
-    else:
-      target_peer = next((p for p in self.peers if p.id() == target_id), None)
-      if not target_peer:
-        raise ValueError(f"Peer for {target_index} not found")
-      if DEBUG >= 1: print(f"Sending tensor to {target_peer.id()}: {tensor}")
-      await target_peer.send_tensor(next_shard, tensor, request_id=request_id)
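+    # Same per-hop span as forward_prompt, with tensor metadata attached.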
+  ):
+    context = tracer.get_context(request_id)
+    if not context:
+      context = TraceContext(request_id=request_id, sequence_number=0)
+      tracer.set_context(request_id, context)
+
+    with tracer.start_span(
+      "forward_tensor",
+      context,
+      extra_attributes={
+        "source_node": self.id,
+        "target_index": target_index,
+        "tensor_shape": str(tensor.shape)
+      }
+    ) as span:
+      target_id = self.partitioning_strategy.partition(self.topology)[target_index].node_id
+      next_shard = self.get_current_shard(base_shard, target_index)
+      span.set_attribute("target_node", target_id)
+
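+      # Propagate the request-level span as the parent when this node owns it,
+      # falling back to this hop's span on relay nodes.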
+      trace_parent = tracer.inject_context(context.request_span or span)
+
+      if target_id == self.id:
+        # Update local context with trace parent
+        context.trace_parent = trace_parent
+        await self.process_tensor(next_shard, tensor, request_id)
+      else:
+        target_peer = next((p for p in self.peers if p.id() == target_id), None)
+        if not target_peer:
+          raise ValueError(f"Peer for {target_index} not found")
+
+        if DEBUG >= 1: print(f"Sending tensor to {target_peer.id()}: {tensor}")
+        await target_peer.send_tensor(next_shard, tensor, request_id=request_id, sequence_number=context.sequence_number, trace_parent=trace_parent)

   def get_partition_index(self, offset: int = 0):
     if not self.partitioning_strategy:
@@ -570,20 +692,34 @@ class Node:
     return self._on_opaque_status

   def trigger_on_token_callbacks(self, request_id: str, token: int, is_finished: bool) -> None:
-    if DEBUG >= 2: print(f"Triggering all on_token callbacks with {request_id=} {token=} {is_finished=}")
+    if DEBUG >= 2: print(f"[Node] Triggering token callbacks: {request_id=} {token=} {is_finished=}")
     self.on_token.trigger_all(request_id, token, is_finished)

-  async def broadcast_new_token(self, request_id: str, token: int, is_finished: bool) -> None:
-    async def send_new_token_to_peer(peer):
-      try:
-        await asyncio.wait_for(peer.send_new_token(request_id, token, is_finished), timeout=15.0)
-      except asyncio.TimeoutError:
-        print(f"Timeout broadcasting new token to {peer.id()}")
-      except Exception as e:
-        print(f"Error broadcasting new token to {peer.id()}: {e}")
-        traceback.print_exc()
-
-    await asyncio.gather(*[send_new_token_to_peer(peer) for peer in self.peers], return_exceptions=True)
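+  # Tokens are sent with the request's sequence number and trace parent so
+  # receiving peers can attach them to the correct trace.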
+  async def broadcast_new_token(self, request_id: str, token: int, is_finished: bool):
+    """Broadcast a new token to all peers."""
+    context = tracer.get_context(request_id)
+    trace_parent = ""
+    if context:
+      # Handle token in tracer for grouping
+      tracer.handle_token(context, token, is_finished)
+      # Get current trace context for propagation
+      if context.current_span:
+        trace_parent = tracer.inject_context(context.current_span)
+
+    tasks = [
+      peer.send_new_token(
+        request_id,
+        token,
+        is_finished,
+        context.sequence_number if context else 0,
+        trace_parent
+      )
+      for peer in self.peers
+    ]
+    # return_exceptions=True so one failed peer does not abort the broadcast
+    await asyncio.gather(*tasks, return_exceptions=True)

   async def broadcast_opaque_status(self, request_id: str, status: str) -> None:
     if DEBUG >= 8: print(f"Broadcasting opaque status: {request_id=} {status=}")