Browse Source

initial setup of manual networking config

Ian Paul 8 months ago
parent
commit
f092b08bc1

+ 0 - 0
exo/networking/manual/__init__.py


+ 74 - 0
exo/networking/manual/manual_discovery.py

@@ -0,0 +1,74 @@
+import asyncio
+from exo.networking.discovery import Discovery
+from typing import Dict, List, Callable
+
+from exo.topology.device_capabilities import DeviceCapabilities
+from exo.networking.manual.network_topology_config import NetworkTopology, PeerConfig
+from exo.helpers import DEBUG_DISCOVERY
+from exo.networking.peer_handle import PeerHandle
+
+
+class ManualDiscovery(Discovery):
+  def __init__(
+    self,
+    network_config_path: str,
+    node_id: str,
+    create_peer_handle: Callable[[str, str, DeviceCapabilities], PeerHandle],
+    discovery_timeout: int = 30,
+  ):
+    self.topology = NetworkTopology.from_path(network_config_path)
+    self.node_id = node_id
+    self.create_peer_handle = create_peer_handle
+    self.discovery_timeout = discovery_timeout
+
+    try:
+      self.node = self.topology.peers[node_id]
+    except KeyError as e:
+      print(f"Node ID {node_id} not found in network config file {network_config_path}. Please run with `node_id` set to one of the keys in the config file: {[k for k, _ in self.topology.peers]}")
+      raise e
+
+    self.node_port = self.node.port
+
+    self.listen_task = None
+    self.cleanup_task = None
+
+    self.known_peers: Dict[str, PeerHandle] = {}
+    self.peers_in_network: Dict[str, PeerConfig] = self.topology.peers
+    self.node_config = self.peers_in_network.pop(node_id)
+
+  async def start(self) -> None:
+    self.listen_task = asyncio.create_task(self.task_find_peers_from_config())
+
+  async def stop(self) -> None:
+    if self.listen_task:
+      self.listen_task.cancel()
+
+  async def discover_peers(self, wait_for_peers: int = 0) -> List[PeerHandle]:
+    if DEBUG_DISCOVERY >= 2: print("Starting discovery...")
+    if wait_for_peers > 0:
+      while len(self.known_peers) < wait_for_peers:
+        if DEBUG_DISCOVERY >= 2: print(f"Current peers: {len(self.known_peers)}/{wait_for_peers}. Waiting for more peers...")
+        await asyncio.sleep(0.1)
+    return list(self.known_peers.values())
+
+  async def task_find_peers_from_config(self):
+    if DEBUG_DISCOVERY >= 2: print("Starting task to find peers from config...")
+    while True:
+      for peer_id, peer_config in self.peers_in_network.items():
+        if DEBUG_DISCOVERY >= 2: print(f"Checking peer {peer_id=} at {peer_config.address}:{peer_config.port}")
+        peer = self.known_peers.get(peer_id)
+        if not peer:
+          if DEBUG_DISCOVERY >= 2: print(f"{peer_id=} not found in known peers. Adding.")
+          new_peer_handle = self.create_peer_handle(peer_id, f"{peer_config.address}:{peer_config.port}", peer_config.device_capabilities)
+          self.known_peers[peer_id] = new_peer_handle
+          peer = new_peer_handle
+
+        is_healthy = await peer.health_check()
+        if not is_healthy:
+          if DEBUG_DISCOVERY >= 2: print(f"{peer_id=} at {peer_config.address}:{peer_config.port} is not healthy. Removing.")
+          del self.known_peers[peer_id]
+        elif DEBUG_DISCOVERY >= 2: print(f"{peer_id=} at {peer_config.address}:{peer_config.port} is healthy.")
+
+        if DEBUG_DISCOVERY >= 2: print(f"Current known peers: {[peer.id() for peer in self.known_peers.values()]}")
+
+      await asyncio.sleep(self.discovery_timeout)

+ 48 - 0
exo/networking/manual/network_topology_config.py

@@ -0,0 +1,48 @@
+from typing import Dict
+from dataclasses import dataclass
+
+import json
+
+from exo.topology.device_capabilities import DeviceCapabilities
+
+
+@dataclass
+class PeerConfig:
+  """Settings for a single peer, built from one entry of the config file's "peers" mapping."""
+
+  # Taken from the entry's "address" key.
+  address: str
+  # Taken from the entry's "port" key.
+  port: int
+  # Built by passing the entry's "device_capabilities" mapping as keyword arguments to DeviceCapabilities.
+  device_capabilities: DeviceCapabilities
+
+
+@dataclass
+class NetworkTopology:
+  """Configuration of the network. A collection outlining all nodes in the network, including the node this is running from."""
+
+  peers: Dict[str, PeerConfig]
+  """
+  node_id to PeerConfig. The node_id is used to identify the peer in the discovery process. The node that this is running from should be included in this dict.
+  """
+
+  @classmethod
+  def from_path(cls, path: str) -> "NetworkTopology":
+    try:
+      with open(path, "r") as f:
+        config = json.load(f)
+    except FileNotFoundError:
+      raise FileNotFoundError(f"Config file not found at {path}")
+    except json.JSONDecodeError as e:
+      raise json.JSONDecodeError(f"Error decoding JSON data from {path}: {e}", e.doc, e.pos)
+
+    try:
+      peers = {}
+      for node_id, peer_data in config["peers"].items():
+        device_capabilities = DeviceCapabilities(**peer_data["device_capabilities"])
+        peer_config = PeerConfig(address=peer_data["address"], port=peer_data["port"], device_capabilities=device_capabilities)
+        peers[node_id] = peer_config
+
+      networking_config = cls(peers=peers)
+    except KeyError as e:
+      raise KeyError(f"Missing required key in config file: {e}")
+    except TypeError as e:
+      raise TypeError(f"Error parsing networking config from {path}: {e}")
+
+    return networking_config