Alex Cheema 1 year ago
parent
commit
d9484906a3

+ 0 - 5
exo/inference/mlx/sharded_inference_engine.py

@@ -8,7 +8,6 @@ from typing import Optional
 
 class MLXFixedShardInferenceEngine(InferenceEngine):
     def __init__(self, model_path: str, shard: Shard):
-        print("initializing fixed shard inference", shard)
         self.shard = shard
         model_shard, self.tokenizer = load_shard(model_path, shard)
         self.stateful_sharded_model = StatefulShardedModel(shard, model_shard)
@@ -18,7 +17,6 @@ class MLXFixedShardInferenceEngine(InferenceEngine):
             raise ValueError(f"Shard mismatch: {shard} != {self.shard}")
 
         output_data: np.ndarray = np.array(self.stateful_sharded_model.step(mx.array(self.tokenizer.encode(prompt))))
-        print(f"output_data size: {output_data.size}, output_data: {output_data}")
         return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id
 
     async def infer_tensor(self, shard: Shard, input_data: np.ndarray) -> (np.ndarray, str, bool):
@@ -32,7 +30,6 @@ class MLXFixedShardInferenceEngine(InferenceEngine):
         if shard != self.shard:
             raise ValueError(f"Shard mismatch: {shard} != {self.shard}")
 
-        print(f"Resetting shard: {shard}")
         self.stateful_sharded_model.reset()
 
 class MLXDynamicShardInferenceEngine(InferenceEngine):
@@ -51,8 +48,6 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
 
     async def reset_shard(self, shard: Shard):
         await self.ensure_shard(shard)
-
-        print(f"Resetting shard: {shard}")
         self.stateful_sharded_model.reset()
 
     async def ensure_shard(self, shard: Shard):
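
Note: instead of deleting these prints outright, they could have been kept behind the same DEBUG gate this commit introduces in device_capabilities.py below. A minimal sketch of that alternative; the import path for DEBUG is an assumption, not something this diff confirms:

from exo import DEBUG  # assumed location of the verbosity constant

def log_reset(shard) -> None:
    # Emit the old diagnostic only when verbosity is turned up,
    # mirroring the `if DEBUG >= 2` pattern used below.
    if DEBUG >= 2: print(f"Resetting shard: {shard}")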

+ 1 - 1
exo/topology/device_capabilities.py

@@ -108,7 +108,7 @@ def linux_device_capabilities() -> DeviceCapabilities:
         gpu_name = pynvml.nvmlDeviceGetName(handle)
         gpu_memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
 
-        print(f"NVIDIA device {gpu_name=} {gpu_memory_info=}")
+        if DEBUG >= 2: print(f"NVIDIA device {gpu_name=} {gpu_memory_info=}")
 
         return DeviceCapabilities(model=f"Linux Box ({gpu_name})", chip=gpu_name, memory=gpu_memory_info.total // 2**20, flops=CHIP_FLOPS.get(gpu_name, DeviceFlops(fp32=0, fp16=0, int8=0)))
     elif Device.DEFAULT == "AMD":
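
The DEBUG constant gating the print above is defined elsewhere in the repo and is not shown in this diff. A plausible sketch of such an environment-driven verbosity level (the variable name and default here are assumptions):

import os

# Read verbosity once at import time: 0 keeps diagnostics quiet,
# higher values enable progressively chattier output.
DEBUG = int(os.getenv("DEBUG", default="0"))

With a definition like this, running `DEBUG=2 python main.py` would re-enable the NVIDIA device print while a default run stays quiet.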

+ 0 - 1
main.py

@@ -63,7 +63,6 @@ api = ChatGPTAPI(node, inference_engine.__class__.__name__)
 
 topology_viz = TopologyViz()
 node.on_token.register("main_log").on_next(lambda _, tokens , __: print(inference_engine.tokenizer.decode(tokens) if hasattr(inference_engine, "tokenizer") else tokens))
-node.on_opaque_status.register("main_log").on_next(lambda request_id, status: print(f"!!! [{request_id}] Opaque Status: {status}"))
 
 async def shutdown(signal, loop):
     """Gracefully shutdown the server and close the asyncio loop."""