| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- from __future__ import annotations
- import ctypes, functools
- from typing import Tuple
- import tinygrad.runtime.autogen.hip as hip
- from tinygrad.helpers import DEBUG, init_c_var, from_mv, init_c_struct_t
- from tinygrad.device import Compiled, LRUAllocator, BufferOptions
- from tinygrad.runtime.ops_amd import AMDCompiler, disasm
- from tinygrad.renderer.cstyle import HIPRenderer
def check(status):
  """Raise RuntimeError when a HIP call returned a non-zero status; do nothing for 0.

  The error text is obtained from hipGetErrorString for the given status code.
  """
  if status == 0: return
  message = ctypes.string_at(hip.hipGetErrorString(status)).decode()
  raise RuntimeError(f"HIP Error {status}, {message}")
class HIPProgram:
  """A kernel loaded from a compiled binary as a HIP module, launchable by calling the instance."""

  def __init__(self, device:HIPDevice, name:str, lib:bytes):
    """Load `lib` as a module on `device` and look up the function called `name`.

    Every HIP status goes through check(), so a failing call raises RuntimeError.
    """
    self.device = device
    self.name = name
    self.lib = lib
    if DEBUG >= 6: print(disasm(lib))
    # make the owning device current before loading the module
    check(hip.hipSetDevice(self.device.device_id))

    def load_module(mod): check(hip.hipModuleLoadData(ctypes.byref(mod), lib))
    def lookup_function(fxn): check(hip.hipModuleGetFunction(ctypes.byref(fxn), self.module, name.encode("utf-8")))

    self.module = init_c_var(hip.hipModule_t(), load_module)
    self.prg = init_c_var(hip.hipFunction_t(), lookup_function)

  def __del__(self):
    # 'module' is only assigned in __init__, so it is missing if loading never got that far
    if not hasattr(self, 'module'): return
    check(hip.hipModuleUnload(self.module))

  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    """Launch the kernel with `args` (device pointers) followed by `vals` (ints).

    Returns None unless `wait` is set; with `wait` the launch is bracketed by the
    device's two timing events and the elapsed time scaled by 1e-3 is returned
    (presumably seconds, given HIP reports event time in milliseconds).
    """
    check(hip.hipSetDevice(self.device.device_id))

    # The argument struct and launch vector are built once, on the first call, and reused.
    if not hasattr(self, "vargs"):
      ptr_fields = [(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))]
      int_fields = [(f'v{i}', ctypes.c_int) for i in range(len(vals))]
      self.c_args = init_c_struct_t(tuple(ptr_fields + int_fields))(*args, *vals)
      args_ptr = ctypes.cast(ctypes.byref(self.c_args), ctypes.c_void_p)
      size_ptr = ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(self.c_args))), ctypes.c_void_p)
      # 1 / 2 / 3 look like HIP_LAUNCH_PARAM_BUFFER_POINTER / _BUFFER_SIZE / _END -- confirm against the HIP headers
      self.vargs = (ctypes.c_void_p * 5)(1, args_ptr, 2, size_ptr, 3)

    # refresh the cached struct with this call's values
    for i, buf in enumerate(args): setattr(self.c_args, f'f{i}', buf)
    for i, val in enumerate(vals): setattr(self.c_args, f'v{i}', val)

    if wait: check(hip.hipEventRecord(self.device.time_event_st, None))
    check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs))
    if not wait: return None

    check(hip.hipEventRecord(self.device.time_event_en, None))
    check(hip.hipEventSynchronize(self.device.time_event_en))
    elapsed = ctypes.c_float()
    check(hip.hipEventElapsedTime(ctypes.byref(elapsed), self.device.time_event_st, self.device.time_event_en))
    return elapsed.value * 1e-3
class HIPAllocator(LRUAllocator):
  """LRUAllocator whose buffers come from hipMalloc/hipFree on one HIPDevice."""

  def __init__(self, device:HIPDevice):
    # the device must be stored before the base initializer runs, as in the original ordering
    self.device = device
    super().__init__()

  def _alloc(self, size:int, options:BufferOptions):
    """Allocate `size` with hipMalloc on this allocator's device and return the pointer. `options` is not used here."""
    check(hip.hipSetDevice(self.device.device_id))
    return init_c_var(hip.hipDeviceptr_t(), lambda ptr: check(hip.hipMalloc(ctypes.byref(ptr), size)))

  def _free(self, opaque):
    """Release a pointer previously returned by _alloc."""
    check(hip.hipFree(opaque))

  def copyin(self, dest, src: memoryview):
    """Copy the host memoryview `src` into the device buffer `dest`."""
    check(hip.hipSetDevice(self.device.device_id))
    host_ptr = from_mv(src)
    check(hip.hipMemcpy(dest, host_ptr, len(src), hip.hipMemcpyHostToDevice))

  def copyout(self, dest:memoryview, src):
    """Copy the device buffer `src` into the host memoryview `dest`, after synchronizing the device."""
    self.device.synchronize()
    host_ptr = from_mv(dest)
    check(hip.hipMemcpy(host_ptr, src, len(dest), hip.hipMemcpyDeviceToHost))
class HIPDevice(Compiled):
  """Compiled backend for a single HIP device.

  The device index is the integer after ':' in the device string (e.g. "HIP:1"),
  or 0 when the string contains no ':'.
  """

  def __init__(self, device:str=""):
    """Query the device's architecture name, create the two timing events and set up the Compiled base.

    Raises RuntimeError (via check) if querying the device properties or creating
    either timing event fails.
    """
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
    # The creation status goes through check() like every other HIP call in this file, so a failed
    # hipEventCreate raises here instead of surfacing later when the events are used for timing.
    # NOTE(review): hipEventCreate is documented as taking only the event pointer; the trailing 0 is
    # kept unchanged from the existing call -- confirm against the autogen binding.
    self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: check(hip.hipEventCreate(ctypes.byref(x), 0)))
                                              for _ in range(2)]
    super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))

  def synchronize(self):
    """Make this device current and block until hipDeviceSynchronize returns."""
    check(hip.hipSetDevice(self.device_id))
    check(hip.hipDeviceSynchronize())
|