# ops_amd.py

from __future__ import annotations
from typing import Tuple, List, Any
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, errno, subprocess, time, array
from dataclasses import dataclass
from tinygrad.device import HCQCompatCompiled, HCQCompatAllocator, HCQCompatAllocRes, HWComputeQueue, HWCopyQueue, hcq_profile, \
                            HCQCompatProgram, Compiler, CompileError, BufferOptions
from tinygrad.helpers import getenv, to_mv, round_up, DEBUG, PROFILE, mv_address
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.runtime.support.hip_comgr import compile_hip
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
import tinygrad.runtime.autogen.libc as libc
from tinygrad.runtime.support.elf import elf_loader
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401 # pylint: disable=unused-import
if getenv("MOCKGPU"): import extra.mockgpu.mockgpu  # noqa: F401 # pylint: disable=unused-import
def is_usable_gpu(gpu_id):
  try:
    with gpu_id.open() as f:
      return int(f.read()) != 0
  except OSError:
    return False
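# kfd_ioctl hand-builds the Linux ioctl request number: direction in bits 30-31, argument struct size in
# bits 16-29, the KFD magic 'K' in bits 8-15 and the command number in bits 0-7. Keyword arguments fill
# the request struct, which is passed to the kernel and returned with its output fields populated.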
def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
  ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(made := user_struct(**kwargs))<<16) | (ord('K')<<8) | nr, made)
  if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
  return made
def ioctls_from_header():
  #hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
  #pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
  #matches = re.findall(pattern, hdr, re.MULTILINE)
  # get this from python instead
  hdrpy = (pathlib.Path(__file__).parent / "autogen" / "kfd.py").read_text()
  pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
  matches = re.findall(pattern, hdrpy, re.MULTILINE)
  idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
  fxns = {name.replace("AMDKFD_IOC_", "").lower():
          functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
          for name, idir, nr, sname in matches}
  return type("KIO", (object, ), fxns)
kio = ioctls_from_header()
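# Every AMDKFD_IOC_* ioctl becomes a lowercase callable on kio, taking the /dev/kfd fd plus the request
# struct fields as keyword arguments and returning the filled struct, e.g. kio.create_queue(...) below.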
SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 65536
regBIF_BX_PF1_GPU_HDP_FLUSH_REQ = 0x0106
regBIF_BX_PF1_GPU_HDP_FLUSH_DONE = 0x0107
# VGT_EVENT_TYPE in navi10_enum.h
CACHE_FLUSH_AND_INV_TS_EVENT = 0x14
WAIT_REG_MEM_FUNCTION_EQ = 3  # ==
WAIT_REG_MEM_FUNCTION_GEQ = 5  # >=
COMPUTE_SHADER_EN, FORCE_START_AT_000, CS_W32_EN = (1 << 0), (1 << 2), (1 << 15)
def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
def nbioreg(reg): return reg + 0x00000d20  # NBIO_BASE__INST0_SEG2
def data64_le(data): return (data & 0xFFFFFFFF, data >> 32)
def signal_value_addr(signal): return ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'value').offset
def signal_ts_addr(signal): return ctypes.addressof(signal) + getattr(hsa.amd_signal_t, 'start_ts').offset
def disasm(lib):
  asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
  return '\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x])
class AMDCompiler(Compiler):
  def __init__(self, arch:str):
    self.arch = arch
    super().__init__(f"compile_hip_{self.arch}")
  def compile(self, src:str) -> bytes:
    try: return compile_hip(src, self.arch)
    except RuntimeError as e: raise CompileError(e) from e
class AMDComputeQueue(HWComputeQueue):
  def __init__(self):
    self.ptr_to_dispatch_packet = {}
    super().__init__()
  def __del__(self):
    if self.binded_device is not None:
      self.binded_device.synchronize()
      self.binded_device._gpu_free(self.hw_page)
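  # ACQUIRE_MEM with a GCR request: invalidate (and write back where applicable) the selected GPU caches
  # over the [addr, addr+sz) range; the defaults cover the full address space and every cache level.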
  def _invalidate_cache(self, addr=0x0, sz=(1 << 64)-1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0, *data64_le(sz), *data64_le(addr), 0,
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | \
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) | \
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) | \
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
               amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)]
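  # HDP flush handshake: WAIT_REG_MEM with OPERATION(1) writes the reference value to GPU_HDP_FLUSH_REQ and
  # polls GPU_HDP_FLUSH_DONE until it matches, so host (PCIe) writes become visible to the GPU, after which
  # all GPU caches are invalidated.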
  def _memory_barrier(self):
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5), amd_gpu.WAIT_REG_MEM_MEM_SPACE(0) | amd_gpu.WAIT_REG_MEM_OPERATION(1) | \
               amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | amd_gpu.WAIT_REG_MEM_ENGINE(0), nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_REQ),
               nbioreg(regBIF_BX_PF1_GPU_HDP_FLUSH_DONE), 0xffffffff, 0xffffffff, 0x20]
    self._invalidate_cache()
  def _exec(self, prg, kernargs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1)):
    self._invalidate_cache()
    user_data = [*data64_le(kernargs)]
    if hasattr(prg, 'dispatch_packet_offset'):
      dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=kernargs + prg.dispatch_packet_offset)
      dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
      dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
      dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, kernargs
      user_data = [*data64_le(dp_addr)] + user_data
      self.ptr_to_dispatch_packet[len(self)] = dp
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), gfxreg(amd_gpu.regCOMPUTE_PGM_LO), *data64_le(prg.prog_addr >> 8),
               *data64_le(0), *data64_le(prg.device.scratch.va_addr >> 8)]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_PGM_RSRC1), prg.rsrc1, prg.rsrc2]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_TMPRING_SIZE), prg.device.tmpring_size]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_RESTART_X), 0, 0, 0, 0]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE0)] + [0xFFFFFFFF] * 2
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE2)] + [0xFFFFFFFF] * 2
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 4), gfxreg(amd_gpu.regCOMPUTE_STATIC_THREAD_MGMT_SE4)] + [0xFFFFFFFF] * 4
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, len(user_data)), gfxreg(amd_gpu.regCOMPUTE_USER_DATA_0)] + user_data
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), gfxreg(amd_gpu.regCOMPUTE_START_X), 0, 0, 0, *local_size, 0, 0]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 1), gfxreg(amd_gpu.regCOMPUTE_RESOURCE_LIMITS), 0]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3), *global_size, CS_W32_EN | FORCE_START_AT_000 | COMPUTE_SHADER_EN]
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_EVENT_WRITE, 0), amd_gpu.EVENT_TYPE(7) | amd_gpu.EVENT_INDEX(4)]
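  # Patch the launch dimensions of a previously recorded exec command in place; if the kernel reads a
  # dispatch pointer, its dispatch packet is updated as well.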
  def _update_exec(self, cmd_idx, global_size, local_size):
    self._patch(cmd_idx, offset=52, data=local_size)
    self._patch(cmd_idx, offset=61, data=global_size)
    if (dp:=self.ptr_to_dispatch_packet.get(cmd_idx)) is not None:
      dp.workgroup_size_x, dp.workgroup_size_y, dp.workgroup_size_z = local_size[0], local_size[1], local_size[2]
      dp.grid_size_x, dp.grid_size_y, dp.grid_size_z = global_size[0]*local_size[0], global_size[1]*local_size[1], global_size[2]*local_size[2]
  def _wait(self, signal:hsa.amd_signal_t, value=0):
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_WAIT_REG_MEM, 5),
               amd_gpu.WAIT_REG_MEM_MEM_SPACE(1) | amd_gpu.WAIT_REG_MEM_OPERATION(0) | amd_gpu.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | \
               amd_gpu.WAIT_REG_MEM_ENGINE(0), *data64_le(signal_value_addr(signal)), value, 0xffffffff, 4]
  def _release_mem(self, mem_event_type, mem_data_sel, mem_int_sel, address, value=0, cst=0, cache_flush=False):
    cache_flush_flags = 0
    if cache_flush:
      cache_flush_flags = amd_gpu.PACKET3_RELEASE_MEM_GCR_GLV_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GL1_INV | \
        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_INV | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_GLM_INV | \
        amd_gpu.PACKET3_RELEASE_MEM_GCR_GL2_WB | amd_gpu.PACKET3_RELEASE_MEM_GCR_SEQ
    # event_index__mec_release_mem__end_of_pipe = 5
    # event_index__mec_release_mem__shader_done = 6
    self.q += [amd_gpu.PACKET3(amd_gpu.PACKET3_RELEASE_MEM, 6),
               amd_gpu.PACKET3_RELEASE_MEM_EVENT_TYPE(mem_event_type) | amd_gpu.PACKET3_RELEASE_MEM_EVENT_INDEX(5) | cache_flush_flags,
               amd_gpu.PACKET3_RELEASE_MEM_DATA_SEL(mem_data_sel) | amd_gpu.PACKET3_RELEASE_MEM_INT_SEL(mem_int_sel) | amd_gpu.PACKET3_RELEASE_MEM_DST_SEL(0),
               *data64_le(address), *data64_le(value), cst]
  def _timestamp(self, signal): self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=3, mem_int_sel=0, address=signal_ts_addr(signal))
  def _signal(self, signal:hsa.amd_signal_t, value=0):
    # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
    self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal_value_addr(signal), value=value, cache_flush=True)
    if signal.event_mailbox_ptr != 0:
      self._release_mem(CACHE_FLUSH_AND_INV_TS_EVENT, mem_data_sel=1, mem_int_sel=2, address=signal.event_mailbox_ptr,
                        value=signal.event_id, cst=signal.event_id, cache_flush=True)
  def _update_wait(self, cmd_idx, signal=None, value=None):
    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(signal_value_addr(signal)))
    if value is not None: self._patch(cmd_idx, offset=4, data=[value])
  def _update_signal(self, cmd_idx, signal=None, value=None):
    if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(signal_value_addr(signal)))
    if value is not None: self._patch(cmd_idx, offset=5, data=data64_le(value))
    # Check if the signal command has mailptr part
    if signal is not None and self.cmds_len[cmd_idx] > 8:
      self._patch(cmd_idx, offset=11, data=[*data64_le(signal.event_mailbox_ptr), *data64_le(signal.event_id), signal.event_id])
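  # bind() snapshots the recorded command stream into a GPU-visible page; later submits to the binded
  # device only enqueue a short PACKET3_INDIRECT_BUFFER pointing at that page instead of every dword.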
  def bind(self, device: AMDDevice):
    self.binded_device = device
    self.hw_page = device._gpu_alloc(len(self.q) * 4, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i, value in enumerate(self.q): hw_view[i] = value
    self.indirect_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
                         len(self.q) | amd_gpu.INDIRECT_BUFFER_VALID]
    self.q = hw_view  # type: ignore
  def _submit(self, device):
    cmds = self.indirect_cmd if device == self.binded_device else self.q
    for i, value in enumerate(cmds): device.compute_queue.ring[(device.compute_queue.put_value + i) % len(device.compute_queue.ring)] = value
    device.compute_queue.put_value += len(cmds)
    device.compute_queue.write_ptr[0] = device.compute_queue.put_value
    device.compute_queue.doorbell[0] = device.compute_queue.put_value
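# A single SDMA linear copy packet moves at most 0x400000 bytes, so _copy below splits larger transfers
# into multiple SDMA_OP_COPY packets.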
SDMA_MAX_COPY_SIZE = 0x400000
class AMDCopyQueue(HWCopyQueue):
  def __init__(self):
    self.internal_cmd_sizes, self.copy_cmds_per_copy = [], {}
    super().__init__()
  def _q(self, arr):
    self.q += arr
    self.internal_cmd_sizes.append(len(arr))
  def _copy(self, dest, src, copy_size):
    # Cache invalidate
    self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLM_INV | amd_gpu.SDMA_GCR_GLK_INV | amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GLV_INV | \
             amd_gpu.SDMA_GCR_GL1_INV | amd_gpu.SDMA_GCR_GL2_WB | amd_gpu.SDMA_GCR_GL2_INV, 0, 0])
    copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
    self.copy_cmds_per_copy[len(self) - 1] = copy_commands
    for _ in range(copy_commands):
      step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
      self._q([amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
               amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied)])
      copied += step_copy_size
    # Cache writeback
    self._q([amd_gpu.SDMA_OP_GCR_REQ, 0, amd_gpu.SDMA_GCR_GLK_WB | amd_gpu.SDMA_GCR_GL2_WB, 0, 0])
  def _update_copy(self, cmd_idx, dest=None, src=None):
    for i in range(self.copy_cmds_per_copy[cmd_idx]):
      if src is not None: self._patch(cmd_idx, offset=8+i*7, data=[*data64_le(src + SDMA_MAX_COPY_SIZE*i)])
      if dest is not None: self._patch(cmd_idx, offset=10+i*7, data=[*data64_le(dest + SDMA_MAX_COPY_SIZE*i)])
  def _signal(self, signal: hsa.amd_signal_t, value=0):
    self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal_value_addr(signal)), value])
    if signal.event_mailbox_ptr != 0:
      self._q([amd_gpu.SDMA_OP_FENCE | amd_gpu.SDMA_PKT_FENCE_HEADER_MTYPE(3), *data64_le(signal.event_mailbox_ptr), signal.event_id])
      self._q([amd_gpu.SDMA_OP_TRAP, amd_gpu.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(signal.event_id)])
  def _wait(self, signal: hsa.amd_signal_t, value=0):
    self._q([amd_gpu.SDMA_OP_POLL_REGMEM | amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
             amd_gpu.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal_value_addr(signal)), value, 0xffffffff,
             amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | amd_gpu.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff)])
  def _update_signal(self, cmd_idx, signal=None, value=None): return self._update_wait(cmd_idx, signal, value)  # the same offsets and commands
  def _update_wait(self, cmd_idx, signal=None, value=None):
    if signal is not None: self._patch(cmd_idx, offset=1, data=data64_le(signal_value_addr(signal)))
    if value is not None: self._patch(cmd_idx, offset=3, data=[value])
  def _timestamp(self, signal:hsa.amd_signal_t):
    self._q([amd_gpu.SDMA_OP_TIMESTAMP | amd_gpu.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
             *data64_le(signal_ts_addr(signal))])
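  # SDMA commands must not wrap around the ring mid-packet: whatever fits before the end of the ring is
  # blitted there, the rest of the ring is zero-filled (zero dwords decode as SDMA NOPs), and the remaining
  # commands restart at the ring base.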
  def _submit(self, device):
    if device.sdma_queue.put_value - device.sdma_queue.read_ptr[0] > device.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")
    tail_blit_dword = 0
    for cmdsz in self.internal_cmd_sizes:
      if (tail_blit_dword + cmdsz) * 4 >= device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes: break
      tail_blit_dword += cmdsz
    start_idx = (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes) // 4
    device.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', self.q[:tail_blit_dword])
    device.sdma_queue.put_value += tail_blit_dword * 4
    if (rem_packet_cnt := len(self.q) - tail_blit_dword) > 0:
      zero_fill = device.sdma_queue.ring.nbytes - device.sdma_queue.put_value % device.sdma_queue.ring.nbytes
      ctypes.memset(mv_address(device.sdma_queue.ring) + (device.sdma_queue.put_value % device.sdma_queue.ring.nbytes), 0, zero_fill)
      device.sdma_queue.put_value += zero_fill
      device.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', self.q[tail_blit_dword:])
      device.sdma_queue.put_value += rem_packet_cnt * 4
    device.sdma_queue.write_ptr[0] = device.sdma_queue.put_value
    device.sdma_queue.doorbell[0] = device.sdma_queue.put_value
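# AMDProgram copies the compiled code object into VRAM and reads the kernel descriptor located at the lowest
# loadable PROGBITS address: its first dwords hold the group/private/kernarg segment sizes, and the
# compute_pgm_rsrc1/rsrc2 words are used to program the compute registers at dispatch time.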
class AMDProgram(HCQCompatProgram):
  def __init__(self, device:AMDDevice, name:str, lib:bytes):
    # TODO: this API needs the type signature of the function and global_size/local_size
    self.device, self.name, self.lib = device, name, lib
    if DEBUG >= 6: print(disasm(lib))
    image, sections, _ = elf_loader(self.lib)
    self.lib_gpu = self.device._gpu_alloc(round_up(image.nbytes, 0x1000), kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=True)
    ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)
    entry_point = min(sh.header.sh_addr for sh in sections if sh.header.sh_type == libc.SHT_PROGBITS and sh.header.sh_flags & libc.SHF_ALLOC)
    self.group_segment_size = image[:len(image)//4*4].cast("I")[entry_point//4]
    self.private_segment_size = image[:len(image)//4*4].cast("I")[entry_point//4 + 1]
    self.kernargs_segment_size = image[:len(image)//4*4].cast("I")[entry_point//4 + 2]
    lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
    if lds_size > (self.device.properties['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")
    if self.private_segment_size > self.device.max_private_segment_size: raise RuntimeError("Too many resources requested: private_segment_size")
    code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + entry_point)  # NOTE: this is wrong, it's not this object
    self.rsrc1 = code.compute_pgm_rsrc1
    self.rsrc2 = code.compute_pgm_rsrc2 | (lds_size << 15)
    self.kernargs_alloc_size = self.kernargs_segment_size
    if code.kernel_code_properties & 0x2 == 0x2:  # ENABLE_SGPR_DISPATCH_PTR
      # Allocate space for the dispatch packet in the kernargs to pass it to the GPU.
      self.dispatch_packet_offset = self.kernargs_alloc_size
      self.kernargs_alloc_size += ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
    assert code.kernel_code_properties & 0x400 == 0x400  # ENABLE_WAVEFRONT_SIZE32
    assert code.workitem_private_segment_byte_size == 0
    assert code.max_scratch_backing_memory_byte_size == 0
    assert code.kernel_code_prefetch_byte_size == 0
    self.prog_addr = self.lib_gpu.va_addr + entry_point + code.kernel_code_entry_byte_offset
    AMDComputeQueue().memory_barrier().submit(self.device)
    super().__init__(kernargs_alloc_size=self.kernargs_alloc_size)
  def __del__(self):
    if hasattr(self, 'lib_gpu'): self.device._gpu_free(self.lib_gpu)
  def fill_kernargs(self, kernargs_ptr:int, bufs:Tuple[Any, ...], vals:Tuple[int, ...]=()):
    if (given:=len(bufs)*8 + len(vals)*4) != (want:=self.kernargs_segment_size): raise RuntimeError(f'incorrect args size {given=} != {want=}')
    if len(bufs): to_mv(kernargs_ptr, len(bufs) * 8).cast('Q')[:] = array.array('Q', [b.va_addr for b in bufs])
    if len(vals): to_mv(kernargs_ptr + len(bufs) * 8, len(vals) * 4).cast('I')[:] = array.array('I', vals)
  def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
    if self.device.kernargs_ptr + self.kernargs_alloc_size > (self.device.kernargs.va_addr + self.device.kernargs.size):
      self.device.kernargs_ptr = self.device.kernargs.va_addr
    self.fill_kernargs(self.device.kernargs_ptr, args, vals)
    q = AMDComputeQueue().wait(self.device.timeline_signal, self.device.timeline_value - 1).memory_barrier()
    with hcq_profile(self.device, queue=q, desc=self.name, enabled=wait or PROFILE) as (sig_st, sig_en):
      q.exec(self, self.device.kernargs_ptr, global_size, local_size)
    q.signal(self.device.timeline_signal, self.device.timeline_value).submit(self.device)
    self.device.timeline_value += 1
    self.device.kernargs_ptr += self.kernargs_alloc_size
    if wait:
      self.device._wait_signal(self.device.timeline_signal, self.device.timeline_value - 1)
      if not PROFILE: self.device.signals_pool += [sig_st, sig_en]
      return (sig_en.start_ts - sig_st.start_ts) / 1e8
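# Host buffers are anonymous mmap'd memory registered with KFD as USERPTR; device buffers live in VRAM and
# are made host-visible (public) only when cpu_access is requested, which relies on resizable BAR.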
class AMDAllocator(HCQCompatAllocator):
  def __init__(self, device:AMDDevice): super().__init__(device, batch_size=SDMA_MAX_COPY_SIZE)
  def _alloc(self, size:int, options:BufferOptions) -> HCQCompatAllocRes:
    if options.host: return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, public=True)
    return self.device._gpu_alloc(size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM, public=options.cpu_access)
  def _free(self, opaque, options:BufferOptions): self.device._gpu_free(opaque)
MAP_FIXED, MAP_NORESERVE = 0x10, 0x400
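# AMDQueueDesc bundles memoryviews over a queue's GPU-visible ring, its read/write pointers in GART memory
# and the mapped MMIO doorbell; put_value tracks the host write position (dwords for compute, bytes for SDMA).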
@dataclass
class AMDQueueDesc:
  ring: memoryview
  read_ptr: memoryview
  write_ptr: memoryview
  doorbell: memoryview
  put_value: int = 0
class AMDDevice(HCQCompatCompiled):
  kfd:int = -1
  event_page:Any = None  # TODO: fix types in kfd, Optional[kfd.struct_kfd_ioctl_alloc_memory_of_gpu_args]
  signals_page:Any = None
  signals_pool:List[hsa.amd_signal_t] = []
  gpus:List[pathlib.Path] = []
  def _gpu_map(self, mem):
    mem = mem._base if hasattr(mem, '_base') else mem
    if self.gpu_id in getattr(mem, "mapped_gpu_ids", []): return
    mem.__setattr__("mapped_gpu_ids", getattr(mem, "mapped_gpu_ids", []) + [self.gpu_id])
    c_gpus = (ctypes.c_int32 * len(mem.mapped_gpu_ids))(*mem.mapped_gpu_ids)
    stm = kio.map_memory_to_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(mem.mapped_gpu_ids))
    assert stm.n_success == len(mem.mapped_gpu_ids)
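  # _gpu_alloc reserves a CPU virtual range with mmap first, then asks KFD to create the GPU allocation at
  # that same address; non-USERPTR memory is afterwards mapped into the process through the DRM render node.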
  def _gpu_alloc(self, size:int, flags:int, uncached=False, public=False, map_to_gpu=True):
    flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED
    if public: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC
    if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
      buf = addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
    else:
      buf, addr = 0, libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
    assert addr != 0xffffffffffffffff
    try: mem = kio.alloc_memory_of_gpu(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id, flags=flags, mmap_offset=buf)
    except OSError as e:
      if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and public:
        raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
      if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
      raise
    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
      buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.drm_fd, mem.mmap_offset)
      assert addr == buf == mem.va_addr
    if map_to_gpu: self._gpu_map(mem)
    return mem
  def _gpu_free(self, mem):
    if len(gpus:=getattr(mem, "mapped_gpu_ids", [])):
      c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
      stm = kio.unmap_memory_from_gpu(self.kfd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
      assert stm.n_success == len(gpus)
    libc.munmap(mem.va_addr, mem.size)
    kio.free_memory_of_gpu(self.kfd, handle=mem.handle)
  @classmethod
  def _read_signal(self, signal): return signal.value
  @classmethod
  def _read_timestamp(self, signal): return signal.start_ts
  @classmethod
  def _set_signal(self, signal, value): signal.value = value
  @classmethod
  def _alloc_signal(self, value=0, **kwargs) -> hsa.amd_signal_t:
    self._set_signal(ret := self.signals_pool.pop(), value)
    if (sync_event:=kwargs.get('sync_event')) is not None:
      ret.event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
      ret.event_id = sync_event.event_id
    else: ret.event_mailbox_ptr = ret.event_id = 0
    return ret
  @classmethod
  def _free_signal(self, signal): self.signals_pool.append(signal)
  @classmethod
  def _wait_signal(self, signal:hsa.amd_signal_t, value=0, timeout=10000):
    assert signal.event_id != 0, "can't wait on this signal"
    evt_arr = (kfd.struct_kfd_event_data)(event_id=signal.event_id)
    # Wait actively for 5s, then go to sleep.
    start_time = time.time() * 1000
    while (time_spent:=time.time() * 1000 - start_time) < timeout:
      if signal.value >= value: return
      if time_spent > 5000: kio.wait_events(AMDDevice.kfd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=1, timeout=1000)
    raise RuntimeError(f"wait_signal: not set to {value}, but {signal.value}, {timeout} ms TIMEOUT!")
  def __init__(self, device:str=""):
    if AMDDevice.kfd == -1:
      AMDDevice.kfd = os.open("/dev/kfd", os.O_RDWR)
      gpus = [g.parent for g in pathlib.Path("/sys/devices/virtual/kfd/kfd/topology/nodes").glob("*/gpu_id") if is_usable_gpu(g)]
      gpus = sorted(gpus, key=lambda x: int(x.name.split('/')[-1]))
      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
      AMDDevice.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
    if self.device_id >= len(AMDDevice.gpus): raise RuntimeError(f"No device found for {device}. Requesting more devices than the system has?")
    with open(f"{AMDDevice.gpus[self.device_id]}/gpu_id", "r") as f: self.gpu_id = int(f.read())
    with open(f"{AMDDevice.gpus[self.device_id]}/properties", "r") as f: self.properties = {line.split()[0]: int(line.split()[1]) for line in f}
    self.drm_fd = os.open(f"/dev/dri/renderD{self.properties['drm_render_minor']}", os.O_RDWR)
    target = int(self.properties['gfx_target_version'])
    self.arch = "gfx%d%x%x" % (target // 10000, (target // 100) % 100, target % 100)
    kio.acquire_vm(AMDDevice.kfd, drm_fd=self.drm_fd, gpu_id=self.gpu_id)
    if AMDDevice.event_page is None:
      AMDDevice.signals_page = self._gpu_alloc(SIGNAL_SIZE*SIGNAL_COUNT, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
      AMDDevice.event_page = self._gpu_alloc(0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
      for off in range(0, AMDDevice.signals_page.size, SIGNAL_SIZE):
        AMDDevice.signals_pool.append(hsa.amd_signal_t.from_address(AMDDevice.signals_page.va_addr + off))
      sync_event = kio.create_event(AMDDevice.kfd, event_page_offset=AMDDevice.event_page.handle, auto_reset=1)
    else:
      self._gpu_map(AMDDevice.signals_page)
      self._gpu_map(AMDDevice.event_page)
      sync_event = kio.create_event(AMDDevice.kfd, auto_reset=1)
    self.kernargs = self._gpu_alloc(0x1000000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    self.kernargs_ptr = self.kernargs.va_addr
    # Scratch setup
    max_cu_id = self.properties['simd_count'] // self.properties['simd_per_cu'] - 1
    max_wave_id = self.properties['max_waves_per_simd'] * self.properties['simd_per_cu'] - 1
    self.max_private_segment_size = 4096
    wave_scratch_len = round_up(((max_wave_id + 1) * self.max_private_segment_size), 256)  # gfx11 requires alignment of 256
    self.scratch_len = (max_cu_id + 1) * self.properties['max_slots_scratch_cu'] * wave_scratch_len
    self.scratch = self._gpu_alloc(self.scratch_len, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
    engines = self.properties['array_count'] // self.properties['simd_arrays_per_engine']
    self.tmpring_size = (wave_scratch_len // 256) << 12 | (self.scratch_len // (wave_scratch_len * engines))
    self.compute_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x100000, ctx_save_restore_size=0x2C02000, eop_buffer_size=0x1000)
    self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)
    timeline_signals = (self._alloc_signal(sync_event=sync_event), self._alloc_signal(sync_event=kio.create_event(AMDDevice.kfd, auto_reset=1)))
    super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
                     AMDComputeQueue, AMDCopyQueue, timeline_signals)
  def _gpu2cpu_time(self, gpu_time, is_copy):
    if is_copy: return self.copy_cpu_start_time + (gpu_time - self.copy_gpu_start_time) / 1e2
    return self.cpu_start_time + (gpu_time - self.gpu_start_time) / 1e2
  def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
    gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    ring = self._gpu_alloc(ring_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
    cwsr_ctx = self._gpu_alloc(ctx_save_restore_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if ctx_save_restore_size else None
    eop_buffer = self._gpu_alloc(eop_buffer_size, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) if eop_buffer_size else None
    queue = kio.create_queue(AMDDevice.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
                             queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
                             eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0,
                             ctx_save_restore_address=cwsr_ctx.va_addr if cwsr_ctx else 0, ctx_save_restore_size=cwsr_ctx.size if cwsr_ctx else 0,
                             write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8)
    if not hasattr(self, 'doorbells'):
      self.doorbells_base = queue.doorbell_offset & (~0x1fff)  # doorbell is two pages
      self.doorbells = libc.mmap(0, 0x2000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, AMDDevice.kfd, self.doorbells_base)
    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring_size).cast("I"),
                        read_ptr=to_mv(queue.read_pointer_address, 8).cast("Q"), write_ptr=to_mv(queue.write_pointer_address, 8).cast("Q"),
                        doorbell=to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q"))
  def synchronize(self):
    AMDDevice._wait_signal(self.timeline_signal, self.timeline_value - 1)
    # reset kernargs
    self.kernargs_ptr = self.kernargs.va_addr
    if self.timeline_value > (1 << 31): self._wrap_timeline_signal()
    if PROFILE: self._prof_process_events()