import ctypes, time
from extra.mockgpu.gpu import VirtGPU
from tinygrad.helpers import to_mv, init_c_struct_t, mv_address
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
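
# Mock AMD GPU used by tinygrad's mockgpu: it decodes PM4 (compute) and SDMA
# command rings in userspace and runs shader code through the remu emulator.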

SDMA_MAX_COPY_SIZE = 0x400000
BASE_ADDR = 0x00001260
PACKET3_SET_SH_REG_START = 0x2c00
SUB = PACKET3_SET_SH_REG_START - BASE_ADDR
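
# The register constants below are expressed as SET_SH_REG packet offsets: the
# register address (e.g. 0x1bac for COMPUTE_PGM_LO) plus BASE_ADDR (presumably
# the GC register base) minus the SH register aperture start (0x2c00). These
# offsets are the keys into self.gpu.regs.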
regCOMPUTE_PGM_LO = 0x1bac - SUB
regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
regCOMPUTE_START_X = 0x1ba4 - SUB

CACHE_FLUSH_AND_INV_TS_EVENT = 0x14

WAIT_REG_MEM_FUNCTION_ALWAYS = 0
WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
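
# remu is an external RDNA3 emulator (assumed here to be Qazalin/remu, which
# tinygrad's mockgpu setup installs as libremu.so); it executes compiled shader
# blobs on the CPU. If the library is absent, import still succeeds and only
# kernel dispatch will fail.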
try:
  remu = ctypes.CDLL("/usr/local/lib/libremu.so")
  remu.run_asm.restype = ctypes.c_int32
  remu.run_asm.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_void_p]
except Exception: pass
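
# The autogen SDMA packet structs wrap every field in a *_UNION bitfield union.
# Flatten them into plain ctypes structs so packets can be read by field name,
# merging *_31_0/*_63_32 halves into single 64-bit fields (e.g. "addr"). The
# result is accessed below as sdma_pkts.fence, sdma_pkts.copy_linear, etc.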
def create_sdma_packets():
  # TODO: clean up this, if we want to keep it
  structs = {}
  for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
    names = set()
    fields = []
    for pkt_fields in pkt._fields_:
      if not pkt_fields[0].endswith("_UNION"): fields.append(pkt_fields)
      else:
        assert pkt_fields[1]._fields_[0][0] == '_0'
        for union_fields in pkt_fields[1]._fields_[0][1]._fields_:
          fname = union_fields[0]
          if fname in names: fname = pkt_fields[0]+fname
          names.add(fname)

          # merge together 64-bit fields, otherwise just append them
          if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
          else: fields.append(tuple([fname, *union_fields[1:]]))
    new_name = name[16:-4].lower()
    structs[new_name] = init_c_struct_t(tuple(fields))
    assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
  return type("SDMA_PKTS", (object, ), structs)
sdma_pkts = create_sdma_packets()
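
# A queue is a ring buffer plus read/write pointers living in memory shared with
# the mock driver; rptr/wptr are viewed as single uint64 values.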
class AMDQueue():
  def __init__(self, base, size, rptr, wptr):
    self.queue, self.size = to_mv(base, size).cast("I"), size
    self.rptr = to_mv(rptr, 8).cast("Q")
    self.wptr = to_mv(wptr, 8).cast("Q")

class PM4Executor(AMDQueue):
  def __init__(self, gpu, base, size, rptr, wptr):
    self.gpu = gpu
    super().__init__(base, size, rptr, wptr)

  def _next_dword(self):
    x = self.queue[self.rptr[0] % (self.size // 4)]
    self.rptr[0] += 1
    return x
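
  # PM4 type-3 header layout: [31:30] packet type, [29:16] count (number of
  # payload dwords minus one), [15:8] opcode. PM4 rptr/wptr count dwords.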
  def execute(self):
    while self.rptr[0] < self.wptr[0]:
      cont = True
      header = self._next_dword()
      packet_type = header >> 30
      op = (header >> 8) & 0xFF
      n = (header >> 16) & 0x3FFF
      assert packet_type == 3, "Can only parse PACKET3"
      if op == amd_gpu.PACKET3_SET_SH_REG: self._exec_set_sh_reg(n)
      elif op == amd_gpu.PACKET3_ACQUIRE_MEM: self._exec_acquire_mem(n)
      elif op == amd_gpu.PACKET3_RELEASE_MEM: self._exec_release_mem(n)
      elif op == amd_gpu.PACKET3_WAIT_REG_MEM: cont = self._exec_wait_reg_mem(n)
      elif op == amd_gpu.PACKET3_DISPATCH_DIRECT: self._exec_dispatch_direct(n)
      elif op == amd_gpu.PACKET3_INDIRECT_BUFFER: self._exec_indirect_buffer(n)
      elif op == amd_gpu.PACKET3_EVENT_WRITE: self._exec_event_write(n)
      else: raise RuntimeError(f"PM4: Unknown opcode: {op}")
      if not cont: return
  def _exec_acquire_mem(self, n):
    assert n == 6
    for _ in range(7): self._next_dword() # TODO: implement
  def _exec_release_mem(self, n):
    assert n == 6
    mem_event_type = (self._next_dword() >> 0) & 0xff
    selectors = self._next_dword()
    mem_data_sel = (selectors >> 29) & 0b111
    int_sel = (selectors >> 24) & 0b11
    mem_dst_sel = (selectors >> 16) & 0b1
    addr_lo = self._next_dword()
    addr_hi = self._next_dword()
    val_lo = self._next_dword()
    val_hi = self._next_dword()
    val = val_lo + (val_hi << 32)
    ev = self._next_dword()
    ptr = to_mv(addr_lo + (addr_hi << 32), 8)
    if mem_data_sel == 1 or mem_data_sel == 2: ptr.cast('Q')[0] = val
    elif mem_data_sel == 3:
      if mem_event_type == CACHE_FLUSH_AND_INV_TS_EVENT: ptr.cast('Q')[0] = int(time.perf_counter() * 1e8)
      else: raise RuntimeError(f"Unknown {mem_data_sel=} {mem_event_type=}")
    else: raise RuntimeError(f"Unknown {mem_data_sel=}")
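
  # WAIT_REG_MEM polls either an immediate (mem_space=0) or a memory location
  # (mem_space=1). If the condition doesn't hold, the whole packet (header plus
  # 6 payload dwords) is rewound so the wait retries on the next execute() call.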
  def _exec_wait_reg_mem(self, n):
    assert n == 5
    info = self._next_dword()
    addr_lo = self._next_dword()
    addr_hi = self._next_dword()
    val = self._next_dword()
    mask = self._next_dword()
    timeout = self._next_dword()
    mem_function = (info >> 0) & 0b111
    mem_space = (info >> 4) & 0b1
    mem_op = (info >> 6) & 0b1
    mem_engine = (info >> 8) & 0b1
    if mem_space == 0: read_op = lambda: val
    elif mem_space == 1: read_op = lambda: to_mv(addr_lo + (addr_hi << 32), 4).cast('I')[0]
    if mem_function == WAIT_REG_MEM_FUNCTION_GEQ: cmp = lambda x,y: x >= y
    elif mem_function == WAIT_REG_MEM_FUNCTION_EQ: cmp = lambda x,y: x == y
    else: raise RuntimeError(f"Do not support {mem_function=}")
    mval = read_op()
    can_cont = cmp(mval, val)
    if not can_cont: self.rptr[0] = self.rptr[0] - 7 # revert packet, need to wait again
    return can_cont
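
  # SET_SH_REG: the first payload dword is a register offset, the remaining n
  # dwords are stored into consecutive registers of the mock register file.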
  def _exec_set_sh_reg(self, n):
    reg = self._next_dword()
    for i in range(n):
      self.gpu.regs[reg] = self._next_dword()
      reg += 1
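
  # DISPATCH_DIRECT: the payload is the global grid size. The shader address is
  # rebuilt from COMPUTE_PGM_LO/HI (which hold it >> 8, hence the shift back),
  # the kernarg pointer from COMPUTE_USER_DATA_0/1, and the workgroup size from
  # COMPUTE_NUM_THREAD_X/Y/Z (the three registers at COMPUTE_START_X + 3..5).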
  def _exec_dispatch_direct(self, n):
    assert n == 3
    gl = [self._next_dword() for _ in range(3)]
    flags = self._next_dword()
    prg_addr = (self.gpu.regs[regCOMPUTE_PGM_LO] + (self.gpu.regs[regCOMPUTE_PGM_LO + 1] << 32)) << 8
    args_addr = self.gpu.regs[regCOMPUTE_USER_DATA_0] + (self.gpu.regs[regCOMPUTE_USER_DATA_0 + 1] << 32)
    lc = [self.gpu.regs[i] for i in range(regCOMPUTE_START_X+3, regCOMPUTE_START_X+6)]
    prg_sz = 0
    for st,sz in self.gpu.mapped_ranges:
      if st <= prg_addr <= st+sz: prg_sz = sz - (prg_addr - st)
    assert prg_sz > 0, "Invalid prg ptr (not found in mapped ranges)"
    err = remu.run_asm(prg_addr, prg_sz, *gl, *lc, args_addr)
    if err != 0: raise RuntimeError("remu failed to run the kernel (likely an instruction remu does not support)")
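
  # INDIRECT_BUFFER points at a secondary command buffer: run it to completion
  # with a nested PM4Executor whose rptr/wptr live in local scratch memory.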
  def _exec_indirect_buffer(self, n):
    addr_lo = self._next_dword()
    addr_hi = self._next_dword()
    buf_sz = self._next_dword() & 0x7fffff
    rptr = memoryview(bytearray(8)).cast('Q')
    wptr = memoryview(bytearray(8)).cast('Q')
    rptr[0] = 0
    wptr[0] = buf_sz
    PM4Executor(self.gpu, (addr_hi << 32) | addr_lo, buf_sz * 4, mv_address(rptr), mv_address(wptr)).execute()
    assert rptr[0] == wptr[0], "not everything executed in amdgpu"
  def _exec_event_write(self, n):
    assert n == 0
    _ = self._next_dword() # do not emulate events for now
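
# Unlike PM4, SDMA read/write pointers are byte offsets and packets are
# variable-length structs; each handler advances rptr by the packet's size.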
class SDMAExecutor(AMDQueue):
  def __init__(self, gpu, base, size, rptr, wptr):
    self.gpu, self.base = gpu, base
    super().__init__(base, size, rptr, wptr)

  def execute(self):
    while self.rptr[0] < self.wptr[0]:
      cont = True
      header = self.queue[(self.rptr[0] // 4) % (self.size // 4)]
      op = (header >> 0) & 0xff
      if op == 0: self.rptr[0] += 4 # SDMA_OP_NOP: skip one dword
      elif op == amd_gpu.SDMA_OP_FENCE: self._execute_fence()
      elif op == amd_gpu.SDMA_OP_TRAP: self._execute_trap()
      elif op == amd_gpu.SDMA_OP_POLL_REGMEM: cont = self._execute_poll_regmem()
      elif op == amd_gpu.SDMA_OP_GCR: self._execute_gcr()
      elif op == amd_gpu.SDMA_OP_COPY: self._execute_copy()
      elif op == amd_gpu.SDMA_OP_TIMESTAMP: self._execute_timestamp()
      else: raise RuntimeError(f"Unknown SDMA op {op}")
      if not cont: return
  def _execute_fence(self):
    struct = sdma_pkts.fence.from_address(self.base + self.rptr[0] % self.size)
    to_mv(struct.addr, 8).cast('Q')[0] = struct.data
    self.rptr[0] += ctypes.sizeof(struct)

  def _execute_trap(self):
    struct = sdma_pkts.trap.from_address(self.base + self.rptr[0] % self.size)
    self.rptr[0] += ctypes.sizeof(struct)

  def _execute_poll_regmem(self):
    struct = sdma_pkts.poll_regmem.from_address(self.base + self.rptr[0] % self.size)
    if struct.mem_poll == 0: read_op = lambda: struct.value
    elif struct.mem_poll == 1: read_op = lambda: to_mv(struct.addr, 4).cast('I')[0]
    if struct.func == WAIT_REG_MEM_FUNCTION_GEQ: cmp = lambda x,y: x >= y
    elif struct.func == WAIT_REG_MEM_FUNCTION_EQ: cmp = lambda x,y: x == y
    elif struct.func == WAIT_REG_MEM_FUNCTION_ALWAYS: cmp = lambda x,y: True
    else: raise RuntimeError(f"Do not support {struct.func=}")
    mval = read_op() & struct.mask
    if not cmp(mval, struct.value): return False
    self.rptr[0] += ctypes.sizeof(struct)
    return True
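
  # Timestamps (here and in RELEASE_MEM above) tick at perf_counter()*1e8, i.e.
  # a 100MHz counter, matching the 100MHz reference clock AMD GPUs use for their
  # timestamp registers.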
  def _execute_timestamp(self):
    struct = sdma_pkts.timestamp.from_address(self.base + self.rptr[0] % self.size)
    mem = to_mv(struct.addr, 8).cast('Q')
    mem[0] = int(time.perf_counter() * 1e8)
    self.rptr[0] += ctypes.sizeof(struct)

  def _execute_gcr(self):
    struct = sdma_pkts.gcr.from_address(self.base + self.rptr[0] % self.size)
    self.rptr[0] += ctypes.sizeof(struct)

  def _execute_copy(self):
    struct = sdma_pkts.copy_linear.from_address(self.base + self.rptr[0] % self.size)
    ctypes.memmove(struct.dst_addr, struct.src_addr, struct.count + 1)
    self.rptr[0] += ctypes.sizeof(struct)
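
# The mock device itself: tracks mapped VA ranges (dispatch uses them to bound
# the shader blob) and owns the PM4/SDMA executors created by the mock driver.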
class AMDGPU(VirtGPU):
  def __init__(self, gpuid):
    super().__init__(gpuid)
    self.regs = {} # compute register file written by SET_SH_REG (PM4Executor assumes this exists)
    self.mapped_ranges = set()
    self.queues = []

  def map_range(self, vaddr, size): self.mapped_ranges.add((vaddr, size))
  def unmap_range(self, vaddr, size): self.mapped_ranges.remove((vaddr, size))
  def add_pm4_queue(self, base, size, rptr, wptr):
    self.queues.append(PM4Executor(self, base, size, rptr, wptr))
    return len(self.queues) - 1
  def add_sdma_queue(self, base, size, rptr, wptr):
    self.queues.append(SDMAExecutor(self, base, size, rptr, wptr))
    return len(self.queues) - 1
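
# Mimics a KFD topology node properties file for a single RDNA3 GPU:
# gfx_target_version 110000 is gfx1100, device_id 29772 (0x744c) is Navi 31.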
gpu_props = """cpu_cores_count 0
simd_count 192
mem_banks_count 1
caches_count 206
io_links_count 1
p2p_links_count 5
cpu_core_id_base 0
simd_id_base 2147488032
max_waves_per_simd 16
lds_size_in_kb 64
gds_size_in_kb 0
num_gws 64
wave_front_size 32
array_count 12
simd_arrays_per_engine 2
cu_per_simd_array 8
simd_per_cu 2
max_slots_scratch_cu 32
gfx_target_version 110000
vendor_id 4098
device_id 29772
location_id 34304
domain 0
drm_render_minor {drm_render_minor}
hive_id 0
num_sdma_engines 2
num_sdma_xgmi_engines 0
num_sdma_queues_per_engine 6
num_cp_queues 8
max_engine_clk_fcompute 2482
local_mem_size 0
fw_version 2140
capability 671588992
debug_prop 1495
sdma_fw_version 20
unique_id 11673270660693242239
num_xcc 1
max_engine_clk_ccompute 2400"""