# test_kfd_2.py — raw KFD ioctl / AQL queue bring-up experiment
import ctypes
import fcntl
import functools
import mmap
import os
import pathlib
import re
import time

from hexdump import hexdump

import tinygrad.runtime.autogen.hsa as hsa
import tinygrad.runtime.autogen.kfd as kfd
from tinygrad.helpers import to_mv, getenv
from extra.hip_gpu_driver import hip_ioctl
  7. libc = ctypes.CDLL("libc.so.6")
  8. libc.memset.argtypes = [ctypes.c_void_p, ctypes.c_char, ctypes.c_int]
  9. libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
  10. libc.mmap.restype = ctypes.c_void_p
  11. MAP_NORESERVE = 0x4000
  12. MAP_FIXED = 0x10
  13. def kfd_ioctl(idir, nr, user_struct, fd, **kwargs):
  14. made = user_struct(**kwargs)
  15. ret = fcntl.ioctl(fd, (idir<<30) | (ctypes.sizeof(user_struct)<<16) | (ord('K')<<8) | nr, made)
  16. if ret != 0: raise RuntimeError(f"ioctl returned {ret}")
  17. return made
  18. def format_struct(s):
  19. sdats = []
  20. for field_name, field_type in s._fields_:
  21. dat = getattr(s, field_name)
  22. if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
  23. else: sdats.append(f"{field_name}:{dat}")
  24. return sdats
  25. idirs = {"IOW": 1, "IOR": 2, "IOWR": 3}
  26. def ioctls_from_header():
  27. hdr = pathlib.Path("/usr/include/linux/kfd_ioctl.h").read_text().replace("\\\n", "")
  28. pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
  29. matches = re.findall(pattern, hdr, re.MULTILINE)
  30. fxns = {}
  31. for name, idir, nr, sname in matches:
  32. fxns[name.replace("AMDKFD_IOC_", "").lower()] = functools.partial(kfd_ioctl, idirs[idir], int(nr, 0x10), getattr(kfd, "struct_"+sname))
  33. return type("KIO", (object, ), fxns)
  34. kio = ioctls_from_header()
  35. # sudo su -c "echo 'file drivers/gpu/drm/amd/* +p' > /sys/kernel/debug/dynamic_debug/control"
  36. def gpu_alloc_userptr(fd, size, flags):
  37. addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
  38. assert addr != 0xffffffffffffffff
  39. mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags, mmap_offset=addr)
  40. return mem
  41. def gpu_alloc(fd, size, flags):
  42. addr = libc.mmap(0, size, 0, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS|MAP_NORESERVE, -1, 0)
  43. assert addr != 0xffffffffffffffff
  44. mem = kio.alloc_memory_of_gpu(fd, va_addr=addr, size=size, gpu_id=GPU_ID, flags=flags)
  45. buf = libc.mmap(mem.va_addr, mem.size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, drm_fd, mem.mmap_offset)
  46. assert buf != 0xffffffffffffffff
  47. assert addr == buf == mem.va_addr
  48. return mem
if __name__ == "__main__":
  # Raw bring-up of an AQL compute queue straight through KFD ioctls:
  # open the devices, allocate and map the queue buffers, create an event and
  # a queue, write one AQL barrier packet, ring the doorbell, wait for the event.
  fd = os.open("/dev/kfd", os.O_RDWR)
  gpu_num = getenv("GPU", 0)
  drm_fd = os.open(f"/dev/dri/renderD{128+gpu_num}", os.O_RDWR)
  # NOTE(review): topology node index is 1+gpu_num — presumably node 0 is the CPU; confirm on other machines.
  with open(f"/sys/devices/virtual/kfd/kfd/topology/nodes/{1+gpu_num}/gpu_id", "r") as f: GPU_ID = int(f.read())
  #ver = kio.get_version(fd)
  st = kio.acquire_vm(fd, drm_fd=drm_fd, gpu_id=GPU_ID)  # attach the DRM fd's GPU VM to this KFD process
  #exit(0)
  # 0xF0000001 = KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
  # 0xD6000002 = KFD_IOC_ALLOC_MEM_FLAGS_GTT | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
  # 0xD6000004 = KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
  # 0x94000010 = KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
  #addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
  #addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|mmap.MAP_ANONYMOUS, -1, 0)
  #mem = kio.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(fd, va_addr=addr, size=0x1000, gpu_id=GPU_ID, flags=0xD6000004)
  #mem = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
  #  kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
  #  kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
  #arr = (ctypes.c_int32 * 1)(GPU_ID)
  #stm = kio.map_memory_to_gpu(fd, handle=mem.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
  arr = (ctypes.c_int32 * 1)(GPU_ID)  # single-device id list reused by every map_memory_to_gpu below
  # GTT page that backs the queue's write/read pointers (see create_queue below).
  rw_ptr = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
                     kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
                     kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
  stm = kio.map_memory_to_gpu(fd, handle=rw_ptr.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
  assert stm.n_success == 1
  # GTT pages whose handle is passed to create_event as event_page_offset.
  event_page = gpu_alloc(fd, 0x8000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
                         kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
                         kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
  stm = kio.map_memory_to_gpu(fd, handle=event_page.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
  assert stm.n_success == 1
  # Userptr ring buffer the AQL packet is written into.
  ring_base = gpu_alloc_userptr(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
                                kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
                                kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
  stm = kio.map_memory_to_gpu(fd, handle=ring_base.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
  assert stm.n_success == 1
  # Userptr memory holding the hand-built signal the packet's completion_signal points at.
  signals = gpu_alloc_userptr(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE |
                              kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED |
                              kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
  stm = kio.map_memory_to_gpu(fd, handle=signals.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
  assert stm.n_success == 1
  # VRAM buffer passed to create_queue as eop_buffer_address.
  eop_buffer = gpu_alloc(fd, 0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
                         kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
                         kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
  stm = kio.map_memory_to_gpu(fd, handle=eop_buffer.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
  assert stm.n_success == 1
  # VRAM context save/restore area; size 0x2C02000 matches ctx_save_restore_size below.
  ctx_save_restore_address = gpu_alloc(fd, 0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
                                       kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE |
                                       kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE)
  stm = kio.map_memory_to_gpu(fd, handle=ctx_save_restore_address.handle, device_ids_array_ptr=ctypes.addressof(arr), n_devices=1)
  assert stm.n_success == 1
  # Reference traces captured from the ROCm runtime's own CREATE_QUEUE calls:
  #113.00 ms + 0.00 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x797465200000 write_pointer_address:0x79751C068038 read_pointer_address:0x79751C068080 doorbell_offset:0x0 ring_size:0x800000 gpu_id:0x433D queue_type:0x2 queue_per
  #centage:0x64 queue_priority:0x7 queue_id:0x0 eop_buffer_address:0x79751C064000 eop_buffer_size:0x1000 ctx_save_restore_address:0x796E52400000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
  #113.84 ms + 0.59 ms : 0 = AMDKFD_IOC_CREATE_QUEUE ring_base_address:0x71AC3F600000 write_pointer_address:0x71B302AB0038 read_pointer_address:0x71B302AB0080 doorbell_offset:0xD0CF400000000008 ring_size:0x800000 gpu_id:0x433D queue_typ
  #e:0x2 queue_percentage:0x64 queue_priority:0x7 queue_id:0x1 eop_buffer_address:0x71B302AAC000 eop_buffer_size:0x1000 ctx_save_restore_address:0x71AC3C800000 ctx_save_restore_size:0x2BEA000 ctl_stack_size:0xA000
  # From the kernel's kfd_priv.h — how doorbell mmap offsets are encoded:
  #define KFD_MMAP_TYPE_SHIFT 62
  #define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
  evt = kio.create_event(fd, event_page_offset=event_page.handle, auto_reset=1)
  nq = kio.create_queue(fd, ring_base_address=ring_base.va_addr, ring_size=0x1000, gpu_id=GPU_ID,
                        queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
                        queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
                        eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=0x1000,
                        ctx_save_restore_address=ctx_save_restore_address.va_addr, ctx_save_restore_size=0x2C02000,
                        ctl_stack_size = 0xa000,
                        # write_pointer_address and read_pointer_address are on GART
                        #write_pointer_address=0xaaaabbbb, read_pointer_address=0xaaaacccc)
                        write_pointer_address=rw_ptr.va_addr+0, read_pointer_address=rw_ptr.va_addr+0x8)
  # The doorbell page is mmapped from the /dev/kfd fd at the offset returned by create_queue.
  doorbell = libc.mmap(0, 8192, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, fd, nq.doorbell_offset)
  print("doorbell", hex(doorbell))
  to_mv(signals.va_addr, 0x40)
  """
  hexdump(to_mv(event_page.va_addr, 0x40))
  kio.set_event(fd, event_id=evt.event_id)
  hexdump(to_mv(event_page.va_addr, 0x40))
  kio.reset_event(fd, event_id=evt.event_id)
  hexdump(to_mv(event_page.va_addr, 0x40))
  """
  # KFD_EVENT_TYPE_SIGNAL
  # AQL barrier-AND packet header with system-scope acquire/release fences.
  BARRIER_HEADER = 1 << hsa.HSA_PACKET_HEADER_BARRIER
  BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE
  BARRIER_HEADER |= hsa.HSA_FENCE_SCOPE_SYSTEM << hsa.HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE
  BARRIER_HEADER |= hsa.HSA_PACKET_TYPE_BARRIER_AND << hsa.HSA_PACKET_HEADER_TYPE
  AQL_PACKET_SIZE = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t)
  EMPTY_SIGNAL = hsa.hsa_signal_t()
  # ds overlays the rw_ptr page: ds[0] is the queue write pointer, ds[1] the read
  # pointer (create_queue above placed them at rw_ptr.va_addr+0 and +0x8).
  ds = to_mv(rw_ptr.va_addr, 0x100).cast("Q")
  ds[0] = 1 #ring_base.va_addr + AQL_PACKET_SIZE
  ds[1] = 0 #ring_base.va_addr
  #libc.memset(rw_ptr.va_addr, 0xaa, 0x100)
  #hexdump(to_mv(rw_ptr.va_addr, 0x100))
  #packet = hsa.hsa_barrier_and_packet_t.from_address(rw_ptr.va_addr+0x38)
  # Build one barrier packet at slot 0 of the ring; header is assigned last.
  packet = hsa.hsa_barrier_and_packet_t.from_address(ring_base.va_addr)
  packet.reserved0 = 0
  packet.reserved1 = 0
  for i in range(5): packet.dep_signal[i] = EMPTY_SIGNAL
  #packet.dep_signal[0] = hsa.hsa_signal_t(evt.event_id)
  packet.reserved2 = 0
  #packet.completion_signal = EMPTY_SIGNAL
  packet.completion_signal = hsa.hsa_signal_t(signals.va_addr)
  packet.header = BARRIER_HEADER
  hexdump(to_mv(ring_base.va_addr, AQL_PACKET_SIZE))
  # _HsaEventData
  # Hand-build the signal object completion_signal points at, wiring it to the
  # KFD event's hardware slot so completion raises the event.
  to_mv(signals.va_addr, 0x40).cast("Q")[0] = 1
  to_mv(signals.va_addr, 0x40).cast("Q")[1] = 1
  #to_mv(signals.va_addr, 0x40).cast("Q")[2] = event_page
  to_mv(signals.va_addr, 0x40).cast("Q")[2] = event_page.va_addr + evt.event_slot_index*8 # HWData2=HWAddress
  to_mv(signals.va_addr, 0x40).cast("Q")[3] = evt.event_trigger_data # HWData3=HWData
  print(hex(ds[0]), hex(ds[1]), hex(ds[2]))
  hexdump(to_mv(signals.va_addr, 0x40))
  # 10 08 49 3E 46 77 00 00
  # ring doorbell
  print(hex(to_mv(doorbell, 0x10).cast("I")[0]))
  #to_mv(doorbell, 0x10).cast("I")[0] = 0xffffffff
  to_mv(doorbell, 0x10).cast("I")[0] = 0
  # Wait (up to 1000 ms) for the KFD event the signal is wired to.
  evt_arr = (kfd.struct_kfd_event_data * 1)()
  evt_arr[0].event_id = evt.event_id
  kio.wait_events(fd, events_ptr=ctypes.addressof(evt_arr), num_events=1, wait_for_all=0, timeout=1000)
  print(hex(ds[0]), hex(ds[1]), hex(ds[2]))
  hexdump(to_mv(signals.va_addr, 0x40))
  #nq = kio.create_queue(fd, ring_base_address=buf, ring_size=0x1000, gpu_id=GPU_ID,
  #  queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE_AQL, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE,
  #  queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY, write_pointer_address=buf+8, read_pointer_address=buf+0x10)
  #print(nq)
  #mv = to_mv(buf, 0x1000)
  #addr = libc.mmap(0, 0x1000, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_PRIVATE|mmap.MAP_ANONYMOUS, -1, 0)
  #print('\n'.join(format_struct(ver)))
  #print('\n'.join(format_struct(st)))