# nv_ioctl.py
  1. # type: ignore
  2. import ctypes, ctypes.util, struct, platform, pathlib, re, time, os, signal
  3. from tinygrad.helpers import from_mv, to_mv, getenv
  4. from hexdump import hexdump
  5. start = time.perf_counter()
  6. # *** ioctl lib ***
  7. libc = ctypes.CDLL(ctypes.util.find_library("c"))
  8. processor = platform.processor()
  9. IOCTL_SYSCALL = {"aarch64": 0x1d, "x86_64":16}[processor]
  10. MMAP_SYSCALL = {"aarch64": 0xde, "x86_64":0x09}[processor]
  11. def get_struct(argp, stype):
  12. return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents
  13. def dump_struct(st):
  14. print("\t", st.__class__.__name__, end=" { ")
  15. for v in type(st)._fields_: print(f"{v[0]}={getattr(st, v[0])}", end=" ")
  16. print("}")
  17. def format_struct(s):
  18. sdats = []
  19. for field in s._fields_:
  20. dat = getattr(s, field[0])
  21. if isinstance(dat, int): sdats.append(f"{field[0]}:0x{dat:X}")
  22. else: sdats.append(f"{field[0]}:{dat}")
  23. return sdats
  24. real_func_pool = {}
  25. def install_hook(c_function, python_function):
  26. orig_func = (ctypes.c_char*4096)()
  27. python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
  28. # AARCH64 trampoline to ioctl
  29. if processor == "aarch64":
  30. # 0x0000000000000000: 70 00 00 10 adr x16, #0xc
  31. # 0x0000000000000004: 10 02 40 F9 ldr x16, [x16]
  32. # 0x0000000000000008: 00 02 1F D6 br x16
  33. tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
  34. tramp += struct.pack("Q", python_function_addr)
  35. elif processor == "x86_64":
  36. # 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa movabs r11, <address>
  37. # 0x000000000000000a: 41 FF E3 jmp r11
  38. tramp = b"\x49\xBB" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE3"
  39. else:
  40. raise Exception(f"processor {processor} not supported")
  41. # get real ioctl address
  42. ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
  43. # hook ioctl
  44. ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
  45. assert ret == 0
  46. ret = libc.mprotect(ctypes.c_ulong((ctypes.addressof(orig_func)//0x1000)*0x1000), 0x3000, 7)
  47. assert ret == 0
  48. libc.memcpy(orig_func, ioctl_address.contents, 0x1000)
  49. libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
  50. return orig_func
  51. # *** ioctl lib end ***
  52. import tinygrad.runtime.autogen.nv_gpu as nv_gpu
  53. nvescs = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("NV_ESC")}
  54. nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(nv_gpu, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(nv_gpu) if \
  55. x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}
  56. def get_classes():
  57. hdrpy = (pathlib.Path(__file__).parent.parent.parent / "tinygrad/runtime/autogen/nv_gpu.py").read_text()
  58. clss = re.search(r'NV01_ROOT.*?NV_SEMAPHORE_SURFACE = \(0x000000da\) # macro', hdrpy, re.DOTALL).group()
  59. pattern = r'([0-9a-zA-Z_]*) = +\((0x[0-9a-fA-F]+)\)'
  60. matches = re.findall(pattern, clss, re.MULTILINE)
  61. return {int(num, base=16):name for name, num in matches}
  62. nvclasses = get_classes()
  63. nvuvms = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("UVM_") and nv_gpu.__dict__.get(x+"_PARAMS")}
  64. nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}
  65. global_ioctl_id = 0
  66. gpus_user_modes = []
  67. gpus_mmio = []
  68. gpus_fifo = []
  69. @ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
  70. def ioctl(fd, request, argp):
  71. global global_ioctl_id, gpus_user_modes, gpus_mmio
  72. global_ioctl_id += 1
  73. st = time.perf_counter()
  74. ret = libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
  75. et = time.perf_counter()-st
  76. fn = os.readlink(f"/proc/self/fd/{fd}")
  77. #print(f"ioctl {request:8x} {fn:20s}")
  78. idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
  79. print(f"#{global_ioctl_id}: ", end="")
  80. if itype == ord(nv_gpu.NV_IOCTL_MAGIC):
  81. if nr == nv_gpu.NV_ESC_RM_CONTROL:
  82. s = get_struct(argp, nv_gpu.NVOS54_PARAMETERS)
  83. if s.cmd in nvcmds:
  84. name, struc = nvcmds[s.cmd]
  85. print(f"NV_ESC_RM_CONTROL cmd={name:30s} hClient={s.hClient}, hObject={s.hObject}, flags={s.flags}, params={s.params}, paramsSize={s.paramsSize}, status={s.status}")
  86. if struc is not None: dump_struct(get_struct(s.params, struc))
  87. elif hasattr(nv_gpu, name+"_PARAMS"): dump_struct(get_struct(argp, getattr(nv_gpu, name+"_PARAMS")))
  88. elif name == "NVA06C_CTRL_CMD_GPFIFO_SCHEDULE": dump_struct(get_struct(argp, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS))
  89. else:
  90. print("unhandled cmd", hex(s.cmd))
  91. # format_struct(s)
  92. # print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
  93. elif nr == nv_gpu.NV_ESC_RM_ALLOC:
  94. s = get_struct(argp, nv_gpu.NVOS21_PARAMETERS)
  95. print(f"NV_ESC_RM_ALLOC hClass={nvclasses.get(s.hClass, 'unk'):30s}, hRoot={s.hRoot}, hObjectParent={s.hObjectParent}, pAllocParms={s.pAllocParms}, hObjectNew={s.hObjectNew}")
  96. if s.pAllocParms is not None:
  97. if s.hClass == nv_gpu.NV01_DEVICE_0: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV0080_ALLOC_PARAMETERS))
  98. if s.hClass == nv_gpu.FERMI_VASPACE_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS))
  99. if s.hClass == nv_gpu.NV50_MEMORY_VIRTUAL: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
  100. if s.hClass == nv_gpu.NV1_MEMORY_USER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
  101. if s.hClass == nv_gpu.NV1_MEMORY_SYSTEM: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
  102. if s.hClass == nv_gpu.AMPERE_CHANNEL_GPFIFO_A:
  103. sx = get_struct(s.pAllocParms, nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS)
  104. dump_struct(sx)
  105. gpus_fifo.append((sx.gpFifoOffset, sx.gpFifoEntries))
  106. if s.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS))
  107. if s.hClass == nv_gpu.TURING_USERMODE_A: gpus_user_modes.append(s.hObjectNew)
  108. elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY:
  109. # nv_ioctl_nvos33_parameters_with_fd
  110. s = get_struct(argp, nv_gpu.NVOS33_PARAMETERS)
  111. print(f"NV_ESC_RM_MAP_MEMORY hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, length={s.length} flags={s.flags} pLinearAddress={s.pLinearAddress}")
  112. elif nr == nv_gpu.NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO:
  113. s = get_struct(argp, nv_gpu.NVOS56_PARAMETERS)
  114. print(f"NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, pOldCpuAddress={s.pOldCpuAddress} pNewCpuAddress={s.pNewCpuAddress} status={s.status}")
  115. elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY:
  116. s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
  117. print(f"NV_ESC_RM_ALLOC_MEMORY fd={s.fd}, hRoot={s.params.hRoot}, hObjectParent={s.params.hObjectParent}, hObjectNew={s.params.hObjectNew}, hClass={s.params.hClass}, flags={s.params.flags}, pMemory={s.params.pMemory}, limit={s.params.limit}, status={s.params.status}")
  118. elif nr == nv_gpu.NV_ESC_ALLOC_OS_EVENT:
  119. s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
  120. elif nr == nv_gpu.NV_ESC_REGISTER_FD:
  121. s = get_struct(argp, nv_gpu.nv_ioctl_register_fd_t)
  122. print(f"NV_ESC_REGISTER_FD fd={s.ctl_fd}")
  123. elif nr in nvescs:
  124. print(nvescs[nr])
  125. else:
  126. print("unhandled NR", nr)
  127. elif fn.endswith("nvidia-uvm"):
  128. print(f"{nvuvms.get(request, f'UVM UNKNOWN {request=}')}")
  129. if nvuvms.get(request) is not None: dump_struct(get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS")))
  130. if nvuvms.get(request) == "UVM_MAP_EXTERNAL_ALLOCATION":
  131. st = get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS"))
  132. for i in range(st.gpuAttributesCount):
  133. print("perGpuAttributes[{i}] = ", end="")
  134. dump_struct(st.perGpuAttributes[i])
  135. print("ok")
  136. if getenv("IOCTL") >= 2: print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", fn)
  137. return ret
  138. @ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
  139. def _mmap(addr, length, prot, flags, fd, offset):
  140. mmap_type = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
  141. orig_mmap = mmap_type(ctypes.addressof(orig_mmap_mv))
  142. ret = orig_mmap(addr, length, prot, flags, fd, offset)
  143. # ll = os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else ""
  144. print(f"mmap {addr=}, {length=}, {prot=}, {flags=}, {fd=}, {offset=} {ret=}")
  145. return ret
  146. install_hook(libc.ioctl, ioctl)
  147. if getenv("IOCTL") >= 3: orig_mmap_mv = install_hook(libc.mmap, _mmap)
  148. import collections
  149. old_gpputs = collections.defaultdict(int)
  150. def _dump_gpfifo(mark):
  151. print("_dump_gpfifo:", mark)
  152. for start,size in gpus_fifo:
  153. gpfifo_controls = nv_gpu.AmpereAControlGPFifo.from_address(start+size*8)
  154. gpfifo = to_mv(start, gpfifo_controls.GPPut * 8).cast("Q")
  155. if old_gpputs[start] == gpfifo_controls.GPPut: continue
  156. print(f"gpfifo {start}: {gpfifo_controls.GPPut=}")
  157. for i in range(old_gpputs[start], gpfifo_controls.GPPut):
  158. addr = ((gpfifo[i % size] & ((1 << 40)-1)) >> 2) << 2
  159. pckt_cnt = (gpfifo[i % size]>>42)&((1 << 20)-1)
  160. print(f"\t{i}: 0x{gpfifo[i % size]:x}: addr:0x{addr:x} packets:{pckt_cnt} sync:{(gpfifo[i % size] >> 63) & 0x1} fetch:{gpfifo[i % size] & 0x1}")
  161. old_gpputs[start] = gpfifo_controls.GPPut
  162. _dump_qmd(addr, pckt_cnt)
  163. import types
  164. def _dump_qmd(address, packets):
  165. gpfifo = to_mv(address, packets * 4).cast("I")
  166. i = 0
  167. while i < packets:
  168. dat = gpfifo[i]
  169. typ = (dat>>28) & 0xF
  170. if typ == 0: break
  171. size = (dat>>16) & 0xFFF
  172. subc = (dat>>13) & 7
  173. mthd = (dat<<2) & 0x7FFF
  174. method_name = nvqcmds.get(mthd, f"unknown method #{mthd}")
  175. print(f"\t\t{method_name}, {typ=} {size=} {subc=} {mthd=}")
  176. for j in range(size): print(f"\t\t\t{j}: {gpfifo[i+j+1]} | 0x{gpfifo[i+j+1]:x}")
  177. if mthd == 792:
  178. for x in dir(nv_gpu):
  179. if x.startswith("NVC6C0_QMDV03_00_"):
  180. vv = getattr(nv_gpu, x)
  181. bits = None
  182. if isinstance(vv, tuple) and len(vv) == 2:
  183. bits = vv
  184. if isinstance(vv, types.FunctionType):
  185. bits = vv(0)
  186. if bits is not None:
  187. res = 0
  188. for bt in range(bits[1], bits[0]+1): res |= ((gpfifo[i + 3 + bt // 32] >> (bt % 32)) & 0x1) << (bt - bits[1])
  189. if res != 0: print(f"{x}, {hex(res)} | {bin(res)}")
  190. const_addr = gpfifo[i+35] + ((gpfifo[i+36] & 0xffff) << 32)
  191. const_len = ((gpfifo[i+36] >> 19))
  192. # hexdump(to_mv(const_addr, const_len))
  193. i += size + 1
  194. # IOCTL=1 PTX=1 CUDA=1 python3 test/test_ops.py TestOps.test_tiny_add