# opencl_ioctl.py (6.2 KB)
  1. import ctypes, ctypes.util, struct, fcntl, re
  2. from hexdump import hexdump
  3. from tinygrad.runtime.ops_gpu import CLDevice, CLAllocator
  4. import pathlib, sys
  5. sys.path.append(pathlib.Path(__file__).parent.parent.parent.as_posix())
  6. ops = {}
  7. import xml.etree.ElementTree as ET
  8. xml = ET.parse(pathlib.Path(__file__).parent / "adreno_pm4.xml")
  9. for child in xml.getroot():
  10. if 'name' in child.attrib and child.attrib['name'] == "adreno_pm4_type3_packets":
  11. for sc in child:
  12. if 'name' in sc.attrib and ('variants' not in sc.attrib or sc.attrib['variants'] != "A2XX"):
  13. ops[int(sc.attrib['value'], 0x10)] = sc.attrib['name']
  14. #print(ops)
  15. #exit(0)
  16. from extra.qcom_gpu_driver import msm_kgsl
  17. def ioctls_from_header():
  18. hdr = (pathlib.Path(__file__).parent.parent.parent / "extra/qcom_gpu_driver/msm_kgsl.h").read_text().replace("\\\n", "")
  19. pattern = r'#define\s+(IOCTL_KGSL_[A-Z0-9_]+)\s+_IOWR?\(KGSL_IOC_TYPE,\s+(0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
  20. matches = re.findall(pattern, hdr, re.MULTILINE)
  21. return {int(nr, 0x10):(name, getattr(msm_kgsl, "struct_"+sname)) for name, nr, sname in matches}
  22. nrs = ioctls_from_header()
  23. # https://github.com/ensc/dietlibc/blob/master/include/sys/aarch64-ioctl.h
  24. def get_struct(argp, stype):
  25. return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents
  26. def format_struct(s):
  27. sdats = []
  28. for field_name, field_type in s._fields_:
  29. if field_name in {"__pad", "PADDING_0"}: continue
  30. dat = getattr(s, field_name)
  31. if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
  32. else: sdats.append(f"{field_name}:{dat}")
  33. return sdats
  34. import mmap
  35. mmaped = {}
  36. def get_mem(addr, vlen):
  37. for k,v in mmaped.items():
  38. if k <= addr and addr < k+len(v):
  39. return v[addr-k:addr-k+vlen]
  40. def hprint(vals):
  41. ret = []
  42. for v in vals:
  43. if v > 31: ret.append(f"{v:#x}")
  44. else: ret.append(f"{v}")
  45. return f"({','.join(ret)})"
  46. ST6_SHADER = 0
  47. ST6_CONSTANTS = 1
  48. def parse_cmd_buf(dat):
  49. ptr = 0
  50. while ptr < len(dat):
  51. cmd = struct.unpack("I", dat[ptr:ptr+4])[0]
  52. if (cmd>>24) == 0x70:
  53. # packet with opcode and opcode specific payload (replace pkt3)
  54. opcode, size = ((cmd>>16)&0x7F), cmd&0x3FFF
  55. vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
  56. print(f"{ptr:3X} -- typ 7: {size=:3d}, {opcode=:#x} {ops[opcode]}", hprint(vals))
  57. if ops[opcode] == "CP_LOAD_STATE6_FRAG":
  58. dst_off = vals[0] & 0x3FFF
  59. state_type = (vals[0]>>14) & 0x3
  60. state_src = (vals[0]>>16) & 0x3
  61. state_block = (vals[0]>>18) & 0xF # 13 = SB4_CS_SHADER
  62. num_unit = vals[0]>>22
  63. print(f"{num_unit=} {state_block=} {state_src=} {state_type=} {dst_off=}")
  64. from extra.disassemblers.adreno import disasm_raw
  65. if state_type == ST6_SHADER: disasm_raw(get_mem(((vals[2] << 32) | vals[1]), 0x180))
  66. if state_type == ST6_CONSTANTS: hexdump(get_mem(((vals[2] << 32) | vals[1]), min(0x180, num_unit*4)))
  67. pass
  68. ptr += 4*size
  69. elif (cmd>>28) == 0x4:
  70. # write one or more registers (replace pkt0)
  71. offset, size = ((cmd>>8)&0x7FFFF), cmd&0x7F
  72. vals = struct.unpack("I"*size, dat[ptr+4:ptr+4+4*size])
  73. print(f"{ptr:3X} -- typ 4: {size=:3d}, {offset=:#x}", hprint(vals))
  74. ptr += 4*size
  75. else:
  76. print("unk", hex(cmd))
  77. ptr += 4
  78. @ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
  79. def ioctl(fd, request, argp):
  80. ret = libc.syscall(0x1d, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
  81. idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
  82. if nr in nrs and itype == 9:
  83. name, stype = nrs[nr]
  84. s = get_struct(argp, stype)
  85. print(f"{ret:2d} = {name:40s}", ' '.join(format_struct(s)))
  86. if name == "IOCTL_KGSL_GPUOBJ_INFO":
  87. mmaped[s.gpuaddr] = mmap.mmap(fd, s.size, offset=s.id*0x1000)
  88. if name == "IOCTL_KGSL_GPU_COMMAND":
  89. for i in range(s.numcmds):
  90. cmd = get_struct(s.cmdlist+s.cmdsize*i, msm_kgsl.struct_kgsl_command_object)
  91. print(f"cmd {i}:", format_struct(cmd))
  92. #hexdump(get_mem(cmd.gpuaddr, cmd.size))
  93. parse_cmd_buf(get_mem(cmd.gpuaddr, cmd.size))
  94. for i in range(s.numobjs):
  95. obj = get_struct(s.objlist+s.objsize*i, msm_kgsl.struct_kgsl_command_object)
  96. print(f"obj {i}:", format_struct(obj))
  97. print(format_struct(msm_kgsl.struct_kgsl_cmdbatch_profiling_buffer.from_buffer_copy(get_mem(obj.gpuaddr, obj.size))))
  98. #hexdump(get_mem(obj.gpuaddr, obj.size))
  99. else:
  100. #print(f"ioctl({fd=}, (dir:{idir}, size:0x{size:3X}, type:{itype:d}, nr:0x{nr:2X}), {argp=:X}) = {ret=}")
  101. pass
  102. return ret
  103. def install_hook(c_function, python_function):
  104. # AARCH64 trampoline to ioctl
  105. tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
  106. tramp += struct.pack("Q", ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value)
  107. # get real ioctl address
  108. ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
  109. # hook ioctl
  110. libc = ctypes.CDLL(ctypes.util.find_library("libc"))
  111. ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
  112. assert ret == 0
  113. libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
  114. libc = ctypes.CDLL(ctypes.util.find_library("libc"))
  115. install_hook(libc.ioctl, ioctl)
  116. """
  117. print("***** init device")
  118. dev = CLDevice()
  119. print("***** alloc")
  120. alloc = CLAllocator(dev)
  121. a = alloc._alloc(16)
  122. #alloc._alloc(0x2000)
  123. ba = bytearray(b"hello")
  124. print(f"***** copyin {ctypes.addressof((ctypes.c_char * len(ba)).from_buffer(ba)):#x}")
  125. alloc.copyin(a, memoryview(ba))
  126. dev.synchronize()
  127. print("***** copyout")
  128. mv2 = memoryview(bytearray(b"nopeo"))
  129. alloc.copyout(mv2, a)
  130. dev.synchronize()
  131. print("***** done", bytes(mv2))
  132. exit(0)
  133. """
  134. print("***** import tinygrad")
  135. from tinygrad import Tensor, Device, TinyJit
  136. print("***** access GPU")
  137. dev = Device["GPU"]
  138. print("***** create tensor a")
  139. a = Tensor([1.,2.]*200).realize()
  140. print("***** create tensor b")
  141. b = Tensor([3.,4.]*200).realize()
  142. @TinyJit
  143. def add(a, b): return (a+b).realize()
  144. for i in range(4):
  145. print(f"***** add tensors {i}")
  146. c = add(a, b)
  147. #dev.synchronize()
  148. c = add(b, a)
  149. dev.synchronize()
  150. #print("***** copy out")
  151. #print(c.numpy())