# test_pm4.py
  1. import time
  2. from hexdump import hexdump
  3. from tinygrad import Tensor, Device
  4. import tinygrad.runtime.autogen.amd_gpu as amd_gpu
  5. import tinygrad.runtime.autogen.kfd as kfd
  6. import tinygrad.runtime.autogen.hsa as hsa
  7. from tinygrad.engine.schedule import create_schedule
  8. from tinygrad.runtime.ops_amd import kio, AMDProgram
  9. from tinygrad.helpers import to_mv
  10. DISPATCH_INIT_VALUE = 0x21 | 0x8000
  11. #mmCOMPUTE_START_X = 0x2e04
  12. #mmCOMPUTE_PGM_LO = 0x2e0c
  13. BASE_ADDR = 0x00001260
  14. PACKET3_SET_SH_REG_START = 0x2c00
  15. SUB = PACKET3_SET_SH_REG_START - BASE_ADDR
  16. regCOMPUTE_PGM_LO = 0x1bac - SUB
  17. regCOMPUTE_START_X = 0x1ba4 - SUB
  18. regCOMPUTE_NUM_THREAD_X = 0x1ba7 - SUB
  19. regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
  20. regCOMPUTE_USER_DATA_8 = 0x1be8 - SUB
  21. regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
  22. regCOMPUTE_PGM_RSRC2 = 0x1bb3 - SUB
  23. # DEBUG=6 python3 extra/hip_gpu_driver/test_pm4.py
  24. # sudo umr -i 1 -s amd744c.gfx1100 --sbank 1 1 2 | grep regCOMPUTE
  25. # 0x00009025
  26. COMPUTE_SHADER_EN = 1
  27. USE_THREAD_DIMENSIONS = 1 << 5
  28. CS_W32_EN = 1 << 15
  29. def format_struct(s):
  30. sdats = []
  31. for field_name, field_type in s._fields_:
  32. dat = getattr(s, field_name)
  33. if isinstance(dat, int): sdats.append(f"{field_name}:0x{dat:X}")
  34. else: sdats.append(f"{field_name}:{dat}")
  35. return sdats
  36. if __name__ == "__main__":
  37. dev = Device["KFD"]
  38. a = Tensor([0.,1.,2.], device="KFD").realize()
  39. b = a + 7
  40. b.lazydata.buffer.allocate()
  41. si = create_schedule([b.lazydata])[-1]
  42. runner = dev.get_runner(*si.ast)
  43. prg: AMDProgram = runner.clprg
  44. print("device initted")
  45. # Compute Queue
  46. gart_compute = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  47. eop_buffer = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  48. compute_ring = dev._gpu_alloc(0x800000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  49. ctx_save_restore_address = dev._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  50. compute_queue = kio.create_queue(dev.kfd, ring_base_address=compute_ring.va_addr, ring_size=compute_ring.size, gpu_id=dev.gpu_id,
  51. queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
  52. #eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=eop_buffer.size,
  53. #ctx_save_restore_address=ctx_save_restore_address.va_addr, ctx_save_restore_size=ctx_save_restore_address.size,
  54. #ctl_stack_size = 0xa000,
  55. write_pointer_address=gart_compute.va_addr, read_pointer_address=gart_compute.va_addr+8)
  56. compute_doorbell = to_mv(dev.doorbells + compute_queue.doorbell_offset - dev.doorbells_base, 4).cast("I")
  57. #scratch = dev._gpu_alloc(0x10000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  58. ka = to_mv(dev.kernargs_ptr, 0x10).cast("Q")
  59. ka[0] = b.lazydata.buffer._buf.va_addr
  60. ka[1] = a.lazydata.buffer._buf.va_addr
  61. compute_read_pointer = to_mv(compute_queue.read_pointer_address, 8).cast("Q")
  62. compute_write_pointer = to_mv(compute_queue.write_pointer_address, 8).cast("Q")
  63. hexdump(to_mv(prg.handle, 0x40))
  64. code = hsa.amd_kernel_code_t.from_address(prg.handle)
  65. #print(format_struct(code))
  66. #print("code")
  67. #hexdump(to_mv(code_ptr, 0x100))
  68. #runner.local_size = [2,1,1]
  69. print(runner.local_size, runner.global_size)
  70. #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), mmCOMPUTE_PGM_LO,
  71. # prg.handle&0xFFFFFFFF, prg.handle>>32, 0, 0, (scratch.va_addr>>8)&0xFFFFFFFF, scratch.va_addr>>40]
  72. code_ptr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
  73. pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, code_ptr&0xFFFFFFFF, code_ptr>>32, 0, 0, 0, 0]
  74. pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, code.compute_pgm_rsrc1, code.compute_pgm_rsrc2]
  75. pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, dev.kernargs_ptr&0xFFFFFFFF, dev.kernargs_ptr>>32]
  76. #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, 0, 0]
  77. pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0,0,0,
  78. runner.local_size[0],runner.local_size[1],runner.local_size[2],0,0]
  79. # disabled USE_THREAD_DIMENSIONS
  80. pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),
  81. runner.global_size[0],runner.global_size[1],runner.global_size[2], CS_W32_EN | COMPUTE_SHADER_EN]
  82. #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_NOP, 0x3fff)]*0x200
  83. """
  84. addr=0x0
  85. sz=(1 << 64)-1
  86. gli=0
  87. glv=0
  88. glk=0
  89. gl1=0
  90. gl2=0
  91. pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
  92. sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
  93. amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
  94. amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
  95. amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
  96. print(pm4_cmd)
  97. """
  98. wptr = 0
  99. pm4_buffer_view = to_mv(compute_ring.va_addr, compute_ring.size).cast("I")
  100. for j in range(0x80000):
  101. for i, value in enumerate(pm4_cmd): pm4_buffer_view[wptr+i] = value
  102. wptr += len(pm4_cmd)
  103. compute_write_pointer[0] = wptr
  104. compute_doorbell[0] = wptr
  105. for k in range(10):
  106. done = compute_read_pointer[0] == compute_write_pointer[0]
  107. print(compute_read_pointer[0], compute_write_pointer[0], done)
  108. if done: break
  109. time.sleep(0.01)
  110. break
  111. #break
  112. #print(compute_read_pointer[0])
  113. #time.sleep(0.05)
  114. #print(compute_read_pointer[0])
  115. #time.sleep(100)
  116. print(a.numpy())
  117. print(b.numpy())
  118. exit(0)
  119. #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), mmCOMPUTE_PGM_LO, 0,0,0,1,1,1,0,0]
  120. #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, )]
  121. #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
  122. # sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
  123. # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
  124. # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
  125. # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]