| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- import pathlib, re, ctypes, mmap, collections, struct, functools, os, copy
- import tinygrad.runtime.autogen.kfd as kfd
- from typing import Optional, Any
- from tinygrad.helpers import from_mv
- from extra.mockgpu.driver import VirtDriver, VirtFileDesc, TextFileDesc, DirFileDesc, VirtFile
- from extra.mockgpu.amd.amdgpu import AMDGPU, gpu_props
# `import ctypes` alone does not load the `ctypes.util` submodule; import it explicitly so that
# `find_library` does not depend on some other module having imported it first.
import ctypes.util

# Handle to the C library; DRMFileDesc.mmap calls its mmap directly.
libc = ctypes.CDLL(ctypes.util.find_library("c"))
# Declare the prototype so that address and size arguments (and the returned pointer) are not
# converted through ctypes' default C int.
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
def ioctls_from_header():
  """Build the KFD ioctl lookup tables from the autogenerated `kfd.py` bindings.

  The text of `tinygrad/runtime/autogen/kfd.py` (located relative to this file) is scanned for
  `# AMDKFD_IOC_<NAME> = _IO*( ... ( 0x<nr> , struct <name> )` lines.

  Returns a 2-tuple:
    * a `KFD_IOCTLS` class whose attributes map each ioctl name to its number, and
    * a dict mapping each ioctl number to the `kfd.struct_<name>` type of its argument.
  """
  autogen_dir = pathlib.Path(__file__).parent.parent.parent.parent / "tinygrad" / "runtime" / "autogen"
  header_text = (autogen_dir / "kfd.py").read_text()
  pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'

  numbers_by_name, structs_by_number = {}, {}
  for ioctl_name, _direction, number_hex, struct_name in re.findall(pattern, header_text, re.MULTILINE):
    # The captured number keeps its leading space and `0x` prefix; int() with base 16 accepts both.
    number = int(number_hex, 0x10)
    numbers_by_name[ioctl_name] = number
    structs_by_number[number] = getattr(kfd, "struct_" + struct_name)
  return type("KFD_IOCTLS", (object, ), numbers_by_name), structs_by_number
# Parsed once at import time: `kfd_ioctls` exposes ioctl numbers by name, `kfd_headers` maps an
# ioctl number to the ctypes struct type of its argument.
kfd_ioctls, kfd_headers = ioctls_from_header()
class KFDFileDesc(VirtFileDesc):
  """Virtual file descriptor that hands its ioctls to the owning driver."""

  def __init__(self, fd, driver):
    super().__init__(fd)
    self.driver = driver

  def ioctl(self, fd, request, argp):
    """Forward `request`/`argp` to the driver's `kfd_ioctl`; `fd` is not used."""
    result = self.driver.kfd_ioctl(request, argp)
    return result

  def mmap(self, start, sz, prot, flags, fd, offset):
    """Return `offset` unchanged; every other argument is ignored and nothing is mapped."""
    return offset
class DRMFileDesc(VirtFileDesc):
  """Virtual file descriptor whose mmap is served by an anonymous libc mapping."""

  def __init__(self, fd, driver, gpu):
    super().__init__(fd)
    self.driver = driver
    self.gpu = gpu

  def mmap(self, start, sz, prot, flags, fd, offset):
    """Map `sz` bytes anonymously at `start` through libc and return the result.

    The caller's `fd` and `offset` are dropped: the mapping is forced to MAP_ANONYMOUS with
    fd -1 and offset 0.
    """
    # Inside this method `mmap` is the module, not the method being defined.
    anonymous_flags = flags | mmap.MAP_ANONYMOUS
    return libc.mmap(start, sz, prot, anonymous_flags, -1, 0)
class AMDDriver(VirtDriver):
  """Mock of the AMD KFD driver: registers virtual device/sysfs files and emulates KFD ioctls.

  For every emulated GPU it registers a topology node under
  `/sys/devices/virtual/kfd/kfd/topology/nodes/<id>` and a `/dev/dri/renderD<id>` file, and it
  keeps the state that the ioctls need (memory handles, event slots, doorbells).
  """

  _DOORBELL_REGION_SIZE = 0x2000  # bytes of doorbell memory reserved per GPU
  _DOORBELL_STRIDE = 8            # bytes handed out per doorbell

  def __init__(self, gpus=6):
    """Create the driver and prepare `gpus` emulated GPUs with ids 0..gpus-1."""
    super().__init__()

    self.tracked_files += [VirtFile('/dev/kfd', functools.partial(KFDFileDesc, driver=self))] + \
      [VirtFile('/sys/devices/virtual/kfd/kfd/topology/nodes', functools.partial(DirFileDesc, child_names=[str(i) for i in range(gpus)]))]

    self.gpus = {}                                      # gpu_id -> AMDGPU
    self.next_fd = (1 << 30)                            # first fd number handed out by _alloc_fd
    self.next_handle = 1
    self.next_event = 1

    self.object_by_handle = {}                          # memory handle -> saved alloc-memory args
    self.doorbells = {}                                 # gpu_id -> memoryview backing its doorbells
    self.next_doorbell = collections.defaultdict(int)   # gpu_id -> next free doorbell index

    for i in range(gpus): self._prepare_gpu(i)

  def _alloc_fd(self):
    """Return a fresh file descriptor number."""
    my_fd = self.next_fd
    self.next_fd += 1
    return my_fd

  def _alloc_handle(self):
    """Return a fresh memory handle."""
    handle = self.next_handle
    self.next_handle += 1
    return handle

  def _alloc_next_event_slot(self):
    """Return a fresh event slot index."""
    ev = self.next_event
    self.next_event += 1
    return ev

  def _alloc_doorbell(self, gpu_id):
    """Return the address of the next unused doorbell in `gpu_id`'s doorbell buffer.

    Raises RuntimeError when the buffer has no room left, instead of handing out an address
    that lies past the end of the buffer.
    """
    region = self.doorbells[gpu_id]
    slot = self.next_doorbell[gpu_id]
    if (slot + 1) * self._DOORBELL_STRIDE > region.nbytes:
      raise RuntimeError(f"out of doorbells for gpu {gpu_id}")
    self.next_doorbell[gpu_id] = slot + 1
    return ctypes.addressof(from_mv(region)) + slot * self._DOORBELL_STRIDE

  def _prepare_gpu(self, gpu_id):
    """Create the AMDGPU for `gpu_id`, its doorbell buffer, and its virtual files."""
    self.doorbells[gpu_id] = memoryview(bytearray(self._DOORBELL_REGION_SIZE))
    self.gpus[gpu_id] = AMDGPU(gpu_id)
    self.tracked_files += [
      VirtFile(f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}', functools.partial(DirFileDesc, child_names=['gpu_id', 'properties'])),
      VirtFile(f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}/gpu_id', functools.partial(TextFileDesc, text=f"{gpu_id}")),
      VirtFile(f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}/properties',
        functools.partial(TextFileDesc, text=gpu_props.format(drm_render_minor=gpu_id))),
      # NOTE(review): `gpu` receives the string form of the AMDGPU, not the object itself - confirm intended.
      VirtFile(f'/dev/dri/renderD{gpu_id}', functools.partial(DRMFileDesc, driver=self, gpu=f"{self.gpus[gpu_id]}")),
    ]

  def open(self, name, flags, mode, virtfile):
    """Instantiate `virtfile`'s descriptor class on a freshly allocated fd."""
    return virtfile.fdcls(self._alloc_fd())

  def _update_mappings(self, args, unmap):
    """Map (or unmap, when `unmap` is true) the memory object `args.handle` on each listed device.

    `args.n_success` is updated after every device, so it holds the number of devices handled
    even if a later lookup raises.
    """
    dev_ids = (ctypes.c_int32 * args.n_devices).from_address(args.device_ids_array_ptr)
    for i in range(args.n_devices):
      gpu = self.gpus[dev_ids[i]]
      mem_obj = self.object_by_handle[args.handle]
      if unmap: gpu.unmap_range(mem_obj.va_addr, mem_obj.size)
      else: gpu.map_range(mem_obj.va_addr, mem_obj.size)
      args.n_success = i + 1

  def kfd_ioctl(self, req, argp):
    """Emulate one KFD ioctl.

    `req` is the ioctl request (only its low 8 bits, the ioctl number, are used) and `argp` is
    the address of the argument struct, which is read and updated in place.

    Returns 0 on success, or -1 when ALLOC_MEMORY_OF_GPU names an unknown gpu_id.
    Raises KeyError for unknown ioctl numbers, handles or device ids, RuntimeError for an
    unsupported queue type or exhausted doorbells, and AssertionError for an ioctl that is
    declared in the header but not emulated here.
    """
    nr = req & 0xFF
    # Named `args` rather than `struct` so the imported `struct` module is not shadowed.
    args = kfd_headers[nr].from_address(argp)

    if nr == kfd_ioctls.AMDKFD_IOC_ACQUIRE_VM: pass
    elif nr == kfd_ioctls.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
      if args.gpu_id not in self.gpus: return -1
      args.handle = self._alloc_handle()
      self.object_by_handle[args.handle] = copy.deepcopy(args) # save memory struct to know what mem it is
    elif nr == kfd_ioctls.AMDKFD_IOC_FREE_MEMORY_OF_GPU:
      self.object_by_handle.pop(args.handle)
    elif nr == kfd_ioctls.AMDKFD_IOC_MAP_MEMORY_TO_GPU:
      self._update_mappings(args, unmap=False)
    elif nr == kfd_ioctls.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
      self._update_mappings(args, unmap=True)
    elif nr == kfd_ioctls.AMDKFD_IOC_CREATE_EVENT:
      args.event_slot_index = self._alloc_next_event_slot()
      args.event_id = args.event_slot_index
    elif nr == kfd_ioctls.AMDKFD_IOC_CREATE_QUEUE:
      gpu = self.gpus[args.gpu_id]
      if args.queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
        gpu.add_sdma_queue(args.ring_base_address, args.ring_size, args.read_pointer_address, args.write_pointer_address)
      elif args.queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE:
        gpu.add_pm4_queue(args.ring_base_address, args.ring_size, args.read_pointer_address, args.write_pointer_address)
      else: raise RuntimeError(f"Unsupported queue type: {args.queue_type}")

      # Track writes to doorbell, calling callback
      args.doorbell_offset = self._alloc_doorbell(args.gpu_id)
      self.track_address(args.doorbell_offset, args.doorbell_offset + 8, lambda mv,off: None, lambda mv, off: self._emulate_execute())
    elif nr == kfd_ioctls.AMDKFD_IOC_WAIT_EVENTS:
      pass
    else:
      name = "unknown"
      for k,v in kfd_ioctls.__dict__.items():
        if nr == v: name = k
      # Raised explicitly (not via `assert`) so the failure is not stripped under `python -O`.
      raise AssertionError(f"unknown kfd ioctl, {nr} {name}")
    return 0

  def _emulate_execute(self):
    """Run every queue with pending work until a full pass over all GPUs makes no progress."""
    any_progress = True
    while any_progress:
      any_progress = False
      for gpu in self.gpus.values():
        for q in gpu.queues:
          # A queue has pending work when its read pointer differs from its write pointer.
          if (prev_rptr:=q.rptr[0]) != q.wptr[0]:
            q.execute()
            any_progress |= (prev_rptr != q.rptr[0])
|