ops_gpu.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. from __future__ import annotations
  2. from typing import Tuple, Optional, List, cast
  3. import ctypes, functools, hashlib
  4. import tinygrad.runtime.autogen.opencl as cl
  5. from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
  6. from tinygrad.renderer.cstyle import OpenCLRenderer
  7. from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
  8. # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
  9. OSX_TIMING_RATIO = (125/3) if OSX else 1.0
  10. def check(status):
  11. if status != 0: raise RuntimeError(f"OpenCL Error {status}")
  12. def checked(ret, status): return (check(status.value), ret)[1]
  13. class CLCompiler(Compiler):
  14. def __init__(self, device:CLDevice, compile_key:str):
  15. self.device = device
  16. super().__init__(f"compile_cl_{compile_key}")
  17. def compile(self, src:str) -> bytes:
  18. program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
  19. build_status: int = cl.clBuildProgram(program, 1, self.device.device_id, None, cl.clBuildProgram.argtypes[4](), None)
  20. if build_status != 0:
  21. cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
  22. cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
  23. raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
  24. check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(ctypes.c_size_t), binary_sizes := (ctypes.c_size_t * 1)(), None))
  25. check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), (ctypes.c_void_p * 1)(ctypes.addressof(binary := ctypes.create_string_buffer(binary_sizes[0]))), None)) # noqa: E501
  26. check(cl.clReleaseProgram(program))
  27. return bytes(binary)
  28. class CLProgram:
  29. def __init__(self, device:CLDevice, name:str, lib:bytes):
  30. self.device, self.name, self.lib = device, name, lib
  31. self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
  32. to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
  33. errcode_ret := ctypes.c_int32()), errcode_ret)
  34. check(binary_status.value)
  35. check(cl.clBuildProgram(self.program, 1, device.device_id, None, cl.clBuildProgram.argtypes[4](), None)) # NOTE: OSX requires this
  36. self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)
  37. def __del__(self):
  38. if hasattr(self, 'kernel'): check(cl.clReleaseKernel(self.kernel))
  39. if hasattr(self, 'program'): check(cl.clReleaseProgram(self.program))
  40. def __call__(self, *bufs:ctypes._CData, global_size:Tuple[int,int,int]=(1,1,1), local_size:Optional[Tuple[int,int,int]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
  41. for i,b in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
  42. for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
  43. if local_size is not None: global_size = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
  44. event = cl.cl_event() if wait else None
  45. check(cl.clEnqueueNDRangeKernel(self.device.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
  46. if wait:
  47. assert event is not None
  48. check(cl.clWaitForEvents(1, event))
  49. check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_START, 8, ctypes.byref(start := ctypes.c_uint64()), None))
  50. check(cl.clGetEventProfilingInfo(event, cl.CL_PROFILING_COMMAND_END, 8, ctypes.byref(end := ctypes.c_uint64()), None))
  51. return float(end.value-start.value) * OSX_TIMING_RATIO * 1e-9
  52. return None
  53. class CLAllocator(LRUAllocator):
  54. def __init__(self, device:CLDevice):
  55. self.device = device
  56. super().__init__()
  57. def _alloc(self, size:int, options:BufferOptions) -> ctypes._CData:
  58. if options.image is not None:
  59. return checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
  60. cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
  61. options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status)
  62. return checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status)
  63. def _free(self, opaque:ctypes._CData, options:BufferOptions): check(cl.clReleaseMemObject(opaque))
  64. def copyin(self, dest:ctypes._CData, src:memoryview):
  65. check(cl.clEnqueueWriteBuffer(self.device.queue, dest, False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
  66. self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
  67. def copyout(self, dest:memoryview, src:ctypes._CData):
  68. check(cl.clEnqueueReadBuffer(self.device.queue, src, False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
  69. self.device.synchronize()
  70. class CLDevice(Compiled):
  71. device_ids = None # this is global and only initted once
  72. def __init__(self, device:str=""):
  73. if CLDevice.device_ids is None:
  74. check(cl.clGetPlatformIDs(0, None, num_platforms := ctypes.c_uint32()))
  75. check(cl.clGetPlatformIDs(num_platforms.value, platform_ids := (cl.cl_platform_id * num_platforms.value)(), None))
  76. for device_type in [cl.CL_DEVICE_TYPE_GPU, cl.CL_DEVICE_TYPE_DEFAULT]:
  77. err = cl.clGetDeviceIDs(platform_ids[0], device_type, 0, None, num_devices := ctypes.c_uint32())
  78. if err == 0 and num_devices.value != 0: break
  79. if DEBUG >= 1: print(f"CLDevice: got {num_platforms.value} platforms and {num_devices.value} devices")
  80. CLDevice.device_ids = init_c_var((cl.cl_device_id * num_devices.value)(), lambda x: check(cl.clGetDeviceIDs(platform_ids[0], device_type, num_devices, x, None))) # noqa: E501
  81. self.device_id = CLDevice.device_ids[0 if ":" not in device else int(device.split(":")[1])]
  82. self.device_name = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_NAME, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
  83. self.driver_version = (cl.clGetDeviceInfo(self.device_id, cl.CL_DRIVER_VERSION, 256, buf := ctypes.create_string_buffer(256), None), buf.value.decode())[1] # noqa: E501
  84. self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
  85. self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
  86. self.pending_copyin: List[memoryview] = []
  87. compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
  88. super().__init__(device, CLAllocator(self), OpenCLRenderer(), CLCompiler(self, f"compile_cl_{compile_key}"), functools.partial(CLProgram, self))
  89. def synchronize(self):
  90. check(cl.clFinish(self.queue))
  91. self.pending_copyin.clear()
  92. GPUDevice = CLDevice # for legacy reasons