#!/usr/bin/env python3
# Benchmarks for raw disk -> host -> GPU read bandwidth using tinygrad's allocator.
import os, ctypes, ctypes.util, io, mmap, pathlib
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import Timing, from_mv
libc = ctypes.CDLL(ctypes.util.find_library("c"))
#from extra.hip_gpu_driver import hip_ioctl
# Useful host tweaks while benchmarking:
# sudo su -c "echo 3 > /proc/sys/vm/drop_caches"
# sudo su -c 'echo 8 > /proc/sys/kernel/printk'
# sudo su -c "echo 'module amdgpu +p' > /sys/kernel/debug/dynamic_debug/control"

# Declare C prototypes so ctypes marshals 64-bit sizes and pointers correctly;
# without argtypes/restype, large values would be truncated to 32-bit C ints.
libc.memcpy.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t]
libc.read.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_size_t]
libc.read.restype = ctypes.c_size_t
libc.malloc.argtypes = [ctypes.c_size_t]
libc.malloc.restype = ctypes.c_void_p
  15. def read_direct(fd, sz):
  16. with Timing("mmap: ", lambda x: f", {sz/x:.2f} GB/s"):
  17. buf = mmap.mmap(-1, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE)
  18. with Timing("read: ", lambda x: f", {sz/x:.2f} GB/s"):
  19. ret = libc.read(fd, from_mv(buf), sz)
  20. assert ret == sz
  21. def read_mmap(fd, sz):
  22. with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"):
  23. buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
  24. t = 0
  25. for i in range(0, sz, 0x1000): t += buf[i]
  26. # def _copyin_async(self, dest:T, src:T, size:int): check(hip.hipMemcpyAsync(dest, src, size, hip.hipMemcpyHostToDevice, None))
  27. def read_to_gpu_mmap(fd, sz, gpubuf):
  28. with Timing("gpu copyin: ", lambda x: f", {sz/x:.2f} GB/s"):
  29. with Timing("mmfd: ", lambda x: f", {sz/x:.2f} GB/s"):
  30. buf = mmap.mmap(fd, sz, flags=mmap.MAP_SHARED|mmap.MAP_POPULATE) #|MAP_LOCKED)
  31. dev.allocator._copyin_async(gpubuf, from_mv(buf), sz)
  32. dev.synchronize()
  33. def read_to_gpu_single(fd, sz, gpubuf):
  34. os.lseek(fd, 0, os.SEEK_SET)
  35. with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
  36. with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
  37. hst = dev.allocator._hostalloc(sz)
  38. with Timing("read to host: ", lambda x: f", {sz/x:.2f} GB/s"):
  39. ret = libc.read(fd, hst, sz)
  40. with Timing("gpu host copy: ", lambda x: f", {sz/x:.2f} GB/s"):
  41. dev.allocator._copyin_async(gpubuf, hst, sz)
  42. dev.synchronize()
  43. def read_to_gpu_pingpong(fd, sz, gpubuf):
  44. psz = 256*1024*1024
  45. print(f"piece size {psz/(1024*1024):.2f} MB")
  46. with Timing("gpu host alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
  47. hst1 = dev.allocator._hostalloc(psz)
  48. hst2 = dev.allocator._hostalloc(psz)
  49. os.lseek(fd, 0, os.SEEK_SET)
  50. with Timing("total: ", lambda x: f", {sz/x:.2f} GB/s"):
  51. for i in range(sz//(psz*2)):
  52. with Timing("tfer(0): ", lambda x: f", {psz/x:.2f} GB/s"):
  53. ret = libc.read(fd, hst1, psz)
  54. dev.synchronize()
  55. dev.allocator._copyin_async(gpubuf, hst1, psz)
  56. with Timing("tfer(1): ", lambda x: f", {psz/x:.2f} GB/s"):
  57. ret = libc.read(fd, hst2, psz)
  58. dev.synchronize()
  59. dev.allocator._copyin_async(gpubuf, hst2, psz)
  60. dev.synchronize()
  61. MAP_LOCKED = 0x2000
  62. MAP_HUGETLB = 0x40000
  63. if __name__ == "__main__":
  64. dev = Device[Device.DEFAULT]
  65. warm = (Tensor.ones(1024, device=Device.DEFAULT).contiguous() + Tensor.ones(1024, device=Device.DEFAULT).contiguous()).realize()
  66. #fn = "/home/tiny/tinygrad/weights/rng"
  67. fn = pathlib.Path(__file__).parents[1] / "weights/LLaMA-2/70B/consolidated.00.pth"
  68. sz = os.stat(fn).st_size
  69. t = Tensor.empty(sz, dtype=dtypes.uint8, device=f"disk:{fn}")
  70. with Timing("copy: ", lambda x: f", {sz/x:.2f} GB/s"):
  71. on_dev = t.to(Device.DEFAULT).realize()
  72. exit(0)
  73. # 4GB of random numbers
  74. #fd = os.open("/home/tiny/tinygrad/weights/rng", os.O_RDWR|os.O_DIRECT)
  75. #sz = os.fstat(fd).st_size // 4
  76. fd = os.open("/home/tiny/tinygrad/weights/LLaMA/7B/consolidated.00.pth", os.O_RDWR|os.O_DIRECT)
  77. sz = os.fstat(fd).st_size
  78. print(f"read {sz} from {fd}")
  79. with Timing("gpu alloc: ", lambda x: f", {sz/x:.2f} GB/s"):
  80. gpubuf = dev.allocator._alloc(sz)
  81. # warmup
  82. dev.allocator._copyin_async(gpubuf, from_mv(bytearray(b"\x00\x00\x00\x00"*0x1000)), 0x4000)
  83. print("copying, is warm")
  84. print("****** read to gpu pingpong")
  85. read_to_gpu_pingpong(fd, sz, gpubuf)
  86. exit(0)
  87. print("****** read direct")
  88. read_direct(fd, sz)
  89. print("****** read mmap")
  90. read_mmap(fd, sz)
  91. print("****** read to gpu single")
  92. read_to_gpu_single(fd, sz, gpubuf)
  93. print("****** read to gpu mmap")
  94. read_to_gpu_mmap(fd, sz, gpubuf)
  95. os._exit(0)