# external_multi_gpu.py (2.5 KB)
  1. #!/usr/bin/env python3
  2. # cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
  3. # LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
  4. import numpy as np
  5. from tinygrad.tensor import Tensor
  6. from tinygrad.helpers import colored, Timing, getenv
  7. from tinygrad.device import Device
  8. d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
  9. def sync():
  10. Device[d0].synchronize()
  11. Device[d1].synchronize()
  12. if __name__ == "__main__":
  13. print("GPU devices", d0, d1)
  14. sz = getenv("N", 1024*1024*256) # 1 GB
  15. with Timing("GPU initial sync: "): sync()
  16. with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
  17. c0 = (Tensor.ones(sz, device="clang")/2).realize()
  18. c1 = (Tensor.ones(sz, device="clang")/4).realize()
  19. print(c0.lazydata.base.realized)
  20. print(c1.lazydata.base.realized)
  21. with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  22. a0 = c0.to(d0).realize()
  23. sync()
  24. with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  25. b1 = c1.to(d1).realize()
  26. sync()
  27. # cross copy. this is (sometimes) going through the CPU
  28. with Timing("0 -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  29. a1 = a0.to(d1).realize()
  30. sync()
  31. with Timing("1 -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  32. b0 = b1.to(d0).realize()
  33. sync()
  34. # sum
  35. with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  36. ab0 = (a0 + b0).realize()
  37. sync()
  38. with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  39. ab1 = (a1 + b1).realize()
  40. sync()
  41. # cross device sum (does this work?)
  42. with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  43. abx0 = (a0 + b1.to(d0)).realize()
  44. sync()
  45. with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  46. abx1 = (b1 + a0.to(d1)).realize()
  47. sync()
  48. # copy back
  49. # NOTE: half of this slowness is caused by allocating memory on the CPU
  50. with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  51. cc0 = ab0.numpy()
  52. with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
  53. cc1 = ab1.numpy()
  54. # same
  55. print("testing")
  56. np.testing.assert_allclose(cc0, cc1)
  57. # same (cross)
  58. print("testing (cross)")
  59. np.testing.assert_allclose(cc0, abx0.numpy())
  60. np.testing.assert_allclose(cc0, abx1.numpy())
  61. # devices
  62. print(ab0)
  63. print(ab1)
  64. print(abx0)
  65. print(abx1)