| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- import unittest
- from tinygrad import Tensor
- from tinygrad import Device
- from tinygrad.helpers import Timing, CI, OSX
- import multiprocessing.shared_memory as shared_memory
- N = 4096 if CI else 16384
- class TestCopySpeed(unittest.TestCase):
- @classmethod
- def setUpClass(cls): Device[Device.DEFAULT].synchronize()
- def testCopySHMtoDefault(self):
- s = shared_memory.SharedMemory(name="test_X", create=True, size=N*N*4)
- s.close()
- if CI and not OSX:
- t = Tensor.empty(N, N, device="disk:/dev/shm/test_X").realize()
- else:
- t = Tensor.empty(N, N, device="disk:shm:test_X").realize()
- for _ in range(3):
- with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
- with Timing("queue: "):
- t.to(Device.DEFAULT).realize()
- Device[Device.DEFAULT].synchronize()
- s.unlink()
- def testCopyCPUtoDefault(self):
- t = Tensor.rand(N, N, device="clang").realize()
- print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
- for _ in range(3):
- with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
- with Timing("queue: "):
- t.to(Device.DEFAULT).realize()
- Device[Device.DEFAULT].synchronize()
- def testCopyCPUtoDefaultFresh(self):
- print("fresh copy")
- for _ in range(3):
- t = Tensor.rand(N, N, device="clang").realize()
- with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): # noqa: F821
- with Timing("queue: "):
- t.to(Device.DEFAULT).realize()
- Device[Device.DEFAULT].synchronize()
- del t
- def testCopyDefaulttoCPU(self):
- t = Tensor.rand(N, N).realize()
- print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
- for _ in range(3):
- with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
- t.to('clang').realize()
- @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
- @unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
- def testCopyCPUto6GPUs(self):
- from tinygrad.runtime.ops_gpu import CLDevice
- if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
- t = Tensor.rand(N, N, device="clang").realize()
- print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
- for _ in range(3):
- with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
- with Timing("queue: "):
- for g in range(6):
- t.to(f"gpu:{g}").realize()
- Device["gpu"].synchronize()
- if __name__ == '__main__':
- unittest.main()
|