# test/external/speed_compare_cuda_nv.py
# Benchmarks the same kernels on the CUDA and NV backends and compares timings/outputs.
  1. from tinygrad import Device, dtypes
  2. from tinygrad.helpers import getenv, colorize_float
  3. from extra.optimization.helpers import load_worlds, ast_str_to_lin
  4. from test.external.fuzz_linearizer import get_fuzz_rawbufs
  5. from tinygrad.engine.search import bufs_from_lin
  6. from tinygrad.engine.realize import CompiledRunner
  7. from tinygrad.tensor import _to_np_dtype
  8. import numpy as np
  9. if __name__ == "__main__":
  10. ast_strs = load_worlds(filter_reduce=False, filter_novariable=True)
  11. cudev = Device["CUDA"]
  12. nvdev = Device["NV"]
  13. # NUM=112 python3 test/external/speed_compare_cuda_nv.py
  14. single = getenv("NUM", -1)
  15. if single != -1: ast_strs = ast_strs[single:single+1]
  16. average_tm_cuda, average_tm_nv = 0, 0
  17. for num,ast in enumerate(ast_strs):
  18. # cuda compile
  19. culin = ast_str_to_lin(ast, opts=cudev.renderer)
  20. culin.hand_coded_optimizations()
  21. has_bf16 = any(b.dtype == dtypes.bfloat16 for b in culin.membufs)
  22. cuda_prg = CompiledRunner(culin.to_program())
  23. cubufs = bufs_from_lin(culin)
  24. test_cubufs = get_fuzz_rawbufs(culin) if not has_bf16 else cubufs
  25. rdr = nvdev.renderer
  26. rdr.device = "NV"
  27. nvlin = ast_str_to_lin(ast, opts=rdr)
  28. nvlin.hand_coded_optimizations()
  29. nv_prg = CompiledRunner(nvlin.to_program())
  30. nvbufs = bufs_from_lin(nvlin)
  31. test_nvbufs = get_fuzz_rawbufs(nvlin) if not has_bf16 else nvbufs
  32. if not has_bf16:
  33. for i,rawbuf in enumerate(test_nvbufs): rawbuf.copyin(test_cubufs[i].as_buffer())
  34. # warmup
  35. tm_cuda, tm_nv, failed = [], [], False
  36. try:
  37. cuda_prg(test_cubufs, {}, wait=True)
  38. for i in range(5): tm_cuda.append(cuda_prg(cubufs, {}, wait=True))
  39. except RuntimeError:
  40. print("CUDA FAILED")
  41. tm_cuda = [1e9]
  42. failed = True
  43. try:
  44. nv_prg(test_nvbufs, {}, wait=True)
  45. for i in range(5): tm_nv.append(nv_prg(nvbufs, {}, wait=True))
  46. except RuntimeError:
  47. print("NV FAILED")
  48. tm_nv = [1e9]
  49. failed = True
  50. if not failed and not has_bf16:
  51. curesult = np.frombuffer(test_cubufs[0].as_buffer(), _to_np_dtype(test_cubufs[0].dtype))
  52. nvresult = np.frombuffer(test_nvbufs[0].as_buffer(), _to_np_dtype(test_nvbufs[0].dtype))
  53. np.testing.assert_allclose(curesult, nvresult, rtol=1e-2, atol=1e-2)
  54. average_tm_cuda += min(tm_cuda)
  55. average_tm_nv += min(tm_nv)
  56. ratio = min(tm_nv)/min(tm_cuda)
  57. print(f"{average_tm_nv/average_tm_cuda:5.2f}x -- {num:4d} {colorize_float(ratio)} {min(tm_nv)*1e6:7.2f} us", nvlin.name)
  58. if ratio > 1.04: print(f"NV slower {ratio}", nvlin.ast, nvlin.applied_opts)