# external_osx_profiling.py (1.2 KB)

  1. from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
  2. from tinygrad import dtypes
  3. import time
  4. N = 1000000
  5. a = CLBuffer(N, dtypes.float32)
  6. b = CLBuffer(N, dtypes.float32)
  7. c = CLBuffer(N, dtypes.float32)
  8. prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
  9. int idx = get_global_id(0);
  10. a[idx] = b[idx] + c[idx];
  11. }""")
  12. prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
  13. t1 = time.monotonic_ns()
  14. e1 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
  15. CL.synchronize()
  16. t2 = time.monotonic_ns()
  17. time.sleep(3)
  18. t3 = time.monotonic_ns()
  19. e2 = prg.clprgs[0](CL.cl_queue[0], [N,], None, a._buf, b._buf, c._buf)
  20. CL.synchronize()
  21. t4 = time.monotonic_ns()
  22. print(e1.profile.queued)
  23. print(e1.profile.submit)
  24. print(e1.profile.start)
  25. print(e1.profile.end)
  26. print(e1, e2)
  27. print(t2-t1, e1.profile.end - e1.profile.start)
  28. print(t4-t3, e2.profile.end - e2.profile.start)
  29. print(t3-t2, e2.profile.queued-e1.profile.end)
  30. print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")
  31. print("ratio since boot", t1/e1.profile.start)
  32. print(e1.profile.start)
  33. print(e1.profile.end)
  34. print(e2.profile.start)
  35. print(e2.profile.end)