# NOTE: this only tests the speed of the LLaMA codegen, it doesn't actually run the net
import unittest, time
from examples.llama import Transformer, MODEL_PARAMS
from tinygrad.tensor import Tensor
from tinygrad import Device
from tinygrad.nn.state import get_state_dict
from tinygrad.device import Allocator
from tinygrad.engine.realize import method_cache
from tinygrad.helpers import Profiling
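
# fake device pieces: FakeProgram accepts a compiled kernel but never executes it,
# and FakeAllocator hands out null buffers, so only the Python-side work is measured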
class FakeProgram:
  def __init__(self, name:str, prg:bytes): pass
  def __call__(self, *bufs, global_size, local_size, vals=(), wait=False): pass

class FakeAllocator(Allocator):
  def _alloc(self, sz, options): return None
  def copyin(self, dest, src:memoryview): pass

class TestLLaMASpeed(unittest.TestCase):
  def test_llama_compile(self):
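    # back up the real runtime, allocator, and compiler so they can be restored at the end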
    backup_program = Device[Device.DEFAULT].runtime
    backup_allocator = Device[Device.DEFAULT].allocator
    backup_compiler = Device[Device.DEFAULT].compiler
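    # swap in the fakes; the compiler is left real until the method cache test below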
    Device[Device.DEFAULT].runtime = FakeProgram
    Device[Device.DEFAULT].allocator = FakeAllocator()
- print("testing llama python run time")
- model = Transformer(**MODEL_PARAMS["1"]["7B"]["args"])
- print("built model")
- # assign fake tensors to the values
- for v in get_state_dict(model).values(): v.assign(Tensor.empty(*v.shape, dtype=v.dtype))
- print("assigned empty tensors, doing warmup")
    def run_llama(st, empty_method_cache=True):
      if empty_method_cache: method_cache.clear()
      tms = [time.perf_counter()]
      for i in range(5):
        model(Tensor([[1,2,3,4]]), i).realize()
        tms.append(time.perf_counter())
      timings = [(tms[i+1]-tms[i])*1000 for i in range(len(tms)-1)]
      print(f"{st:15s} mean runtime: {sum(timings)/len(timings):7.2f}ms, runs: ", ", ".join(f'{x:7.2f}' for x in timings))
- run_llama("codegen(0)")
- run_llama("codegen(1)")

    # with the compiler removed, this run must be served entirely from the method cache
    Device[Device.DEFAULT].compiler = None
    run_llama("methodcache", False)
    with Profiling(sort='time', frac=0.1, fn="/tmp/llama.prof", ts=5):
      run_llama("profile", False)
    Device[Device.DEFAULT].runtime = backup_program
    Device[Device.DEFAULT].allocator = backup_allocator
    Device[Device.DEFAULT].compiler = backup_compiler

if __name__ == '__main__':
  TestLLaMASpeed().test_llama_compile()
  #unittest.main()