# rl.py — REINFORCE-style policy training for tinygrad kernel optimization
  1. import os
  2. import numpy as np
  3. import math, random
  4. from tinygrad.tensor import Tensor
  5. from tinygrad.nn.state import get_parameters, get_state_dict, safe_save, safe_load, load_state_dict
  6. from tinygrad.engine.search import actions, bufs_from_lin, time_linearizer, get_kernel_actions
  7. from tinygrad.nn.optim import Adam
  8. from extra.optimization.extract_policynet import PolicyNet
  9. from extra.optimization.helpers import load_worlds, ast_str_to_lin, lin_to_feats
  10. if __name__ == "__main__":
  11. net = PolicyNet()
  12. if os.path.isfile("/tmp/policynet.safetensors"): load_state_dict(net, safe_load("/tmp/policynet.safetensors"))
  13. optim = Adam(get_parameters(net))
  14. ast_strs = load_worlds()
  15. # select a world
  16. all_feats, all_acts, all_rews = [], [], []
  17. while 1:
  18. Tensor.no_grad, Tensor.training = True, False
  19. lin = ast_str_to_lin(random.choice(ast_strs))
  20. rawbufs = bufs_from_lin(lin)
  21. tm = last_tm = base_tm = time_linearizer(lin, rawbufs)
  22. # take actions
  23. feats, acts, rews = [], [], []
  24. while 1:
  25. feat = lin_to_feats(lin)
  26. feats.append(feat)
  27. probs = net(Tensor([feat])).exp()[0].numpy()
  28. # mask valid actions
  29. valid_action_mask = np.zeros((len(actions)+1), dtype=np.float32)
  30. for x in get_kernel_actions(lin): valid_action_mask[x] = 1
  31. probs *= valid_action_mask
  32. probs /= sum(probs)
  33. act = np.random.choice(len(probs), p=probs)
  34. acts.append(act)
  35. if act == 0:
  36. rews.append(0)
  37. break
  38. try:
  39. lin.apply_opt(actions[act-1])
  40. tm = time_linearizer(lin, rawbufs)
  41. if math.isinf(tm): raise Exception("failed")
  42. rews.append(((last_tm-tm)/base_tm))
  43. last_tm = tm
  44. except Exception:
  45. rews.append(-0.5)
  46. break
  47. #print(f"{tm*1e6:10.2f}", lin.colored_shape())
  48. assert len(feats) == len(acts) and len(acts) == len(rews)
  49. #print(rews)
  50. print(f"***** EPISODE {len(rews)} steps, {sum(rews):5.2f} reward, {base_tm*1e6:12.2f} -> {tm*1e6:12.2f} : {lin.colored_shape()}")
  51. all_feats += feats
  52. all_acts += acts
  53. # rewards to go
  54. for i in range(len(rews)-2, -1, -1): rews[i] += rews[i+1]
  55. all_rews += rews
  56. BS = 32
  57. if len(all_feats) >= BS:
  58. Tensor.no_grad, Tensor.training = False, True
  59. x = Tensor(all_feats[:BS])
  60. mask = np.zeros((BS, len(actions)+1), dtype=np.float32)
  61. mask[range(BS), all_acts[:BS]] = all_rews[:BS]
  62. loss = -(net(x) * Tensor(mask)).mean()
  63. optim.zero_grad()
  64. loss.backward()
  65. optim.step()
  66. all_feats = all_feats[BS:]
  67. all_acts = all_acts[BS:]
  68. all_rews = all_rews[BS:]