# vit.py — Vision Transformer (ViT) model definition and pretrained-weight loader.
  1. import numpy as np
  2. from tinygrad.tensor import Tensor
  3. from tinygrad.helpers import fetch
  4. from extra.models.transformer import TransformerBlock
  5. class ViT:
  6. def __init__(self, layers=12, embed_dim=192, num_heads=3):
  7. self.embedding = (Tensor.uniform(embed_dim, 3, 16, 16), Tensor.zeros(embed_dim))
  8. self.embed_dim = embed_dim
  9. self.cls = Tensor.ones(1, 1, embed_dim)
  10. self.pos_embedding = Tensor.ones(1, 197, embed_dim)
  11. self.tbs = [
  12. TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=embed_dim*4,
  13. prenorm=True, act=lambda x: x.gelu())
  14. for i in range(layers)]
  15. self.encoder_norm = (Tensor.uniform(embed_dim), Tensor.zeros(embed_dim))
  16. self.head = (Tensor.uniform(embed_dim, 1000), Tensor.zeros(1000))
  17. def patch_embed(self, x):
  18. x = x.conv2d(*self.embedding, stride=16)
  19. x = x.reshape(shape=(x.shape[0], x.shape[1], -1)).permute(order=(0,2,1))
  20. return x
  21. def forward(self, x):
  22. ce = self.cls.add(Tensor.zeros(x.shape[0],1,1))
  23. pe = self.patch_embed(x)
  24. x = ce.cat(pe, dim=1)
  25. x = x.add(self.pos_embedding).sequential(self.tbs)
  26. x = x.layernorm().linear(*self.encoder_norm)
  27. return x[:, 0].linear(*self.head)
  28. def load_from_pretrained(m):
  29. # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
  30. if m.embed_dim == 192:
  31. url = "https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz"
  32. elif m.embed_dim == 768:
  33. url = "https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz"
  34. else:
  35. raise Exception("no pretrained weights for configuration")
  36. dat = np.load(fetch(url))
  37. #for x in dat.keys():
  38. # print(x, dat[x].shape, dat[x].dtype)
  39. m.embedding[0].assign(np.transpose(dat['embedding/kernel'], (3,2,0,1)))
  40. m.embedding[1].assign(dat['embedding/bias'])
  41. m.cls.assign(dat['cls'])
  42. m.head[0].assign(dat['head/kernel'])
  43. m.head[1].assign(dat['head/bias'])
  44. m.pos_embedding.assign(dat['Transformer/posembed_input/pos_embedding'])
  45. m.encoder_norm[0].assign(dat['Transformer/encoder_norm/scale'])
  46. m.encoder_norm[1].assign(dat['Transformer/encoder_norm/bias'])
  47. for i in range(12):
  48. m.tbs[i].query[0].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/query/kernel'].reshape(m.embed_dim, m.embed_dim))
  49. m.tbs[i].query[1].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/query/bias'].reshape(m.embed_dim))
  50. m.tbs[i].key[0].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/key/kernel'].reshape(m.embed_dim, m.embed_dim))
  51. m.tbs[i].key[1].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/key/bias'].reshape(m.embed_dim))
  52. m.tbs[i].value[0].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/value/kernel'].reshape(m.embed_dim, m.embed_dim))
  53. m.tbs[i].value[1].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/value/bias'].reshape(m.embed_dim))
  54. m.tbs[i].out[0].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/out/kernel'].reshape(m.embed_dim, m.embed_dim))
  55. m.tbs[i].out[1].assign(dat[f'Transformer/encoderblock_{i}/MultiHeadDotProductAttention_1/out/bias'].reshape(m.embed_dim))
  56. m.tbs[i].ff1[0].assign(dat[f'Transformer/encoderblock_{i}/MlpBlock_3/Dense_0/kernel'])
  57. m.tbs[i].ff1[1].assign(dat[f'Transformer/encoderblock_{i}/MlpBlock_3/Dense_0/bias'])
  58. m.tbs[i].ff2[0].assign(dat[f'Transformer/encoderblock_{i}/MlpBlock_3/Dense_1/kernel'])
  59. m.tbs[i].ff2[1].assign(dat[f'Transformer/encoderblock_{i}/MlpBlock_3/Dense_1/bias'])
  60. m.tbs[i].ln1[0].assign(dat[f'Transformer/encoderblock_{i}/LayerNorm_0/scale'])
  61. m.tbs[i].ln1[1].assign(dat[f'Transformer/encoderblock_{i}/LayerNorm_0/bias'])
  62. m.tbs[i].ln2[0].assign(dat[f'Transformer/encoderblock_{i}/LayerNorm_2/scale'])
  63. m.tbs[i].ln2[1].assign(dat[f'Transformer/encoderblock_{i}/LayerNorm_2/bias'])