# yolov3.py
  1. # https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg
  2. import sys
  3. import io
  4. import time
  5. import math
  6. import cv2
  7. import numpy as np
  8. from PIL import Image
  9. from tinygrad.tensor import Tensor
  10. from tinygrad.nn import BatchNorm2d, Conv2d
  11. from tinygrad.helpers import fetch
  12. def show_labels(prediction, confidence=0.5, num_classes=80):
  13. coco_labels = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names').read_bytes()
  14. coco_labels = coco_labels.decode('utf-8').split('\n')
  15. prediction = prediction.detach().numpy()
  16. conf_mask = (prediction[:,:,4] > confidence)
  17. prediction *= np.expand_dims(conf_mask, 2)
  18. labels = []
  19. # Iterate over batches
  20. for img_pred in prediction:
  21. max_conf = np.amax(img_pred[:,5:5+num_classes], axis=1)
  22. max_conf_score = np.argmax(img_pred[:,5:5+num_classes], axis=1)
  23. max_conf_score = np.expand_dims(max_conf_score, axis=1)
  24. max_conf = np.expand_dims(max_conf, axis=1)
  25. seq = (img_pred[:,:5], max_conf, max_conf_score)
  26. image_pred = np.concatenate(seq, axis=1)
  27. non_zero_ind = np.nonzero(image_pred[:,4])[0]
  28. assert all(image_pred[non_zero_ind,0] > 0)
  29. image_pred_ = np.reshape(image_pred[np.squeeze(non_zero_ind),:], (-1, 7))
  30. classes, indexes = np.unique(image_pred_[:, -1], return_index=True)
  31. for index, coco_class in enumerate(classes):
  32. label, probability = coco_labels[int(coco_class)], image_pred_[indexes[index]][4] * 100
  33. print(f"Detected {label} {probability:.2f}")
  34. labels.append(label)
  35. return labels
def add_boxes(img, prediction):
  """Draw labelled bounding boxes from *prediction* onto *img* and return it.

  img: HxWxC uint8 image, already resized to 608x608 by the callers in this
  file. prediction: rows of [batch, x1, y1, x2, y2, obj, cls_conf, cls] as
  produced by process_results(), which returns the int 0 when nothing was
  detected — hence the isinstance check below.
  """
  if isinstance(prediction, int): # no predictions
    return img
  coco_labels = fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names').read_bytes()
  coco_labels = coco_labels.decode('utf-8').split('\n')
  height, width = img.shape[0:2]
  scale_factor = 608 / width
  # Shift coordinates from the 608x608 network frame toward the image frame.
  # NOTE(review): scale_factor * width == 608 by construction, so the x
  # adjustment below is always zero — presumably meant to mirror the y
  # adjustment for non-square inputs; confirm.
  prediction[:,[1,3]] -= (608 - scale_factor * width) / 2
  prediction[:,[2,4]] -= (608 - scale_factor * height) / 2
  for pred in prediction:
    corner1 = tuple(pred[1:3].astype(int))  # top-left (x1, y1)
    corner2 = tuple(pred[3:5].astype(int))  # nominal bottom-right (x2, y2)
    w = corner2[0] - corner1[0]
    h = corner2[1] - corner1[1]
    # NOTE(review): extending corner2 by (w, h) doubles the box. This looks
    # like it compensates for process_results(), whose in-place corner
    # transform leaves "x2, y2" equal to the box center (a half-sized box)
    # — confirm before changing either function independently.
    corner2 = (corner2[0] + w, corner2[1] + h)
    label = coco_labels[int(pred[-1])]
    img = cv2.rectangle(img, corner1, corner2, (255, 0, 0), 2)
    # Filled background strip sized to the label text, then the text itself
    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1 , 1)[0]
    c2 = corner1[0] + t_size[0] + 3, corner1[1] + t_size[1] + 4
    img = cv2.rectangle(img, corner1, c2, (255, 0, 0), -1)
    img = cv2.putText(img, label, (corner1[0], corner1[1] + t_size[1] + 4), cv2.FONT_HERSHEY_PLAIN, 1, [225,255,255], 1)
  return img
  58. def bbox_iou(box1, box2):
  59. """
  60. Returns the IoU of two bounding boxes
  61. IoU: IoU = Area Of Overlap / Area of Union -> How close the predicted bounding box is
  62. to the ground truth bounding box. Higher IoU = Better accuracy
  63. In training, used to track accuracy. with inference, using to remove duplicate bounding boxes
  64. """
  65. # Get the coordinates of bounding boxes
  66. b1_x1, b1_y1, b1_x2, b1_y2 = box1[:,0], box1[:,1], box1[:,2], box1[:,3]
  67. b2_x1, b2_y1, b2_x2, b2_y2 = box2[:,0], box2[:,1], box2[:,2], box2[:,3]
  68. # get the coordinates of the intersection rectangle
  69. inter_rect_x1 = np.maximum(b1_x1, b2_x1)
  70. inter_rect_y1 = np.maximum(b1_y1, b2_y1)
  71. inter_rect_x2 = np.maximum(b1_x2, b2_x2)
  72. inter_rect_y2 = np.maximum(b1_y2, b2_y2)
  73. #Intersection area
  74. inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, 99999) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, 99999)
  75. #Union Area
  76. b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1)
  77. b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1)
  78. iou = inter_area / (b1_area + b2_area - inter_area)
  79. return iou
  80. def process_results(prediction, confidence=0.9, num_classes=80, nms_conf=0.4):
  81. prediction = prediction.detach().numpy()
  82. conf_mask = (prediction[:,:,4] > confidence)
  83. conf_mask = np.expand_dims(conf_mask, 2)
  84. prediction = prediction * conf_mask
  85. # Non max suppression
  86. box_corner = prediction
  87. box_corner[:,:,0] = (prediction[:,:,0] - prediction[:,:,2]/2)
  88. box_corner[:,:,1] = (prediction[:,:,1] - prediction[:,:,3]/2)
  89. box_corner[:,:,2] = (prediction[:,:,0] + prediction[:,:,2]/2)
  90. box_corner[:,:,3] = (prediction[:,:,1] + prediction[:,:,3]/2)
  91. prediction[:,:,:4] = box_corner[:,:,:4]
  92. write = False
  93. # Process img
  94. img_pred = prediction[0]
  95. max_conf = np.amax(img_pred[:,5:5+num_classes], axis=1)
  96. max_conf_score = np.argmax(img_pred[:,5:5+num_classes], axis=1)
  97. max_conf_score = np.expand_dims(max_conf_score, axis=1)
  98. max_conf = np.expand_dims(max_conf, axis=1)
  99. seq = (img_pred[:,:5], max_conf, max_conf_score)
  100. image_pred = np.concatenate(seq, axis=1)
  101. non_zero_ind = np.nonzero(image_pred[:,4])[0]
  102. assert all(image_pred[non_zero_ind,0] > 0)
  103. image_pred_ = np.reshape(image_pred[np.squeeze(non_zero_ind),:], (-1, 7))
  104. if image_pred_.shape[0] == 0:
  105. print("No detections found!")
  106. return 0
  107. for cls in np.unique(image_pred_[:, -1]):
  108. # perform NMS, get the detections with one particular class
  109. cls_mask = image_pred_*np.expand_dims(image_pred_[:, -1] == cls, axis=1)
  110. class_mask_ind = np.squeeze(np.nonzero(cls_mask[:,-2]))
  111. # class_mask_ind = np.nonzero()
  112. image_pred_class = np.reshape(image_pred_[class_mask_ind], (-1, 7))
  113. # sort the detections such that the entry with the maximum objectness
  114. # confidence is at the top
  115. conf_sort_index = np.argsort(image_pred_class[:,4])
  116. image_pred_class = image_pred_class[conf_sort_index]
  117. for i in range(image_pred_class.shape[0]):
  118. # Get the IOUs of all boxes that come after the one we are looking at in the loop
  119. try:
  120. ious = bbox_iou(np.expand_dims(image_pred_class[i], axis=0), image_pred_class[i+1:])
  121. except:
  122. break
  123. # Zero out all the detections that have IoU > threshold
  124. iou_mask = np.expand_dims((ious < nms_conf), axis=1)
  125. image_pred_class[i+1:] *= iou_mask
  126. # Remove the non-zero entries
  127. non_zero_ind = np.squeeze(np.nonzero(image_pred_class[:,4]))
  128. image_pred_class = np.reshape(image_pred_class[non_zero_ind], (-1, 7))
  129. batch_ind = np.array([[0]])
  130. seq = (batch_ind, image_pred_class)
  131. if not write:
  132. output, write = np.concatenate(seq, axis=1), True
  133. else:
  134. out = np.concatenate(seq, axis=1)
  135. output = np.concatenate((output,out))
  136. return output
  137. def infer(model, img):
  138. img = np.array(Image.fromarray(img).resize((608, 608)))
  139. img = img[:,:,::-1].transpose((2,0,1))
  140. img = img[np.newaxis,:,:,:]/255.0
  141. prediction = model.forward(Tensor(img.astype(np.float32)))
  142. return prediction
  143. def parse_cfg(cfg):
  144. # Return a list of blocks
  145. lines = cfg.decode("utf-8").split('\n')
  146. lines = [x for x in lines if len(x) > 0]
  147. lines = [x for x in lines if x[0] != '#']
  148. lines = [x.rstrip().lstrip() for x in lines]
  149. block, blocks = {}, []
  150. for line in lines:
  151. if line[0] == "[":
  152. if len(block) != 0:
  153. blocks.append(block)
  154. block = {}
  155. block["type"] = line[1:-1].rstrip()
  156. else:
  157. key,value = line.split("=")
  158. block[key.rstrip()] = value.lstrip()
  159. blocks.append(block)
  160. return blocks
  161. # TODO: Speed up this function, avoid copying stuff from GPU to CPU
  162. def predict_transform(prediction, inp_dim, anchors, num_classes):
  163. batch_size = prediction.shape[0]
  164. stride = inp_dim // prediction.shape[2]
  165. grid_size = inp_dim // stride
  166. bbox_attrs = 5 + num_classes
  167. num_anchors = len(anchors)
  168. prediction = prediction.reshape(shape=(batch_size, bbox_attrs*num_anchors, grid_size*grid_size))
  169. prediction = prediction.transpose(1, 2)
  170. prediction = prediction.reshape(shape=(batch_size, grid_size*grid_size*num_anchors, bbox_attrs))
  171. prediction_cpu = prediction.numpy()
  172. for i in (0, 1, 4):
  173. prediction_cpu[:,:,i] = 1 / (1 + np.exp(-prediction_cpu[:,:,i]))
  174. # Add the center offsets
  175. grid = np.arange(grid_size)
  176. a, b = np.meshgrid(grid, grid)
  177. x_offset = a.reshape((-1, 1))
  178. y_offset = b.reshape((-1, 1))
  179. x_y_offset = np.concatenate((x_offset, y_offset), 1)
  180. x_y_offset = np.tile(x_y_offset, (1, num_anchors))
  181. x_y_offset = x_y_offset.reshape((-1,2))
  182. x_y_offset = np.expand_dims(x_y_offset, 0)
  183. anchors = [(a[0]/stride, a[1]/stride) for a in anchors]
  184. anchors = np.tile(anchors, (grid_size*grid_size, 1))
  185. anchors = np.expand_dims(anchors, 0)
  186. prediction_cpu[:,:,:2] += x_y_offset
  187. prediction_cpu[:,:,2:4] = np.exp(prediction_cpu[:,:,2:4])*anchors
  188. prediction_cpu[:,:,5:5+num_classes] = 1 / (1 + np.exp(-prediction_cpu[:,:,5:5+num_classes]))
  189. prediction_cpu[:,:,:4] *= stride
  190. return Tensor(prediction_cpu)
  191. class Darknet:
  192. def __init__(self, cfg):
  193. self.blocks = parse_cfg(cfg)
  194. self.net_info, self.module_list = self.create_modules(self.blocks)
  195. print("Modules length:", len(self.module_list))
  196. def create_modules(self, blocks):
  197. net_info = blocks[0] # Info about model hyperparameters
  198. prev_filters, filters = 3, None
  199. output_filters, module_list = [], []
  200. ## module
  201. for index, x in enumerate(blocks[1:]):
  202. module_type = x["type"]
  203. module = []
  204. if module_type == "convolutional":
  205. try:
  206. batch_normalize, bias = int(x["batch_normalize"]), False
  207. except:
  208. batch_normalize, bias = 0, True
  209. # layer
  210. activation = x["activation"]
  211. filters = int(x["filters"])
  212. padding = int(x["pad"])
  213. pad = (int(x["size"]) - 1) // 2 if padding else 0
  214. module.append(Conv2d(prev_filters, filters, int(x["size"]), int(x["stride"]), pad, bias=bias))
  215. # BatchNorm2d
  216. if batch_normalize:
  217. module.append(BatchNorm2d(filters, eps=1e-05, track_running_stats=True))
  218. # LeakyReLU activation
  219. if activation == "leaky":
  220. module.append(lambda x: x.leakyrelu(0.1))
  221. elif module_type == "maxpool":
  222. size, stride = int(x["size"]), int(x["stride"])
  223. module.append(lambda x: x.max_pool2d(kernel_size=(size, size), stride=stride))
  224. elif module_type == "upsample":
  225. module.append(lambda x: Tensor(x.numpy().repeat(2, axis=-2).repeat(2, axis=-1)))
  226. elif module_type == "route":
  227. x["layers"] = x["layers"].split(",")
  228. # Start of route
  229. start = int(x["layers"][0])
  230. # End if it exists
  231. try:
  232. end = int(x["layers"][1])
  233. except:
  234. end = 0
  235. if start > 0: start -= index
  236. if end > 0: end -= index
  237. module.append(lambda x: x)
  238. if end < 0:
  239. filters = output_filters[index + start] + output_filters[index + end]
  240. else:
  241. filters = output_filters[index + start]
  242. # Shortcut corresponds to skip connection
  243. elif module_type == "shortcut":
  244. module.append(lambda x: x)
  245. elif module_type == "yolo":
  246. mask = list(map(int, x["mask"].split(",")))
  247. anchors = [int(a) for a in x["anchors"].split(",")]
  248. anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors), 2)]
  249. module.append([anchors[i] for i in mask])
  250. # Append to module_list
  251. module_list.append(module)
  252. if filters is not None:
  253. prev_filters = filters
  254. output_filters.append(filters)
  255. return (net_info, module_list)
  256. def dump_weights(self):
  257. for i in range(len(self.module_list)):
  258. module_type = self.blocks[i + 1]["type"]
  259. if module_type == "convolutional":
  260. print(self.blocks[i + 1]["type"], "weights", i)
  261. model = self.module_list[i]
  262. conv = model[0]
  263. print(conv.weight.numpy()[0][0][0])
  264. if conv.bias is not None:
  265. print("biases")
  266. print(conv.bias.shape)
  267. print(conv.bias.numpy()[0][0:5])
  268. else:
  269. print("None biases for layer", i)
  270. def load_weights(self, url):
  271. weights = np.frombuffer(fetch(url).read_bytes(), dtype=np.float32)[5:]
  272. ptr = 0
  273. for i in range(len(self.module_list)):
  274. module_type = self.blocks[i + 1]["type"]
  275. if module_type == "convolutional":
  276. model = self.module_list[i]
  277. try: # we have batchnorm, load conv weights without biases, and batchnorm values
  278. batch_normalize = int(self.blocks[i+1]["batch_normalize"])
  279. except: # no batchnorm, load conv weights + biases
  280. batch_normalize = 0
  281. conv = model[0]
  282. if batch_normalize:
  283. bn = model[1]
  284. # Get the number of weights of batchnorm
  285. num_bn_biases = math.prod(bn.bias.shape)
  286. # Load weights
  287. bn_biases = Tensor(weights[ptr:ptr + num_bn_biases])
  288. ptr += num_bn_biases
  289. bn_weights = Tensor(weights[ptr:ptr+num_bn_biases])
  290. ptr += num_bn_biases
  291. bn_running_mean = Tensor(weights[ptr:ptr+num_bn_biases])
  292. ptr += num_bn_biases
  293. bn_running_var = Tensor(weights[ptr:ptr+num_bn_biases])
  294. ptr += num_bn_biases
  295. # Cast the loaded weights into dims of model weights
  296. bn_biases = bn_biases.reshape(shape=tuple(bn.bias.shape))
  297. bn_weights = bn_weights.reshape(shape=tuple(bn.weight.shape))
  298. bn_running_mean = bn_running_mean.reshape(shape=tuple(bn.running_mean.shape))
  299. bn_running_var = bn_running_var.reshape(shape=tuple(bn.running_var.shape))
  300. # Copy data
  301. bn.bias = bn_biases
  302. bn.weight = bn_weights
  303. bn.running_mean = bn_running_mean
  304. bn.running_var = bn_running_var
  305. else:
  306. # load biases of the conv layer
  307. num_biases = math.prod(conv.bias.shape)
  308. # Load weights
  309. conv_biases = Tensor(weights[ptr: ptr+num_biases])
  310. ptr += num_biases
  311. # Reshape
  312. conv_biases = conv_biases.reshape(shape=tuple(conv.bias.shape))
  313. # Copy
  314. conv.bias = conv_biases
  315. # Load weighys for conv layers
  316. num_weights = math.prod(conv.weight.shape)
  317. conv_weights = Tensor(weights[ptr:ptr+num_weights])
  318. ptr += num_weights
  319. conv_weights = conv_weights.reshape(shape=tuple(conv.weight.shape))
  320. conv.weight = conv_weights
  321. def forward(self, x):
  322. modules = self.blocks[1:]
  323. outputs = {} # Cached outputs for route layer
  324. detections, write = None, False
  325. for i, module in enumerate(modules):
  326. module_type = (module["type"])
  327. if module_type == "convolutional" or module_type == "upsample":
  328. for layer in self.module_list[i]:
  329. x = layer(x)
  330. elif module_type == "route":
  331. layers = module["layers"]
  332. layers = [int(a) for a in layers]
  333. if (layers[0]) > 0:
  334. layers[0] = layers[0] - i
  335. if len(layers) == 1:
  336. x = outputs[i + (layers[0])]
  337. else:
  338. if (layers[1]) > 0: layers[1] = layers[1] - i
  339. map1 = outputs[i + layers[0]]
  340. map2 = outputs[i + layers[1]]
  341. x = Tensor(np.concatenate((map1.numpy(), map2.numpy()), axis=1))
  342. elif module_type == "shortcut":
  343. from_ = int(module["from"])
  344. x = outputs[i - 1] + outputs[i + from_]
  345. elif module_type == "yolo":
  346. anchors = self.module_list[i][0]
  347. inp_dim = int(self.net_info["height"]) # 416
  348. num_classes = int(module["classes"])
  349. x = predict_transform(x, inp_dim, anchors, num_classes)
  350. if not write:
  351. detections, write = x, True
  352. else:
  353. detections = Tensor(np.concatenate((detections.numpy(), x.numpy()), axis=1))
  354. outputs[i] = x
  355. return detections
if __name__ == "__main__":
  # Build the network from the reference cfg and load pretrained weights.
  model = Darknet(fetch('https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg').read_bytes())
  print("Loading weights file (237MB). This might take a while…")
  model.load_weights('https://pjreddie.com/media/files/yolov3.weights')
  # Input source: CLI arg (URL, local path, or 'webcam'); default sample image.
  if len(sys.argv) > 1:
    url = sys.argv[1]
  else:
    url = "https://github.com/ayooshkathuria/pytorch-yolo-v3/raw/master/dog-cycle-car.png"
  if url == 'webcam':
    # Live loop: grab frames, detect, draw boxes until 'q' is pressed.
    # NOTE(review): after quitting this loop, execution falls through to the
    # inference code below with `img` never assigned (NameError) — confirm
    # the intended exit path.
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
    while 1:
      _ = cap.grab() # discard one frame to circumvent capture buffering
      ret, frame = cap.read()
      prediction = process_results(infer(model, frame))
      # BGR -> RGB for PIL, then resize to the 608x608 prediction space
      img = Image.fromarray(frame[:, :, [2,1,0]])
      boxes = add_boxes(np.array(img.resize((608, 608))), prediction)
      boxes = cv2.cvtColor(boxes, cv2.COLOR_RGB2BGR)
      cv2.imshow('yolo', boxes)
      if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    cap.release()
    cv2.destroyAllWindows()
  elif url.startswith('http'):
    # Remote image: fetch bytes and decode with OpenCV (BGR)
    img_stream = io.BytesIO(fetch(url).read_bytes())
    img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
  else:
    img = cv2.imread(url)
  st = time.time()
  print('running inference…')
  prediction = infer(model, img)
  # NOTE(review): format spec "2f" is a minimum width of 2, not ".2f"
  # (two decimal places) — presumably the latter was intended; confirm.
  print(f'did inference in {(time.time() - st):2f}s')
  show_labels(prediction)
  prediction = process_results(prediction)
  # Annotate a 608x608 copy of the input and save it to disk
  boxes = add_boxes(np.array(Image.fromarray(img).resize((608, 608))), prediction)
  cv2.imwrite('boxes.jpg', boxes)