選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

common.py 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. # YOLOv5 common modules
  2. import logging
  3. from copy import copy
  4. from pathlib import Path, PosixPath
  5. import math
  6. import numpy as np
  7. import pandas as pd
  8. import requests
  9. import torch
  10. import torch.nn as nn
  11. from PIL import Image
  12. from torch.cuda import amp
  13. from utils.datasets import exif_transpose, letterbox
  14. from utils.general import non_max_suppression, make_divisible, scale_coords, increment_path, xyxy2xywh, save_one_box
  15. from utils.plots import colors, plot_one_box
  16. from utils.torch_utils import time_sync
# Module-level logger, named after this module; used by AutoShape and Detections below.
LOGGER = logging.getLogger(__name__)
  18. def autopad(k, p=None): # kernel, padding
  19. # Pad to 'same'
  20. if p is None:
  21. p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad
  22. return p
  23. def DWConv(c1, c2, k=1, s=1, act=True):
  24. # Depthwise convolution
  25. return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act)
  26. class Conv(nn.Module):
  27. # Standard convolution
  28. def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
  29. super().__init__()
  30. self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
  31. self.bn = nn.BatchNorm2d(c2)
  32. self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())
  33. def forward(self, x):
  34. return self.act(self.bn(self.conv(x)))
  35. def fuseforward(self, x):
  36. return self.act(self.conv(x))
  37. class TransformerLayer(nn.Module):
  38. # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)
  39. def __init__(self, c, num_heads):
  40. super().__init__()
  41. self.q = nn.Linear(c, c, bias=False)
  42. self.k = nn.Linear(c, c, bias=False)
  43. self.v = nn.Linear(c, c, bias=False)
  44. self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
  45. self.fc1 = nn.Linear(c, c, bias=False)
  46. self.fc2 = nn.Linear(c, c, bias=False)
  47. def forward(self, x):
  48. x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
  49. x = self.fc2(self.fc1(x)) + x
  50. return x
  51. class TransformerBlock(nn.Module):
  52. # Vision Transformer https://arxiv.org/abs/2010.11929
  53. def __init__(self, c1, c2, num_heads, num_layers):
  54. super().__init__()
  55. self.conv = None
  56. if c1 != c2:
  57. self.conv = Conv(c1, c2)
  58. self.linear = nn.Linear(c2, c2) # learnable position embedding
  59. self.tr = nn.Sequential(*[TransformerLayer(c2, num_heads) for _ in range(num_layers)])
  60. self.c2 = c2
  61. def forward(self, x):
  62. if self.conv is not None:
  63. x = self.conv(x)
  64. b, _, w, h = x.shape
  65. p = x.flatten(2).unsqueeze(0).transpose(0, 3).squeeze(3)
  66. return self.tr(p + self.linear(p)).unsqueeze(3).transpose(0, 3).reshape(b, self.c2, w, h)
  67. class Bottleneck(nn.Module):
  68. # Standard bottleneck
  69. def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion
  70. super().__init__()
  71. c_ = int(c2 * e) # hidden channels
  72. self.cv1 = Conv(c1, c_, 1, 1)
  73. self.cv2 = Conv(c_, c2, 3, 1, g=g)
  74. self.add = shortcut and c1 == c2
  75. def forward(self, x):
  76. return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
  77. class BottleneckCSP(nn.Module):
  78. # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
  79. def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
  80. super().__init__()
  81. c_ = int(c2 * e) # hidden channels
  82. self.cv1 = Conv(c1, c_, 1, 1)
  83. self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
  84. self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
  85. self.cv4 = Conv(2 * c_, c2, 1, 1)
  86. self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
  87. self.act = nn.LeakyReLU(0.1, inplace=True)
  88. self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
  89. def forward(self, x):
  90. y1 = self.cv3(self.m(self.cv1(x)))
  91. y2 = self.cv2(x)
  92. return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))
  93. class C3(nn.Module):
  94. # CSP Bottleneck with 3 convolutions
  95. def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
  96. super().__init__()
  97. c_ = int(c2 * e) # hidden channels
  98. self.cv1 = Conv(c1, c_, 1, 1)
  99. self.cv2 = Conv(c1, c_, 1, 1)
  100. self.cv3 = Conv(2 * c_, c2, 1) # act=FReLU(c2)
  101. self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
  102. # self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)])
  103. def forward(self, x):
  104. return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
  105. class C3TR(C3):
  106. # C3 module with TransformerBlock()
  107. def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
  108. super().__init__(c1, c2, n, shortcut, g, e)
  109. c_ = int(c2 * e)
  110. self.m = TransformerBlock(c_, c_, 4, n)
  111. class C3SPP(C3):
  112. # C3 module with SPP()
  113. def __init__(self, c1, c2, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5):
  114. super().__init__(c1, c2, n, shortcut, g, e)
  115. c_ = int(c2 * e)
  116. self.m = SPP(c_, c_, k)
  117. class SPP(nn.Module):
  118. # Spatial pyramid pooling layer used in YOLOv3-SPP
  119. def __init__(self, c1, c2, k=(5, 9, 13)):
  120. super().__init__()
  121. c_ = c1 // 2 # hidden channels
  122. self.cv1 = Conv(c1, c_, 1, 1)
  123. self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
  124. self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])
  125. def forward(self, x):
  126. x = self.cv1(x)
  127. return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
  128. class Focus(nn.Module):
  129. # Focus wh information into c-space
  130. def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups
  131. super().__init__()
  132. self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
  133. # self.contract = Contract(gain=2)
  134. def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2)
  135. return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
  136. # return self.conv(self.contract(x))
  137. class Contract(nn.Module):
  138. # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)
  139. def __init__(self, gain=2):
  140. super().__init__()
  141. self.gain = gain
  142. def forward(self, x):
  143. N, C, H, W = x.size() # assert (H / s == 0) and (W / s == 0), 'Indivisible gain'
  144. s = self.gain
  145. x = x.view(N, C, H // s, s, W // s, s) # x(1,64,40,2,40,2)
  146. x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # x(1,2,2,64,40,40)
  147. return x.view(N, C * s * s, H // s, W // s) # x(1,256,40,40)
  148. class Expand(nn.Module):
  149. # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)
  150. def __init__(self, gain=2):
  151. super().__init__()
  152. self.gain = gain
  153. def forward(self, x):
  154. N, C, H, W = x.size() # assert C / s ** 2 == 0, 'Indivisible gain'
  155. s = self.gain
  156. x = x.view(N, s, s, C // s ** 2, H, W) # x(1,2,2,16,80,80)
  157. x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # x(1,16,80,2,80,2)
  158. return x.view(N, C // s ** 2, H * s, W * s) # x(1,16,160,160)
  159. class Concat(nn.Module):
  160. # Concatenate a list of tensors along dimension
  161. def __init__(self, dimension=1):
  162. super().__init__()
  163. self.d = dimension
  164. def forward(self, x):
  165. return torch.cat(x, self.d)
  166. class AutoShape(nn.Module):
  167. # YOLOv5 input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS
  168. conf = 0.25 # NMS confidence threshold
  169. iou = 0.45 # NMS IoU threshold
  170. classes = None # (optional list) filter by class
  171. max_det = 1000 # maximum number of detections per image
  172. def __init__(self, model):
  173. super().__init__()
  174. self.model = model.eval()
  175. def autoshape(self):
  176. LOGGER.info('AutoShape already enabled, skipping... ') # model already converted to model.autoshape()
  177. return self
  178. @torch.no_grad()
  179. def forward(self, imgs, size=640, augment=False, profile=False):
  180. # Inference from various sources. For height=640, width=1280, RGB images example inputs are:
  181. # filename: imgs = 'data/images/zidane.jpg' # str or PosixPath
  182. # URI: = 'https://ultralytics.com/images/zidane.jpg'
  183. # OpenCV: = cv2.imread('image.jpg')[:,:,::-1] # HWC BGR to RGB x(640,1280,3)
  184. # PIL: = Image.open('image.jpg') # HWC x(640,1280,3)
  185. # numpy: = np.zeros((640,1280,3)) # HWC
  186. # torch: = torch.zeros(16,3,320,640) # BCHW (scaled to size=640, 0-1 values)
  187. # multiple: = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...] # list of images
  188. t = [time_sync()]
  189. p = next(self.model.parameters()) # for device and type
  190. if isinstance(imgs, torch.Tensor): # torch
  191. with amp.autocast(enabled=p.device.type != 'cpu'):
  192. return self.model(imgs.to(p.device).type_as(p), augment, profile) # inference
  193. # Pre-process
  194. n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs]) # number of images, list of images
  195. shape0, shape1, files = [], [], [] # image and inference shapes, filenames
  196. for i, im in enumerate(imgs):
  197. f = f'image{i}' # filename
  198. if isinstance(im, (str, PosixPath)): # filename or uri
  199. im, f = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im), im
  200. im = np.asarray(exif_transpose(im))
  201. elif isinstance(im, Image.Image): # PIL Image
  202. im, f = np.asarray(exif_transpose(im)), getattr(im, 'filename', f) or f
  203. files.append(Path(f).with_suffix('.jpg').name)
  204. if im.shape[0] < 5: # image in CHW
  205. im = im.transpose((1, 2, 0)) # reverse dataloader .transpose(2, 0, 1)
  206. im = im[..., :3] if im.ndim == 3 else np.tile(im[..., None], 3) # enforce 3ch input
  207. s = im.shape[:2] # HWC
  208. shape0.append(s) # image shape
  209. g = (size / max(s)) # gain
  210. shape1.append([y * g for y in s])
  211. imgs[i] = im if im.data.contiguous else np.ascontiguousarray(im) # update
  212. shape1 = [make_divisible(x, int(self.stride.max())) for x in np.stack(shape1, 0).max(0)] # inference shape
  213. x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs] # pad
  214. x = np.stack(x, 0) if n > 1 else x[0][None] # stack
  215. x = np.ascontiguousarray(x.transpose((0, 3, 1, 2))) # BHWC to BCHW
  216. x = torch.from_numpy(x).to(p.device).type_as(p) / 255. # uint8 to fp16/32
  217. t.append(time_sync())
  218. with amp.autocast(enabled=p.device.type != 'cpu'):
  219. # Inference
  220. y = self.model(x, augment, profile)[0] # forward
  221. t.append(time_sync())
  222. # Post-process
  223. y = non_max_suppression(y, self.conf, iou_thres=self.iou, classes=self.classes, max_det=self.max_det) # NMS
  224. for i in range(n):
  225. scale_coords(shape1, y[i][:, :4], shape0[i])
  226. t.append(time_sync())
  227. return Detections(imgs, y, files, t, self.names, x.shape)
  228. class Detections:
  229. # YOLOv5 detections class for inference results
  230. def __init__(self, imgs, pred, files, times=None, names=None, shape=None):
  231. super().__init__()
  232. d = pred[0].device # device
  233. gn = [torch.tensor([*[im.shape[i] for i in [1, 0, 1, 0]], 1., 1.], device=d) for im in imgs] # normalizations
  234. self.imgs = imgs # list of images as numpy arrays
  235. self.pred = pred # list of tensors pred[0] = (xyxy, conf, cls)
  236. self.names = names # class names
  237. self.files = files # image filenames
  238. self.xyxy = pred # xyxy pixels
  239. self.xywh = [xyxy2xywh(x) for x in pred] # xywh pixels
  240. self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)] # xyxy normalized
  241. self.xywhn = [x / g for x, g in zip(self.xywh, gn)] # xywh normalized
  242. self.n = len(self.pred) # number of images (batch size)
  243. self.t = tuple((times[i + 1] - times[i]) * 1000 / self.n for i in range(3)) # timestamps (ms)
  244. self.s = shape # inference BCHW shape
  245. def display(self, pprint=False, show=False, save=False, crop=False, render=False, save_dir=Path('')):
  246. for i, (im, pred) in enumerate(zip(self.imgs, self.pred)):
  247. str = f'image {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} '
  248. if pred.shape[0]:
  249. for c in pred[:, -1].unique():
  250. n = (pred[:, -1] == c).sum() # detections per class
  251. str += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, " # add to string
  252. if show or save or render or crop:
  253. for *box, conf, cls in reversed(pred): # xyxy, confidence, class
  254. label = f'{self.names[int(cls)]} {conf:.2f}'
  255. if crop:
  256. save_one_box(box, im, file=save_dir / 'crops' / self.names[int(cls)] / self.files[i])
  257. else: # all others
  258. plot_one_box(box, im, label=label, color=colors(cls))
  259. else:
  260. str += '(no detections)'
  261. im = Image.fromarray(im.astype(np.uint8)) if isinstance(im, np.ndarray) else im # from np
  262. if pprint:
  263. LOGGER.info(str.rstrip(', '))
  264. if show:
  265. im.show(self.files[i]) # show
  266. if save:
  267. f = self.files[i]
  268. im.save(save_dir / f) # save
  269. if i == self.n - 1:
  270. LOGGER.info(f"Saved {self.n} image{'s' * (self.n > 1)} to '{save_dir}'")
  271. if render:
  272. self.imgs[i] = np.asarray(im)
  273. def print(self):
  274. self.display(pprint=True) # print results
  275. LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {tuple(self.s)}' %
  276. self.t)
  277. def show(self):
  278. self.display(show=True) # show results
  279. def save(self, save_dir='runs/detect/exp'):
  280. save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/detect/exp', mkdir=True) # increment save_dir
  281. self.display(save=True, save_dir=save_dir) # save results
  282. def crop(self, save_dir='runs/detect/exp'):
  283. save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/detect/exp', mkdir=True) # increment save_dir
  284. self.display(crop=True, save_dir=save_dir) # crop results
  285. LOGGER.info(f'Saved results to {save_dir}\n')
  286. def render(self):
  287. self.display(render=True) # render results
  288. return self.imgs
  289. def pandas(self):
  290. # return detections as pandas DataFrames, i.e. print(results.pandas().xyxy[0])
  291. new = copy(self) # return copy
  292. ca = 'xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name' # xyxy columns
  293. cb = 'xcenter', 'ycenter', 'width', 'height', 'confidence', 'class', 'name' # xywh columns
  294. for k, c in zip(['xyxy', 'xyxyn', 'xywh', 'xywhn'], [ca, ca, cb, cb]):
  295. a = [[x[:5] + [int(x[5]), self.names[int(x[5])]] for x in x.tolist()] for x in getattr(self, k)] # update
  296. setattr(new, k, [pd.DataFrame(x, columns=c) for x in a])
  297. return new
  298. def tolist(self):
  299. # return a list of Detections objects, i.e. 'for result in results.tolist():'
  300. x = [Detections([self.imgs[i]], [self.pred[i]], self.names, self.s) for i in range(self.n)]
  301. for d in x:
  302. for k in ['imgs', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
  303. setattr(d, k, getattr(d, k)[0]) # pop out of list
  304. return x
  305. def __len__(self):
  306. return self.n
  307. class Classify(nn.Module):
  308. # Classification head, i.e. x(b,c1,20,20) to x(b,c2)
  309. def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups
  310. super().__init__()
  311. self.aap = nn.AdaptiveAvgPool2d(1) # to x(b,c1,1,1)
  312. self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g) # to x(b,c2,1,1)
  313. self.flat = nn.Flatten()
  314. def forward(self, x):
  315. z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1) # cat if list
  316. return self.flat(self.conv(z)) # flatten to x(b,c2)