831 lines
35 KiB
Python
831 lines
35 KiB
Python
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
|
||
"""
|
||
Common modules
|
||
"""
|
||
|
||
import logging
|
||
import math
|
||
import warnings
|
||
from copy import copy
|
||
from pathlib import Path
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
import requests
|
||
import torch
|
||
import torch.nn as nn
|
||
import torch.nn.functional as F
|
||
from PIL import Image
|
||
from typing import Optional
|
||
from torch.cuda import amp
|
||
|
||
from utils.datasets import exif_transpose, letterbox
|
||
from utils.general import (colorstr, increment_path, make_divisible, non_max_suppression, save_one_box, scale_coords,
|
||
xyxy2xywh)
|
||
from utils.plots import Annotator, colors
|
||
from utils.torch_utils import time_sync
|
||
|
||
LOGGER = logging.getLogger(__name__)
|
||
|
||
|
||
def autopad(k, p=None):  # kernel, padding
    """Return the padding to use for kernel size ``k`` (pad to 'same').

    Args:
        k: Kernel size, either an int or a sequence of per-dimension ints.
        p: Explicit padding. When given (anything but ``None``) it is returned
            unchanged.

    Returns:
        ``p`` if supplied, otherwise ``k // 2`` (computed element-wise and
        returned as a list when ``k`` is a sequence).
    """
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [size // 2 for size in k]
|
||
|
||
class Conv(nn.Module):
    """Standard convolution block: Conv2d (no bias) -> BatchNorm2d -> activation.

    ``act=True`` selects SiLU, an ``nn.Module`` instance is used as-is, and any
    other value disables the activation (Identity).
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__()
        padding = autopad(k, p)
        self.conv = nn.Conv2d(c1, c2, k, s, padding, groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        if act is True:
            self.act = nn.SiLU()
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, batch norm and activation."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)

    def forward_fuse(self, x):
        """Apply convolution and activation only, skipping ``self.bn``."""
        y = self.conv(x)
        return self.act(y)
|
||
|
||
|
||
class DWConv(Conv):
    """Depth-wise convolution: a ``Conv`` whose group count is ``gcd(c1, c2)``."""

    def __init__(self, c1, c2, k=1, s=1, act=True):  # ch_in, ch_out, kernel, stride, activation
        groups = math.gcd(c1, c2)
        super().__init__(c1, c2, k, s, g=groups, act=act)
|
||
|
||
class ChannelAttentionModule(nn.Module):
    """Channel attention gate: a shared two-layer MLP over avg- and max-pooled features.

    Args:
        c1: Number of input channels.
        reduction: Bottleneck ratio of the shared MLP. The hidden width is
            ``c1 // reduction`` but never less than 1, so inputs with fewer
            channels than ``reduction`` still get a usable bottleneck instead
            of a zero-width layer.
    """

    def __init__(self, c1, reduction=16):
        super().__init__()
        # Clamp to 1: c1 // reduction is 0 for c1 < reduction, which would
        # create empty Linear layers and make the gate independent of the input.
        mid_channel = max(1, c1 // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.shared_MLP = nn.Sequential(
            nn.Linear(in_features=c1, out_features=mid_channel),
            nn.ReLU(),
            nn.Linear(in_features=mid_channel, out_features=c1),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-channel gates of shape (batch, c1, 1, 1) with values in (0, 1)."""
        batch = x.size(0)
        avgout = self.shared_MLP(self.avg_pool(x).view(batch, -1))
        maxout = self.shared_MLP(self.max_pool(x).view(batch, -1))
        gate = self.sigmoid(avgout + maxout)
        return gate.unsqueeze(2).unsqueeze(3)
|
||
|
||
class SpatialAttentionModule(nn.Module):
    """Spatial attention gate built from channel-wise mean and max maps.

    The two single-channel maps are concatenated, passed through a 7x7
    convolution (padding 3, so spatial size is kept) and squashed by a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.conv2d = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=7, stride=1, padding=3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a gate of shape (batch, 1, H, W) for input (batch, C, H, W)."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]
        stacked = torch.cat((channel_mean, channel_max), dim=1)
        return self.sigmoid(self.conv2d(stacked))
|
||
|
||
class CBAM(nn.Module):
    """Channel attention followed by spatial attention, each applied multiplicatively.

    ``c2`` is accepted for signature compatibility but is not used; the output
    has the same shape as the input.
    """

    def __init__(self, c1, c2):
        super().__init__()
        self.channel_attention = ChannelAttentionModule(c1)
        self.spatial_attention = SpatialAttentionModule()

    def forward(self, x):
        """Re-weight ``x`` by its channel gate, then by the spatial gate of the result."""
        channel_refined = self.channel_attention(x) * x
        spatial_gate = self.spatial_attention(channel_refined)
        return spatial_gate * channel_refined
|
||
|
||
class TransformerLayer(nn.Module):
    """Pre-norm transformer encoder layer: multi-head attention plus a 4x-wide ReLU MLP.

    Args:
        c: Embedding dimension.
        num_heads: Number of attention heads passed to ``nn.MultiheadAttention``.
    """

    def __init__(self, c, num_heads):
        super().__init__()

        self.ln1 = nn.LayerNorm(c)
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.ln2 = nn.LayerNorm(c)
        self.fc1 = nn.Linear(c, 4 * c, bias=False)
        self.fc2 = nn.Linear(4 * c, c, bias=False)
        self.dropout = nn.Dropout(0.1)
        self.act = nn.ReLU(True)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers, each with a residual connection."""
        # Attention sub-layer on the normalised input.
        normed = self.ln1(x)
        attn_out = self.ma(self.q(normed), self.k(normed), self.v(normed))[0]
        x = self.dropout(attn_out) + x

        # Feed-forward sub-layer; dropout is applied after the activation and after fc2.
        hidden = self.act(self.fc1(self.ln2(x)))
        ffn_out = self.fc2(self.dropout(hidden))
        return x + self.dropout(ffn_out)
|
||
|
||
|
||
class TransformerBlock(nn.Module):
    """Stack of ``TransformerLayer`` modules over a flattened feature map.

    Vision Transformer https://arxiv.org/abs/2010.11929

    Args:
        c1: Input channels.
        c2: Output channels; a ``Conv`` projects the input only when ``c1 != c2``.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked layers.
    """

    def __init__(self, c1, c2, num_heads, num_layers):
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        layers = (TransformerLayer(c2, num_heads) for _ in range(num_layers))
        self.tr = nn.Sequential(*layers)
        self.c2 = c2

    def forward(self, x):
        """Flatten the two spatial dims into a sequence, run the layers, restore the 4-D shape."""
        if self.conv is not None:
            x = self.conv(x)
        b, _, w, h = x.shape
        # (b, c, w, h) -> (w*h, b, c): sequence-first layout for nn.MultiheadAttention
        tokens = x.flatten(2).permute(2, 0, 1)
        encoded = self.tr(tokens + self.linear(tokens))
        # (w*h, b, c) -> (b, c, w*h) -> (b, c2, w, h)
        return encoded.permute(1, 2, 0).reshape(b, self.c2, w, h)
|
||
|
||
def drop_path_f(x, drop_prob: float = 0., training: bool = False):
    """Drop paths (stochastic depth) per sample.

    Intended for the main path of residual blocks. Each sample along dim 0 is
    either zeroed or scaled by ``1 / (1 - drop_prob)``.

    Args:
        x: Input tensor; dim 0 is treated as the sample dimension.
        drop_prob: Probability of dropping a sample's path.
        training: When False the input is returned untouched.

    Returns:
        ``x`` itself when ``drop_prob`` is 0 or ``training`` is False, otherwise
        a new tensor with the per-sample mask applied.
    """
    if not training or drop_prob == 0.:
        return x

    keep_prob = 1 - drop_prob
    # One random value per sample, broadcastable over all remaining dims
    # (works for tensors of any rank, not just 2D ConvNets).
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = keep_prob + torch.rand(mask_shape, dtype=x.dtype, device=x.device)
    mask.floor_()  # binarize: 1 with probability keep_prob, else 0
    return x.div(keep_prob) * mask
|
||
|
||
|
||
class DropPath(nn.Module):
    """Drop paths (stochastic depth) per sample, as a module wrapper around ``drop_path_f``.

    Args:
        drop_prob: Probability of dropping a sample's path. ``None`` (the
            default) is treated as 0.0, i.e. the module is a no-op; previously
            ``None`` reached the arithmetic in ``drop_path_f`` and raised a
            ``TypeError`` in training mode.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = 0. if drop_prob is None else drop_prob

    def forward(self, x):
        """Apply stochastic depth; active only while the module is in training mode."""
        return drop_path_f(x, self.drop_prob, self.training)
|
||
|
||
def window_partition(x, window_size: int):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size (M)

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    rows, cols = H // window_size, W // window_size
    tiled = x.view(B, rows, window_size, cols, window_size, C)
    # permute: [B, H//M, M, W//M, M, C] -> [B, H//M, W//M, M, M, C]
    grouped = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    # view: [B, H//M, W//M, M, M, C] -> [B*num_windows, M, M, C]
    return grouped.view(-1, window_size, window_size, C)
|
||
|
||
def window_reverse(windows, window_size: int, H: int, W: int):
    """Reassemble windows into a feature map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size (M)
        H (int): Height of the feature map; expected to be a multiple of M
        W (int): Width of the feature map; expected to be a multiple of M

    Returns:
        x: (B, H, W, C)
    """
    rows, cols = H // window_size, W // window_size
    # Exact integer arithmetic for the batch size (no float division / int() truncation).
    B = windows.shape[0] // (rows * cols)
    # view: [B*num_windows, M, M, C] -> [B, H//M, W//M, M, M, C]
    x = windows.view(B, rows, cols, window_size, window_size, -1)
    # permute: [B, H//M, W//M, M, M, C] -> [B, H//M, M, W//M, M, C]
    # view: [B, H//M, M, W//M, M, C] -> [B, H, W, C]
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
||
|
||
class Mlp(nn.Module):
    """MLP as used in Vision Transformer, MLP-Mixer and related networks.

    Args:
        in_features: Input width.
        hidden_features: Hidden width; falls back to ``in_features`` when falsy.
        out_features: Output width; falls back to ``in_features`` when falsy.
        act_layer: Activation class, instantiated without arguments.
        drop: Dropout probability used after the activation and after ``fc2``.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_width = out_features if out_features else in_features
        hidden_width = hidden_features if hidden_features else in_features

        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop)
        self.fc2 = nn.Linear(hidden_width, out_width)
        self.drop2 = nn.Dropout(drop)

    def forward(self, x):
        """Compute fc1 -> act -> dropout -> fc2 -> dropout."""
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
|
||
|
||
class WindowAttention(nn.Module):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # [Mh, Mw]
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        win_h, win_w = window_size[0], window_size[1]
        span_h, span_w = 2 * win_h - 1, 2 * win_w - 1  # number of distinct offsets per axis

        # One learnable bias per (relative offset, head): [2*Mh-1 * 2*Mw-1, nH]
        self.relative_position_bias_table = nn.Parameter(torch.zeros(span_h * span_w, num_heads))

        # Pair-wise relative position index for every token pair inside a window.
        grid = torch.meshgrid([torch.arange(win_h), torch.arange(win_w)], indexing="ij")
        coords_flatten = torch.flatten(torch.stack(grid), 1)  # [2, Mh*Mw]
        offsets = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # [2, Mh*Mw, Mh*Mw]
        row_offset = offsets[0] + (win_h - 1)  # shift to start from 0
        col_offset = offsets[1] + (win_w - 1)
        # Row-major flattening of the (row_offset, col_offset) pair into a table index.
        relative_position_index = row_offset * span_w + col_offset  # [Mh*Mw, Mh*Mw]
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        nn.init.trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask: Optional[torch.Tensor] = None):
        """
        Args:
            x: input features with shape of (num_windows*B, Mh*Mw, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        heads = self.num_heads

        # qkv(): -> [B_, N, 3*C]; reshape: -> [B_, N, 3, nH, C//nH]; permute: -> [3, B_, nH, N, C//nH]
        qkv = self.qkv(x).reshape(B_, N, 3, heads, C // heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        # scaled dot product: [B_, nH, N, C//nH] @ [B_, nH, C//nH, N] -> [B_, nH, N, N]
        attn = (q * self.scale) @ k.transpose(-2, -1)

        # Gather the bias for every token pair: [N*N, nH] -> [N, N, nH] -> [nH, N, N]
        tokens = self.window_size[0] * self.window_size[1]
        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        bias = bias.view(tokens, tokens, -1).permute(2, 0, 1).contiguous()
        attn = attn + bias.unsqueeze(0)

        if mask is not None:
            # mask: [nW, N, N] is broadcast over batch and heads as [1, nW, 1, N, N]
            nW = mask.shape[0]  # num_windows
            attn = attn.view(B_ // nW, nW, heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, heads, N, N)

        attn = self.attn_drop(self.softmax(attn))

        # [B_, nH, N, C//nH] -> [B_, N, nH, C//nH] -> [B_, N, C]
        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        return self.proj_drop(self.proj(x))
|
||
|
||
class SwinTransformerLayer(nn.Module):
    """One Swin Transformer layer: (shifted-)window attention plus an MLP, both residual.

    Swin Transformer https://arxiv.org/abs/2103.14030

    Args:
        c: Number of channels.
        num_heads: Attention heads. Layers with more than 10 heads always use
            a drop-path rate of 0.1, overriding ``drop_path``.
        window_size: Side length of the square attention window.
        shift_size: Cyclic shift applied before windowing; 0 disables shifting
            (and the attention mask).
        mlp_ratio: Hidden width of the MLP relative to ``c``.
        qkv_bias: Whether the qkv projection has a bias.
        drop: Dropout rate for the attention projection and the MLP.
        attn_drop: Dropout rate on the attention weights.
        drop_path: Stochastic-depth rate.
        act_layer: Activation class for the MLP.
        norm_layer: Normalisation class, called as ``norm_layer(c)``.
    """

    def __init__(self, c, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4, qkv_bias=False, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        if num_heads > 10:
            drop_path = 0.1
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio

        self.norm1 = norm_layer(c)
        self.attn = WindowAttention(
            c, window_size=(self.window_size, self.window_size), num_heads=num_heads, qkv_bias=qkv_bias,
            attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(c)
        mlp_hidden_dim = int(c * mlp_ratio)
        self.mlp = Mlp(in_features=c, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def create_mask(self, x, H, W):
        """Build the attention mask for shifted-window attention (SW-MSA).

        Args:
            x: Tensor whose device and dtype the mask should match.
            H, W: Sizes of dims 1 and 2 of the (unpadded) channel-last feature map.

        Returns:
            Tensor of shape [nW, Mh*Mw, Mh*Mw] holding 0 where two tokens of a
            window come from the same region and -100 where they do not.
        """
        ws = self.window_size
        # Round both sizes up to a multiple of the window size.
        Hp = int(np.ceil(H / ws)) * ws
        Wp = int(np.ceil(W / ws)) * ws
        # Same channel-last layout as the feature map so window_partition can be reused.
        # Created in x's dtype so half-precision models do not get a float32 mask.
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device, dtype=x.dtype)  # [1, Hp, Wp, 1]
        # Three bands per axis; all entries must be slices. (A bare tuple here
        # would be treated as an index list and label only two rows.)
        bands = (slice(0, -ws),
                 slice(-ws, -self.shift_size),
                 slice(-self.shift_size, None))
        cnt = 0
        for h in bands:
            for w in bands:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, ws)  # [nW, Mh, Mw, 1]
        mask_windows = mask_windows.view(-1, ws * ws)  # [nW, Mh*Mw]
        # [nW, 1, Mh*Mw] - [nW, Mh*Mw, 1] -> [nW, Mh*Mw, Mh*Mw]; non-zero means "different region"
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        return attn_mask.masked_fill(attn_mask != 0, -100.0)

    def forward(self, x):
        """Apply the layer to a channel-first 4-D tensor and return a tensor of the same shape."""
        b, c, w, h = x.shape
        x = x.permute(0, 3, 2, 1).contiguous()  # [b,h,w,c]

        shifted = self.shift_size > 0
        # The mask is only needed (and only used) for shifted windows.
        attn_mask = self.create_mask(x, h, w) if shifted else None  # [nW, Mh*Mw, Mh*Mw]
        shortcut = x
        x = self.norm1(x)

        # Pad the high end of both spatial dims up to a multiple of the window size.
        pad_l = pad_t = 0
        pad_r = (self.window_size - w % self.window_size) % self.window_size
        pad_b = (self.window_size - h % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, hp, wp, _ = x.shape

        if shifted:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_x = x

        x_windows = window_partition(shifted_x, self.window_size)  # [nW*B, Mh, Mw, C]
        x_windows = x_windows.view(-1, self.window_size * self.window_size, c)  # [nW*B, Mh*Mw, C]

        attn_windows = self.attn(x_windows, mask=attn_mask)  # [nW*B, Mh*Mw, C]

        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, c)  # [nW*B, Mh, Mw, C]
        shifted_x = window_reverse(attn_windows, self.window_size, hp, wp)  # [B, H', W', C]

        if shifted:
            # Undo the cyclic shift.
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            # Remove the padding added above.
            x = x[:, :h, :w, :].contiguous()

        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        x = x.permute(0, 3, 2, 1).contiguous()
        return x  # (b, c, w, h)
|
||
|
||
class SwinTransformerBlock(nn.Module):
    """Sequence of ``SwinTransformerLayer`` modules alternating plain and shifted windows.

    Even-indexed layers use ``shift_size=0``; odd-indexed layers shift by
    ``window_size // 2``. A ``Conv`` projects the input only when ``c1 != c2``.
    """

    def __init__(self, c1, c2, num_heads, num_layers, window_size=8):
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)

        self.window_size = window_size
        self.shift_size = window_size // 2
        layers = []
        for i in range(num_layers):
            shift = 0 if (i % 2 == 0) else self.shift_size
            layers.append(SwinTransformerLayer(c2, num_heads=num_heads, window_size=window_size, shift_size=shift))
        self.tr = nn.Sequential(*layers)

    def forward(self, x):
        """Optionally project channels, then run the layer stack."""
        if self.conv is not None:
            x = self.conv(x)
        return self.tr(x)
|
||
|
||
class Bottleneck(nn.Module):
    """Standard bottleneck: 1x1 conv to hidden channels, 3x3 conv back, optional residual add.

    The residual connection is used only when ``shortcut`` is truthy and
    ``c1 == c2``.
    """

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Return ``x + cv2(cv1(x))`` when the shortcut is active, else ``cv2(cv1(x))``."""
        y = self.cv2(self.cv1(x))
        if self.add:
            return x + y
        return y
|
||
|
||
|
||
class BottleneckCSP(nn.Module):
    """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks

    Two branches are computed from the input (one through ``n`` bottlenecks),
    concatenated, normalised, activated and fused by a final 1x1 ``Conv``.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = nn.Conv2d(c1, hidden, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(hidden, hidden, 1, 1, bias=False)
        self.cv4 = Conv(2 * hidden, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * hidden)  # applied to cat(cv2, cv3)
        self.act = nn.SiLU()
        blocks = (Bottleneck(hidden, hidden, shortcut, g, e=1.0) for _ in range(n))
        self.m = nn.Sequential(*blocks)

    def forward(self, x):
        """Fuse the bottleneck branch and the plain 1x1 branch."""
        deep = self.cv3(self.m(self.cv1(x)))
        skip = self.cv2(x)
        merged = torch.cat((deep, skip), dim=1)
        return self.cv4(self.act(self.bn(merged)))
|
||
|
||
|
||
class C3(nn.Module):
    """CSP Bottleneck with 3 convolutions.

    ``cv1`` feeds the inner module ``m`` (``n`` bottlenecks), ``cv2`` is a
    parallel 1x1 branch, and ``cv3`` fuses their concatenation.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super().__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1)  # act=FReLU(c2)
        blocks = (Bottleneck(hidden, hidden, shortcut, g, e=1.0) for _ in range(n))
        self.m = nn.Sequential(*blocks)

    def forward(self, x):
        """Concatenate the processed branch with the plain branch and fuse with ``cv3``."""
        processed = self.m(self.cv1(x))
        plain = self.cv2(x)
        return self.cv3(torch.cat((processed, plain), dim=1))
|
||
|
||
|
||
class C3TR(C3):
    """C3 module whose inner module is a ``TransformerBlock`` with 4 heads and ``n`` layers."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        hidden = int(c2 * e)
        self.m = TransformerBlock(hidden, hidden, 4, n)
|
||
|
||
class C3STR(C3):
    """C3 module whose inner module is a ``SwinTransformerBlock`` with ``n`` layers.

    The head count is one head per 32 hidden channels, but never fewer than
    one: for hidden widths below 32 the integer division would give 0 heads
    and fail inside the attention module.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        num_heads = max(1, c_ // 32)
        self.m = SwinTransformerBlock(c_, c_, num_heads, n)
|
||
|
||
|
||
class C3SPP(C3):
    """C3 module whose inner module is an ``SPP`` layer with kernel sizes ``k``."""

    def __init__(self, c1, c2, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        hidden = int(c2 * e)
        self.m = SPP(hidden, hidden, k)
|
||
|
||
|
||
class C3Ghost(C3):
    """C3 module whose inner module is a stack of ``n`` ``GhostBottleneck`` blocks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        hidden = int(c2 * e)  # hidden channels
        blocks = (GhostBottleneck(hidden, hidden) for _ in range(n))
        self.m = nn.Sequential(*blocks)
|
||
|
||
|
||
class SPP(nn.Module):
    """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729

    The input is reduced to ``c1 // 2`` channels, max-pooled in parallel with
    each kernel size in ``k`` (stride 1, padding ``k // 2``), and the pooled
    maps are concatenated with the reduced input before a final 1x1 ``Conv``.
    """

    def __init__(self, c1, c2, k=(5, 9, 13)):
        super().__init__()
        hidden = c1 // 2  # hidden channels
        branches = len(k) + 1  # the un-pooled map plus one per kernel size
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden * branches, c2, 1, 1)
        pools = [nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2) for size in k]
        self.m = nn.ModuleList(pools)

    def forward(self, x):
        """Return the fused multi-scale pooled features."""
        x = self.cv1(x)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning
            pooled = [pool(x) for pool in self.m]
            return self.cv2(torch.cat([x] + pooled, 1))
|
||
|
||
class ASPP(nn.Module):
    """Atrous Spatial Pyramid Pooling (ASPP) layer.

    After a 1x1 reduction to ``c1 // 2`` channels, the features go through a
    3x3 max-pool and one dilated 3x3 convolution per entry of ``k``. Each
    entry ``x`` sets both dilation and padding to ``(x - 1) // 2``. All
    branches plus the reduced input are concatenated and fused by a 1x1 ``Conv``.
    """

    def __init__(self, c1, c2, k=(5, 9, 13)):
        super().__init__()
        hidden = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        dilated = []
        for size in k:
            rate = (size - 1) // 2
            dilated.append(nn.Conv2d(hidden, hidden, kernel_size=3, stride=1, padding=rate, dilation=rate, bias=False))
        self.m = nn.ModuleList(dilated)
        self.cv2 = Conv(hidden * (len(k) + 2), c2, 1, 1)

    def forward(self, x):
        """Return the fused features from the identity, max-pool and dilated branches."""
        x = self.cv1(x)
        branches = [x, self.maxpool(x)]
        branches += [conv(x) for conv in self.m]
        return self.cv2(torch.cat(branches, 1))
|
||
|
||
class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.

    A single max-pool of size ``k`` is applied three times in sequence; the
    reduced input and the three successive results are concatenated and fused.
    """

    def __init__(self, c1, c2, k=5):  # equivalent to SPP(k=(5, 9, 13))
        super().__init__()
        hidden = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(hidden * 4, c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Return the fused features of the cascaded poolings."""
        x = self.cv1(x)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning
            stages = [x]
            for _ in range(3):
                stages.append(self.m(stages[-1]))
            return self.cv2(torch.cat(stages, 1))
|
||
|
||
|
||
class Focus(nn.Module):
    """Focus wh information into c-space.

    The four 2x2 pixel phases of the input are stacked along the channel
    dimension and passed through a ``Conv``.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def forward(self, x):  # x(b,c,w,h) -> y(b,4c,w/2,h/2)
        """Stack the pixel phases and convolve."""
        # Phase order (offset on dim -2, offset on dim -1): (0,0), (1,0), (0,1), (1,1)
        phases = [x[..., first::2, second::2] for second in (0, 1) for first in (0, 1)]
        return self.conv(torch.cat(phases, 1))
|
||
|
||
|
||
class GhostConv(nn.Module):
    """Ghost Convolution https://github.com/huawei-noah/ghostnet

    Half of the output channels come from a regular ``Conv``; the other half
    are produced from those by a 5x5 convolution with one group per channel.
    """

    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):  # ch_in, ch_out, kernel, stride, groups
        super().__init__()
        half = c2 // 2  # hidden channels
        self.cv1 = Conv(c1, half, k, s, None, g, act)
        self.cv2 = Conv(half, half, 5, 1, None, half, act)

    def forward(self, x):
        """Return the primary features concatenated with the features derived from them."""
        primary = self.cv1(x)
        derived = self.cv2(primary)
        return torch.cat([primary, derived], 1)
|
||
|
||
|
||
class GhostBottleneck(nn.Module):
    """Ghost Bottleneck https://github.com/huawei-noah/ghostnet

    Main path: GhostConv -> (depth-wise conv when ``s == 2``) -> linear GhostConv.
    The shortcut is the identity unless ``s == 2``, in which case it is a
    depth-wise conv followed by a 1x1 conv.
    """

    def __init__(self, c1, c2, k=3, s=1):  # ch_in, ch_out, kernel, stride
        super().__init__()
        hidden = c2 // 2
        downsample = s == 2

        pointwise = GhostConv(c1, hidden, 1, 1)  # pw
        depthwise = DWConv(hidden, hidden, k, s, act=False) if downsample else nn.Identity()  # dw
        linear = GhostConv(hidden, c2, 1, 1, act=False)  # pw-linear
        self.conv = nn.Sequential(pointwise, depthwise, linear)

        if downsample:
            self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False),
                                          Conv(c1, c2, 1, 1, act=False))
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """Return the sum of the main path and the shortcut path."""
        main = self.conv(x)
        return main + self.shortcut(x)
|
||
|
||
|
||
class Contract(nn.Module):
    """Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40).

    Both spatial sizes are expected to be divisible by ``gain``.
    """

    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        """Move each ``gain x gain`` spatial block into the channel dimension."""
        s = self.gain
        b, c, h, w = x.size()
        out_h, out_w = h // s, w // s
        blocks = x.view(b, c, out_h, s, out_w, s)  # x(1,64,40,2,40,2)
        blocks = blocks.permute(0, 3, 5, 1, 2, 4).contiguous()  # x(1,2,2,64,40,40)
        return blocks.view(b, c * s * s, out_h, out_w)  # x(1,256,40,40)
|
||
|
||
|
||
class Expand(nn.Module):
    """Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160).

    The channel count is expected to be divisible by ``gain ** 2``.
    """

    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        """Spread groups of ``gain ** 2`` channels over ``gain x gain`` spatial blocks."""
        s = self.gain
        b, c, h, w = x.size()
        out_c = c // s ** 2
        blocks = x.view(b, s, s, out_c, h, w)  # x(1,2,2,16,80,80)
        blocks = blocks.permute(0, 3, 4, 1, 5, 2).contiguous()  # x(1,16,80,2,80,2)
        return blocks.view(b, out_c, h * s, w * s)  # x(1,16,160,160)
|
||
|
||
|
||
class Concat(nn.Module):
    """Concatenate a list of tensors along a fixed dimension (default 1)."""

    def __init__(self, dimension=1):
        super().__init__()
        self.d = dimension

    def forward(self, x):
        """Return ``torch.cat`` of the tensors in ``x`` along ``self.d``."""
        return torch.cat(x, dim=self.d)
|
||
|
||
|
||
class AutoShape(nn.Module):
    # YOLOv5 input-robust model wrapper for passing cv2/np/PIL/torch inputs. Includes preprocessing, inference and NMS
    # NOTE(review): forward() reads self.stride and self.names, which are not assigned anywhere in this class;
    # presumably the caller attaches them to the wrapper after construction - confirm.
    conf = 0.25  # NMS confidence threshold
    iou = 0.45  # NMS IoU threshold
    classes = None  # (optional list) filter by class, i.e. = [0, 15, 16] for COCO persons, cats and dogs
    multi_label = False  # NMS multiple labels per box
    max_det = 1000  # maximum number of detections per image

    def __init__(self, model):
        """Wrap ``model`` and switch it to eval mode."""
        super().__init__()
        self.model = model.eval()

    def autoshape(self):
        """Log that the wrapper is already active and return ``self`` unchanged."""
        LOGGER.info('AutoShape already enabled, skipping... ')  # model already converted to model.autoshape()
        return self

    def _apply(self, fn):
        """Apply ``fn`` to the module and to the tensors stored as plain attributes on the last layer."""
        # Apply to(), cpu(), cuda(), half() to model tensors that are not parameters or registered buffers
        self = super()._apply(fn)
        m = self.model.model[-1]  # Detect()
        m.stride = fn(m.stride)
        m.grid = list(map(fn, m.grid))
        if isinstance(m.anchor_grid, list):
            m.anchor_grid = list(map(fn, m.anchor_grid))
        return self

    @torch.no_grad()
    def forward(self, imgs, size=640, augment=False, profile=False):
        """Run pre-processing, inference and NMS on one image or a list of images.

        A ``torch.Tensor`` input skips pre- and post-processing and returns the raw
        output of ``self.model``; any other input yields a ``Detections`` object.
        When ``imgs`` is a list, its entries are replaced in place by the decoded
        numpy images.
        """
        # Inference from various sources. For height=640, width=1280, RGB images example inputs are:
        #   file:       imgs = 'data/images/zidane.jpg'  # str or PosixPath
        #   URI:             = 'https://ultralytics.com/images/zidane.jpg'
        #   OpenCV:          = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(640,1280,3)
        #   PIL:             = Image.open('image.jpg') or ImageGrab.grab()  # HWC x(640,1280,3)
        #   numpy:           = np.zeros((640,1280,3))  # HWC
        #   torch:           = torch.zeros(16,3,320,640)  # BCHW (scaled to size=640, 0-1 values)
        #   multiple:        = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images

        t = [time_sync()]  # timestamps: start, after pre-process, after inference, after NMS
        p = next(self.model.parameters())  # for device and type
        if isinstance(imgs, torch.Tensor):  # torch
            with amp.autocast(enabled=p.device.type != 'cpu'):
                return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference

        # Pre-process
        n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
        shape0, shape1, files = [], [], []  # image and inference shapes, filenames
        for i, im in enumerate(imgs):
            f = f'image{i}'  # filename
            if isinstance(im, (str, Path)):  # filename or uri
                im, f = Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im), im
                im = np.asarray(exif_transpose(im))
            elif isinstance(im, Image.Image):  # PIL Image
                im, f = np.asarray(exif_transpose(im)), getattr(im, 'filename', f) or f
            files.append(Path(f).with_suffix('.jpg').name)
            if im.shape[0] < 5:  # image in CHW
                im = im.transpose((1, 2, 0))  # reverse dataloader .transpose(2, 0, 1)
            im = im[..., :3] if im.ndim == 3 else np.tile(im[..., None], 3)  # enforce 3ch input
            s = im.shape[:2]  # HWC
            shape0.append(s)  # image shape
            g = (size / max(s))  # gain
            shape1.append([y * g for y in s])
            imgs[i] = im if im.data.contiguous else np.ascontiguousarray(im)  # update
        shape1 = [make_divisible(x, int(self.stride.max())) for x in np.stack(shape1, 0).max(0)]  # inference shape
        x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
        x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
        x = np.ascontiguousarray(x.transpose((0, 3, 1, 2)))  # BHWC to BCHW
        x = torch.from_numpy(x).to(p.device).type_as(p) / 255  # uint8 to fp16/32
        t.append(time_sync())

        with amp.autocast(enabled=p.device.type != 'cpu'):
            # Inference
            y = self.model(x, augment, profile)[0]  # forward
            t.append(time_sync())

            # Post-process
            y = non_max_suppression(y, self.conf, iou_thres=self.iou, classes=self.classes,
                                    multi_label=self.multi_label, max_det=self.max_det)  # NMS
            for i in range(n):
                # presumably rescales the boxes in place from the inference shape to the original image shape,
                # since the return value is discarded - confirm against scale_coords
                scale_coords(shape1, y[i][:, :4], shape0[i])

            t.append(time_sync())
            return Detections(imgs, y, files, t, self.names, x.shape)
|
||
|
||
|
||
class Detections:
    """YOLOv5 detections class for inference results.

    Args:
        imgs: List of images as numpy arrays.
        pred: List of per-image prediction tensors, rows being (xyxy, conf, cls).
        files: List of image filenames, parallel to ``imgs``.
        times: Sequence of four timestamps delimiting pre-process, inference and
            NMS. ``None`` leaves the per-stage timings at zero.
        names: Class names indexed by class id.
        shape: Inference BCHW shape.
    """

    def __init__(self, imgs, pred, files, times=None, names=None, shape=None):
        super().__init__()
        d = pred[0].device  # device
        gn = [torch.tensor([*(im.shape[i] for i in [1, 0, 1, 0]), 1, 1], device=d) for im in imgs]  # normalizations
        self.imgs = imgs  # list of images as numpy arrays
        self.pred = pred  # list of tensors pred[0] = (xyxy, conf, cls)
        self.names = names  # class names
        self.files = files  # image filenames
        self.xyxy = pred  # xyxy pixels
        self.xywh = [xyxy2xywh(x) for x in pred]  # xywh pixels
        self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)]  # xyxy normalized
        self.xywhn = [x / g for x, g in zip(self.xywh, gn)]  # xywh normalized
        self.n = len(self.pred)  # number of images (batch size)
        self.times = times  # raw timestamps, kept so tolist() can rebuild per-image objects
        if times is None:
            # The declared default used to crash on indexing; report zero timings instead.
            self.t = (0., 0., 0.)
        else:
            self.t = tuple((times[i + 1] - times[i]) * 1000 / self.n for i in range(3))  # timestamps (ms)
        self.s = shape  # inference BCHW shape

    def display(self, pprint=False, show=False, save=False, crop=False, render=False, save_dir=Path('')):
        """Log, show, save, crop and/or render the results, depending on the flags.

        Returns:
            The list of crop dicts when ``crop`` is true, otherwise ``None``.
        """
        crops = []
        for i, (im, pred) in enumerate(zip(self.imgs, self.pred)):
            s = f'image {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} '  # string
            if pred.shape[0]:
                for c in pred[:, -1].unique():
                    n = (pred[:, -1] == c).sum()  # detections per class
                    s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string
                if show or save or render or crop:
                    annotator = Annotator(im, example=str(self.names))
                    for *box, conf, cls in reversed(pred):  # xyxy, confidence, class
                        label = f'{self.names[int(cls)]} {conf:.2f}'
                        if crop:
                            file = save_dir / 'crops' / self.names[int(cls)] / self.files[i] if save else None
                            crops.append({'box': box, 'conf': conf, 'cls': cls, 'label': label,
                                          'im': save_one_box(box, im, file=file, save=save)})
                        else:  # all others
                            annotator.box_label(box, label, color=colors(cls))
                    im = annotator.im
            else:
                s += '(no detections)'

            im = Image.fromarray(im.astype(np.uint8)) if isinstance(im, np.ndarray) else im  # from np
            if pprint:
                LOGGER.info(s.rstrip(', '))
            if show:
                im.show(self.files[i])  # show
            if save:
                f = self.files[i]
                im.save(save_dir / f)  # save
                if i == self.n - 1:
                    LOGGER.info(f"Saved {self.n} image{'s' * (self.n > 1)} to {colorstr('bold', save_dir)}")
            if render:
                self.imgs[i] = np.asarray(im)
        if crop:
            if save:
                LOGGER.info(f'Saved results to {save_dir}\n')
            return crops

    def print(self):
        """Log a per-image summary followed by the per-stage timings."""
        self.display(pprint=True)  # print results
        LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {tuple(self.s)}' %
                    self.t)

    def show(self):
        """Show every annotated image."""
        self.display(show=True)  # show results

    def save(self, save_dir='runs/detect/exp'):
        """Save the annotated images under ``save_dir`` (incremented when left at its default)."""
        save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/detect/exp', mkdir=True)  # increment save_dir
        self.display(save=True, save_dir=save_dir)  # save results

    def crop(self, save=True, save_dir='runs/detect/exp'):
        """Return one crop dict per detection, optionally saving the crops to disk."""
        save_dir = increment_path(save_dir, exist_ok=save_dir != 'runs/detect/exp', mkdir=True) if save else None
        return self.display(crop=True, save=save, save_dir=save_dir)  # crop results

    def render(self):
        """Draw the detections into ``self.imgs`` and return that list."""
        self.display(render=True)  # render results
        return self.imgs

    def pandas(self):
        """Return a shallow copy whose box attributes are lists of pandas DataFrames.

        i.e. print(results.pandas().xyxy[0])
        """
        new = copy(self)  # return copy
        ca = 'xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name'  # xyxy columns
        cb = 'xcenter', 'ycenter', 'width', 'height', 'confidence', 'class', 'name'  # xywh columns
        for k, c in zip(['xyxy', 'xyxyn', 'xywh', 'xywhn'], [ca, ca, cb, cb]):
            a = [[x[:5] + [int(x[5]), self.names[int(x[5])]] for x in x.tolist()] for x in getattr(self, k)]  # update
            setattr(new, k, [pd.DataFrame(x, columns=c) for x in a])
        return new

    def tolist(self):
        """Return a list of single-image Detections objects, i.e. 'for result in results.tolist():'."""
        # Arguments are passed in constructor order (imgs, pred, files, times, names, shape);
        # previously names and shape were passed in the files and times slots.
        x = [Detections([self.imgs[i]], [self.pred[i]], [self.files[i]], self.times, self.names, self.s)
             for i in range(self.n)]
        for d in x:
            for k in ['imgs', 'pred', 'xyxy', 'xyxyn', 'xywh', 'xywhn']:
                setattr(d, k, getattr(d, k)[0])  # pop out of list
        return x

    def __len__(self):
        """Return the number of images in the batch."""
        return self.n
|
||
|
||
|
||
class Classify(nn.Module):
    """Classification head, i.e. x(b,c1,20,20) to x(b,c2).

    Each input is globally average-pooled; a list of inputs is concatenated
    along the channel dimension before the convolution.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__()
        self.aap = nn.AdaptiveAvgPool2d(1)  # to x(b,c1,1,1)
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g)  # to x(b,c2,1,1)
        self.flat = nn.Flatten()

    def forward(self, x):
        """Pool, convolve and flatten a tensor or a list of tensors."""
        inputs = x if isinstance(x, list) else [x]
        pooled = torch.cat([self.aap(y) for y in inputs], 1)  # cat if list
        return self.flat(self.conv(pooled))  # flatten to x(b,c2)
|