# -- config.py ------------------------------------------------------------
"""Configurate arguments."""
import argparse
import os
import os.path

from torch.utils.data import Dataset


# Side length (pixels) of the square image fed to the network.
INPUT_IMAGE_SIZE = 512
# 0: confidence, 1: offset_x, 2: offset_y, 3: cos(direction), 4: sin(direction)
NUM_FEATURE_MAP_CHANNEL = 5
# image_size / 2^5 = 512 / 32 = 16
FEATURE_MAP_SIZE = 16


def add_common_arguments(parser):
    """Add arguments shared by training and inference to *parser* in place."""
    parser.add_argument('--detector_weights',
                        help="The weights of pretrained detector.")
    parser.add_argument('--depth_factor', type=int, default=32,
                        help="Depth factor.")
    parser.add_argument('--disable_cuda', action='store_true',
                        help="Disable CUDA.")
    parser.add_argument('--gpu_id', type=int, default=1,
                        help="Select which gpu to use.")


def get_parser_for_training():
    """Return argument parser for training."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_directory', required=True,
                        help="The location of dataset.")
    parser.add_argument('--optimizer_weights',
                        help="The weights of optimizer.")
    parser.add_argument('--batch_size', type=int, default=16,
                        help="Batch size.")
    parser.add_argument('--num_epochs', type=int, default=100,
                        help="Number of epochs to train for.")
    parser.add_argument('--lr', type=float, default=1e-3,
                        help="The learning rate of back propagation.")
    parser.add_argument('--enable_visdom', action='store_true',
                        help="Enable Visdom to visualize training progress")
    add_common_arguments(parser)
    return parser


def get_parser_for_inference():
    """Return argument parser for inference."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', required=True, choices=['image', 'video'],
                        help="Inference image or video.")
    parser.add_argument('--video',
                        help="Video path if you choose to inference video.")
    parser.add_argument('--thresh', type=float, default=0.5,
                        help="Detection threshold.")
    parser.add_argument('--timing', action='store_true',
                        help="Perform timing during reference.")
    parser.add_argument('--save', action='store_true',
                        help="Save detection result to file.")
    add_common_arguments(parser)
    return parser


# -- data.py --------------------------------------------------------------
class ParkingSlotDataset(Dataset):
    """Parking slot dataset.

    Expects `root` to contain file pairs `<name>.bmp` (input image) and
    `<name>.txt` (labels, one marking point per line as space-separated
    floats). Samples are discovered from the .txt files.
    """

    def __init__(self, root):
        super(ParkingSlotDataset, self).__init__()
        # Lazy import: only dataset users need torchvision installed.
        from torchvision import transforms
        self.root = root
        self.sample_names = []
        self.image_transform = transforms.Compose([
            # Use the configured input size instead of a hard-coded 512.
            transforms.Resize((INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE)),
            transforms.ToTensor(),
        ])
        # `file_name` instead of `file` so the builtin is not shadowed.
        for file_name in os.listdir(root):
            if file_name.endswith(".txt"):
                self.sample_names.append(os.path.splitext(file_name)[0])

    def __getitem__(self, index):
        from PIL import Image  # lazy: keep module importable without Pillow
        name = self.sample_names[index]
        image = Image.open(os.path.join(self.root, name + '.bmp'))
        image = self.image_transform(image)
        marking_points = []
        with open(os.path.join(self.root, name + '.txt'), 'r') as label_file:
            for line in label_file:
                marking_points.append(tuple(float(n) for n in line.split()))
        return image, marking_points

    def __len__(self):
        return len(self.sample_names)
# -- detector.py ----------------------------------------------------------
"""Directional point detector network definition."""
import torch
from torch import nn
from network import define_halve_unit, define_detector_block


class YetAnotherDarknet(nn.modules.Module):
    """Yet another darknet, imitating darknet-53 with depth of darknet-19."""

    def __init__(self, input_channel_size, depth_factor):
        super(YetAnotherDarknet, self).__init__()
        layers = []
        # Stage 0: stem convolution at full resolution.
        layers += [nn.Conv2d(input_channel_size, depth_factor, kernel_size=3,
                             stride=1, padding=1, bias=False)]
        layers += [nn.BatchNorm2d(depth_factor)]
        layers += [nn.LeakyReLU(0.1)]
        # Stage 1: halve resolution, double channels.
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        # Stage 2
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        # Stage 3
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        layers += define_detector_block(depth_factor)
        # Stage 4
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        layers += define_detector_block(depth_factor)
        # Stage 5: total downsampling factor is 2^5 = 32.
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        self.model = nn.Sequential(*layers)

    def forward(self, *x):
        return self.model(x[0])


class DirectionalPointDetector(nn.modules.Module):
    """Detector for point with direction."""

    def __init__(self, input_channel_size, depth_factor, output_channel_size):
        super(DirectionalPointDetector, self).__init__()
        self.extract_feature = YetAnotherDarknet(input_channel_size,
                                                 depth_factor)
        layers = []
        layers += define_detector_block(16 * depth_factor)
        layers += define_detector_block(16 * depth_factor)
        layers += [nn.Conv2d(32 * depth_factor, output_channel_size,
                             kernel_size=1, stride=1, padding=0, bias=False)]
        self.predict = nn.Sequential(*layers)

    def forward(self, *x):
        feature = self.extract_feature(x[0])
        prediction = self.predict(feature)
        # Split channels [0:3] (confidence, offset_x, offset_y) from
        # channels [3:5] (cos, sin); squash to [0, 1] and [-1, 1].
        point_pred, angle_pred = torch.split(prediction, 3, dim=1)
        # torch.sigmoid/torch.tanh replace the deprecated
        # nn.functional.sigmoid / nn.functional.tanh variants.
        point_pred = torch.sigmoid(point_pred)
        angle_pred = torch.tanh(angle_pred)
        return torch.cat((point_pred, angle_pred), dim=1)
# -- inference.py ---------------------------------------------------------
"""Inference demo of directional point detector."""
import math
import cv2 as cv
import numpy as np
import torch
from torchvision.transforms import ToTensor
import config
from detector import DirectionalPointDetector
from utils import get_marking_points, Timer


def plot_marking_points(image, marking_points):
    """Plot marking points on the image and show.

    Each marking point is (x, y, direction, ...) with x/y normalized to
    [0, 1]; a fixed 50-pixel arrow is drawn along the direction.
    """
    height = image.shape[0]
    width = image.shape[1]
    for marking_point in marking_points:
        # Scale normalized coordinates to pixels.
        p0_x = width * marking_point[0]
        p0_y = height * marking_point[1]
        p1_x = p0_x + 50 * math.cos(marking_point[2])
        p1_y = p0_y + 50 * math.sin(marking_point[2])
        p0_x = int(round(p0_x))
        p0_y = int(round(p0_y))
        p1_x = int(round(p1_x))
        p1_y = int(round(p1_y))
        cv.arrowedLine(image, (p0_x, p0_y), (p1_x, p1_y), (0, 0, 255))
    cv.imshow('demo', image)
    cv.waitKey(1)


def preprocess_image(image):
    """Resize a numpy image to the network input size and return a 1xCxHxW
    float tensor."""
    # Use the configured input size instead of a hard-coded 512.
    size = config.INPUT_IMAGE_SIZE
    if image.shape[0] != size or image.shape[1] != size:
        image = cv.resize(image, (size, size))
    return torch.unsqueeze(ToTensor()(image), 0)


def detect_video(detector, device, args):
    """Demo for detecting video."""
    timer = Timer()
    input_video = cv.VideoCapture(args.video)
    frame_width = int(input_video.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_height = int(input_video.get(cv.CAP_PROP_FRAME_HEIGHT))
    output_video = cv.VideoWriter()
    if args.save:
        output_video.open('record.avi', cv.VideoWriter_fourcc(*'MJPG'),
                          input_video.get(cv.CAP_PROP_FPS),
                          (frame_width, frame_height))
    # Reuse a single frame buffer for every read to avoid per-frame
    # allocation.
    frame = np.empty([frame_height, frame_width, 3], dtype=np.uint8)
    while input_video.read(frame)[0]:
        if args.timing:
            timer.tic()
        prediction = detector(preprocess_image(frame).to(device))
        if args.timing:
            timer.toc()
        pred_points = get_marking_points(prediction[0], args.thresh)
        plot_marking_points(frame, pred_points)
        if args.save:
            output_video.write(frame)
    input_video.release()
    output_video.release()


def detect_image(detector, device, args):
    """Demo for detecting images."""
    image_file = input('Enter image file path: ')
    image = cv.imread(image_file)
    # cv.imread returns None (no exception) on a bad path.
    if image is None:
        print("Failed to read image:", image_file)
        return
    prediction = detector(preprocess_image(image).to(device))
    pred_points = get_marking_points(prediction[0], args.thresh)
    plot_marking_points(image, pred_points)


def inference_detector(args):
    """Inference demo of directional point detector."""
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    device = torch.device("cuda:" + str(args.gpu_id) if args.cuda else "cpu")
    dp_detector = DirectionalPointDetector(3, args.depth_factor, 5).to(device)
    dp_detector.load_state_dict(torch.load(args.detector_weights))
    # Switch to evaluation mode: the original left the network in training
    # mode, so BatchNorm used per-image batch statistics during inference.
    dp_detector.eval()
    # No gradients are needed at inference time.
    with torch.no_grad():
        if args.mode == "image":
            detect_image(dp_detector, device, args)
        elif args.mode == "video":
            detect_video(dp_detector, device, args)


if __name__ == '__main__':
    inference_detector(config.get_parser_for_inference().parse_args())
# -- log.py ---------------------------------------------------------------
"""Training logger with optional Visdom visualization."""
import math
import numpy as np


class Logger():
    """Logger for training.

    With `curve_names` given, a Visdom connection is opened and loss curves
    and sample images can be plotted; without it only prints to stdout.
    """

    def __init__(self, curve_names=None):
        self.curve_names = curve_names
        if curve_names:
            # Lazy import: training without --enable_visdom constructs a
            # bare Logger() and must not require visdom to be installed.
            from visdom import Visdom
            self.vis = Visdom()
            assert self.vis.check_connection()
        # Buffered rows of per-step values, one column per curve name.
        self.curve_y = None
        # Half-open step range [start, end) not yet plotted.
        self.curve_x_start = 0
        self.curve_x_end = 0

    def log(self, **kwargs):
        """Print all key/value pairs and buffer curve values for plotting."""
        print("##############################################################")
        for key, value in kwargs.items():
            print(key, value, sep='\t')
        if not self.curve_names:
            return
        curve_step = np.array([kwargs[cn] for cn in self.curve_names])
        if self.curve_y is None:
            self.curve_y = curve_step
        else:
            self.curve_y = np.row_stack((self.curve_y, curve_step))
        self.curve_x_end = self.curve_x_end + 1

    def plot_curve(self):
        """Plot buffered curve values on visdom and reset the buffer."""
        if (self.curve_x_end - self.curve_x_start < 2 or not self.curve_names):
            return
        # The first call creates the window; later calls append to it.
        update_opt = None if self.curve_x_start == 0 else 'append'
        curve_x = np.arange(self.curve_x_start, self.curve_x_end)
        curve_x = np.transpose(np.tile(curve_x, (len(self.curve_names), 1)))
        self.vis.line(Y=self.curve_y, X=curve_x, win='loss', update=update_opt,
                      opts=dict(showlegend=True, legend=self.curve_names))
        self.curve_x_start = self.curve_x_end
        self.curve_y = None

    def plot_marking_points(self, image, marking_points, win_name='mk_points'):
        """Draw marking points on a PIL image and show it on visdom."""
        from PIL import ImageDraw  # lazy: only needed for visualization
        width, height = image.size
        draw = ImageDraw.Draw(image)
        for point in marking_points:
            p0_x = width * point[0]
            p0_y = height * point[1]
            p1_x = p0_x + 50 * math.cos(point[2])
            p1_y = p0_y + 50 * math.sin(point[2])
            draw.line((p0_x, p0_y, p1_x, p1_y), fill=(255, 0, 0))
        image = np.asarray(image, dtype="uint8")
        # Visdom expects CHW channel ordering.
        image = np.transpose(image, (2, 0, 1))
        self.vis.image(image, win=win_name)


# -- network.py -----------------------------------------------------------
from torch import nn


def define_squeeze_unit(basic_channel_size):
    """Define a 1x1 squeeze convolution with norm and activation."""
    conv = nn.Conv2d(2 * basic_channel_size, basic_channel_size, kernel_size=1,
                     stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(basic_channel_size)
    relu = nn.LeakyReLU(0.1)
    return [conv, norm, relu]


def define_expand_unit(basic_channel_size):
    """Define a 3x3 expand convolution with norm and activation."""
    conv = nn.Conv2d(basic_channel_size, 2 * basic_channel_size, kernel_size=3,
                     stride=1, padding=1, bias=False)
    norm = nn.BatchNorm2d(2 * basic_channel_size)
    relu = nn.LeakyReLU(0.1)
    return [conv, norm, relu]
def define_halve_unit(basic_channel_size):
    """Define a 4x4 stride-2 expand convolution with norm and activation.

    Halves the spatial resolution while doubling the channel count.
    (The original docstring said 3x3, but the kernel size is 4.)
    """
    conv = nn.Conv2d(basic_channel_size, 2 * basic_channel_size, kernel_size=4,
                     stride=2, padding=1, bias=False)
    norm = nn.BatchNorm2d(2 * basic_channel_size)
    relu = nn.LeakyReLU(0.1)
    return [conv, norm, relu]


def define_detector_block(basic_channel_size):
    """Define a unit composed of a squeeze unit followed by an expand unit."""
    layers = []
    layers += define_squeeze_unit(basic_channel_size)
    layers += define_expand_unit(basic_channel_size)
    return layers
# -- train.py -------------------------------------------------------------
"""Train directional point detector."""
import math
import random
import torch
from torch.utils.data import DataLoader
import config
from data import ParkingSlotDataset
from detector import DirectionalPointDetector
from log import Logger
from utils import tensor2im, get_marking_points


def get_objective_from_labels(marking_points_batch, device):
    """Get regression objective and gradient for directional point detector.

    Args:
        marking_points_batch: per-sample lists of (x, y, direction) tuples,
            x/y normalized to [0, 1].
        device: torch device for the returned tensors.
    Returns:
        (objective, gradient) feature-map-shaped tensors; `gradient` is the
        mask passed to backward() so only supervised cells contribute.
    """
    batch_size = len(marking_points_batch)
    objective = torch.zeros(batch_size, config.NUM_FEATURE_MAP_CHANNEL,
                            config.FEATURE_MAP_SIZE, config.FEATURE_MAP_SIZE,
                            device=device)
    gradient = torch.zeros_like(objective)
    # Confidence is supervised everywhere: background cells train toward 0.
    gradient[:, 0].fill_(1.)
    fmap = config.FEATURE_MAP_SIZE
    for batch_idx, marking_points in enumerate(marking_points_batch):
        for marking_point in marking_points:
            # Use the configured feature-map size rather than hard-coded 16,
            # and clamp so a coordinate of exactly 1.0 cannot index out of
            # bounds (floor(1.0 * 16) == 16).
            col = min(math.floor(marking_point[0] * fmap), fmap - 1)
            row = min(math.floor(marking_point[1] * fmap), fmap - 1)
            # Confidence Regression
            objective[batch_idx, 0, row, col] = 1.
            # Offset Regression: fractional position inside the cell.
            offset_x = marking_point[0] * fmap - col
            offset_y = marking_point[1] * fmap - row
            objective[batch_idx, 1, row, col] = offset_x
            objective[batch_idx, 2, row, col] = offset_y
            # Direction Regression
            direction = marking_point[2]
            objective[batch_idx, 3, row, col] = math.cos(direction)
            objective[batch_idx, 4, row, col] = math.sin(direction)
            # Offsets/direction are only supervised where a point exists.
            gradient[batch_idx, 1:5, row, col].fill_(1.)
    return objective, gradient


def plot_random_prediction(logger, image, marking_points, prediction):
    """Plot the ground truth and prediction of a random sample in a batch."""
    rand_sample = random.randint(0, image.size(0) - 1)
    sampled_image = tensor2im(image[rand_sample])
    logger.plot_marking_points(sampled_image, marking_points[rand_sample],
                               win_name='gt_marking_points')
    # Re-convert: plot_marking_points draws on the PIL image in place.
    sampled_image = tensor2im(image[rand_sample])
    pred_points = get_marking_points(prediction[rand_sample], 0.01)
    logger.plot_marking_points(sampled_image, pred_points,
                               win_name='pred_marking_points')


def train_detector(args):
    """Train directional point detector."""
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    device = torch.device("cuda:" + str(args.gpu_id) if args.cuda else "cpu")

    dp_detector = DirectionalPointDetector(3, args.depth_factor, 5).to(device)
    if args.detector_weights is not None:
        dp_detector.load_state_dict(torch.load(args.detector_weights))

    optimizer = torch.optim.Adam(dp_detector.parameters(), lr=args.lr)
    if args.optimizer_weights is not None:
        optimizer.load_state_dict(torch.load(args.optimizer_weights))

    logger = Logger(['loss']) if args.enable_visdom else Logger()

    # Labels are variable-length lists, so collate batches into lists
    # instead of stacking tensors.
    data_loader = DataLoader(ParkingSlotDataset(args.dataset_directory),
                             batch_size=args.batch_size, shuffle=True,
                             collate_fn=lambda x: list(zip(*x)))
    for epoch_idx in range(args.num_epochs):
        for iter_idx, (image, marking_points) in enumerate(data_loader):
            image = torch.stack(image)
            image = image.to(device)

            optimizer.zero_grad()
            prediction = dp_detector(image)
            objective, gradient = get_objective_from_labels(marking_points,
                                                            device)
            # Squared error; the mask is applied through backward's grad
            # argument so unsupervised cells produce no gradient.
            loss = (prediction - objective) ** 2
            loss.backward(gradient)
            optimizer.step()

            logger.log(epoch=epoch_idx, iter=iter_idx,
                       loss=torch.sum(loss * gradient).item())
            if args.enable_visdom:
                logger.plot_curve()
                plot_random_prediction(logger, image, marking_points,
                                       prediction)
        torch.save(dp_detector.state_dict(),
                   'weights/dp_detector_%d.pth' % epoch_idx)
        torch.save(optimizer.state_dict(), 'weights/optimizer.pth')


if __name__ == '__main__':
    train_detector(config.get_parser_for_training().parse_args())
# -- utils.py -------------------------------------------------------------
"""Utility helpers: timing, non-maximum suppression, feature-map decoding."""
import math
import time
import torch
import numpy as np


class Timer(object):
    """Simple wall-clock timer."""

    def __init__(self):
        self.start_ticking = False
        self.start = 0.

    def tic(self):
        """Start timer."""
        self.start = time.time()
        self.start_ticking = True

    def toc(self):
        """Stop the timer and print the elapsed time."""
        # Guard: toc() without a matching tic() would report a meaningless
        # duration measured from self.start's initial 0.
        if not self.start_ticking:
            return
        duration = time.time() - self.start
        self.start_ticking = False
        print("Time elapsed:", duration, "s.")


def non_maximum_suppression(marking_points):
    """Perform non-maximum suppression on marking points.

    Each point is [x, y, direction, confidence] with x/y normalized; of any
    pair closer than the training-set minimum spacing, the lower-confidence
    point is dropped.
    """
    suppressed = [False] * len(marking_points)
    for i in range(len(marking_points) - 1):
        for j in range(i + 1, len(marking_points)):
            distx = marking_points[i][0] - marking_points[j][0]
            disty = marking_points[i][1] - marking_points[j][1]
            dist_square = distx ** 2 + disty ** 2
            # minimum distance in training set: 40.309
            # (40.309 / 600)^2 = 0.004513376
            if dist_square < 0.0045:
                idx = i if marking_points[i][3] < marking_points[j][3] else j
                suppressed[idx] = True
    if any(suppressed):
        return [point for point, removed in zip(marking_points, suppressed)
                if not removed]
    return marking_points


def get_marking_points(prediction, thresh):
    """Decode marking points from a predicted 5xHxW feature map.

    Returns NMS-filtered [x, y, angle, confidence] lists, x/y normalized.
    """
    assert isinstance(prediction, torch.Tensor)
    marking_points = []
    prediction = prediction.detach().cpu().numpy()
    for i in range(prediction.shape[1]):
        for j in range(prediction.shape[2]):
            if prediction[0, i, j] > thresh:
                # Cell index plus predicted in-cell offset, normalized.
                xval = (j + prediction[1, i, j]) / prediction.shape[2]
                yval = (i + prediction[2, i, j]) / prediction.shape[1]
                cos_value = prediction[3, i, j]
                sin_value = prediction[4, i, j]
                angle = math.atan2(sin_value, cos_value)
                marking_points.append([xval, yval, angle, prediction[0, i, j]])
    return non_maximum_suppression(marking_points)


def tensor2array(image_tensor, imtype=np.uint8):
    """Convert a float image tensor in [0, 1] to a numpy ndarray."""
    assert isinstance(image_tensor, torch.Tensor)
    image_numpy = image_tensor.detach().cpu().numpy() * 255.0
    return image_numpy.astype(imtype)


def tensor2im(image_tensor, imtype=np.uint8):
    """Convert a float CHW image tensor to a PIL Image."""
    from PIL import Image  # lazy: keep numeric helpers usable without Pillow
    image_numpy = np.transpose(tensor2array(image_tensor, imtype), (1, 2, 0))
    return Image.fromarray(image_numpy)