From 48296591cc72712fa6bad1779a05c889a3e1cc70 Mon Sep 17 00:00:00 2001
From: Teoge
Date: Fri, 20 Jul 2018 16:25:15 +0800
Subject: [PATCH] Initial Commit

---
 .gitignore   |   4 ++
 config.py    |  57 ++++++++++++++++++++++++++
 data.py      |  44 +++++++++++++++++++++
 detector.py  |  75 +++++++++++++++++++++++++++++++++++
 inference.py | 102 +++++++++++++++++++++++++++++++++++++++++++++++
 log.py       |  74 ++++++++++++++++++++++++++++++++++
 network.py   |  40 +++++++++++++++++++
 train.py     | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 utils.py     |  85 +++++++++++++++++++++++++++++++++++++++
 9 files changed, 593 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 config.py
 create mode 100644 data.py
 create mode 100644 detector.py
 create mode 100644 inference.py
 create mode 100644 log.py
 create mode 100644 network.py
 create mode 100644 train.py
 create mode 100644 utils.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c0db520
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.vscode
+.pylintrc
+__pycache__
+weights/*
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
--- /dev/null
+++ b/config.py
@@ -0,0 +1,57 @@
+"""Configure command line arguments."""
+import argparse
+
+
+INPUT_IMAGE_SIZE = 512
+# 0: confidence, 1: offset_x, 2: offset_y, 3: cos(direction), 4: sin(direction)
+NUM_FEATURE_MAP_CHANNEL = 5
+# image_size / 2^5 = 512 / 32 = 16
+FEATURE_MAP_SIZE = 16
+
+
+def add_common_arguments(parser):
+    """Add the arguments shared by training and inference to `parser`."""
+    parser.add_argument('--detector_weights',
+                        help="The weights of pretrained detector.")
+    parser.add_argument('--depth_factor', type=int, default=32,
+                        help="Depth factor.")
+    parser.add_argument('--disable_cuda', action='store_true',
+                        help="Disable CUDA.")
+    parser.add_argument('--gpu_id', type=int, default=1,
+                        help="Select which gpu to use.")
+
+
+def get_parser_for_training():
+    """Return the argument parser used by the training script."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--dataset_directory', required=True,
+                        help="The location of dataset.")
+    parser.add_argument('--optimizer_weights',
+                        help="The weights of optimizer.")
+    parser.add_argument('--batch_size', type=int, default=16,
+                        help="Batch size.")
+    parser.add_argument('--num_epochs', type=int, default=100,
+                        help="Number of epochs to train for.")
+    parser.add_argument('--lr', type=float, default=1e-3,
+                        help="The learning rate of back propagation.")
+    parser.add_argument('--enable_visdom', action='store_true',
+                        help="Enable Visdom to visualize training progress")
+    add_common_arguments(parser)
+    return parser
+
+
+def get_parser_for_inference():
+    """Return the argument parser used by the inference script."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mode', required=True, choices=['image', 'video'],
+                        help="Inference image or video.")
+    parser.add_argument('--video',
+                        help="Video path if you choose to inference video.")
+    parser.add_argument('--thresh', type=float, default=0.5,
+                        help="Detection threshold.")
+    parser.add_argument('--timing', action='store_true',
+                        help="Perform timing during inference.")
+    parser.add_argument('--save', action='store_true',
+                        help="Save detection result to file.")
+    add_common_arguments(parser)
+    return parser
diff --git a/data.py b/data.py
new file mode 100644
--- /dev/null
+++ b/data.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+"""Dataset of parking slot images and their marking point labels."""
+import os
+import os.path
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+
+
+class ParkingSlotDataset(Dataset):
+    """Parking slot dataset.
+
+    Every sample is a NAME.bmp image with a NAME.txt label file in `root`;
+    each non-blank label line holds the whitespace separated floats of one
+    marking point.
+    """
+    def __init__(self, root):
+        super(ParkingSlotDataset, self).__init__()
+        self.root = root
+        self.image_transform = transforms.Compose([
+            transforms.Resize((512, 512)),
+            transforms.ToTensor(),
+        ])
+        # Sort the names: os.listdir returns entries in arbitrary order.
+        self.sample_names = sorted(
+            os.path.splitext(file)[0] for file in os.listdir(root)
+            if file.endswith(".txt"))
+
+    def __getitem__(self, index):
+        """Return the image tensor and the marking point list of a sample."""
+        name = self.sample_names[index]
+        image = Image.open(os.path.join(self.root, name+'.bmp'))
+        # Force three channels, train.py builds the detector for 3 channels.
+        image = self.image_transform(image.convert('RGB'))
+        marking_points = []
+        with open(os.path.join(self.root, name+'.txt'), 'r') as file:
+            for line in file:
+                values = line.split()
+                if values:  # Skip blank lines such as a trailing newline.
+                    marking_points.append(tuple(float(n) for n in values))
+        return image, marking_points
+
+    def __len__(self):
+        return len(self.sample_names)
diff --git a/detector.py b/detector.py
new file mode 100644
--- /dev/null
+++ b/detector.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+"""Network definition of the directional marking point detector."""
+import torch
+from torch import nn
+from network import define_halve_unit, define_detector_block
+
+
+class YetAnotherDarknet(nn.modules.Module):
+    """Yet another darknet, imitating darknet-53 with depth of darknet-19.
+
+    Five stride 2 stages reduce the resolution by 32 in total while the
+    channel count grows from `depth_factor` to `32 * depth_factor`.
+    """
+    def __init__(self, input_channel_size, depth_factor):
+        super(YetAnotherDarknet, self).__init__()
+        layers = []
+        # 0
+        layers += [nn.Conv2d(input_channel_size, depth_factor, kernel_size=3,
+                             stride=1, padding=1, bias=False)]
+        layers += [nn.BatchNorm2d(depth_factor)]
+        layers += [nn.LeakyReLU(0.1)]
+        # 1
+        layers += define_halve_unit(depth_factor)
+        layers += define_detector_block(depth_factor)
+        # 2
+        depth_factor *= 2
+        layers += define_halve_unit(depth_factor)
+        layers += define_detector_block(depth_factor)
+        # 3
+        depth_factor *= 2
+        layers += define_halve_unit(depth_factor)
+        layers += define_detector_block(depth_factor)
+        layers += define_detector_block(depth_factor)
+        # 4
+        depth_factor *= 2
+        layers += define_halve_unit(depth_factor)
+        layers += define_detector_block(depth_factor)
+        layers += define_detector_block(depth_factor)
+        # 5
+        depth_factor *= 2
+        layers += define_halve_unit(depth_factor)
+        layers += define_detector_block(depth_factor)
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, *x):
+        """Return the feature map extracted from the image batch `x[0]`."""
+        return self.model(x[0])
+
+
+class DirectionalPointDetector(nn.modules.Module):
+    """Detector for point with direction."""
+    def __init__(self, input_channel_size, depth_factor, output_channel_size):
+        super(DirectionalPointDetector, self).__init__()
+        self.extract_feature = YetAnotherDarknet(input_channel_size,
+                                                 depth_factor)
+        layers = []
+        layers += define_detector_block(16 * depth_factor)
+        layers += define_detector_block(16 * depth_factor)
+        layers += [nn.Conv2d(32 * depth_factor, output_channel_size,
+                             kernel_size=1, stride=1, padding=0, bias=False)]
+        self.predict = nn.Sequential(*layers)
+
+    def forward(self, *x):
+        """Return the prediction map for the image batch `x[0]`.
+
+        The first three channels (confidence and offsets) are squashed to
+        (0, 1) by sigmoid, the remaining direction channels to (-1, 1) by tanh.
+        """
+        feature = self.extract_feature(x[0])
+        prediction = self.predict(feature)
+        point_pred, angle_pred = torch.split(prediction, 3, dim=1)
+        # torch.sigmoid / torch.tanh replace the deprecated nn.functional ones.
+        point_pred = torch.sigmoid(point_pred)
+        angle_pred = torch.tanh(angle_pred)
+        return torch.cat((point_pred, angle_pred), dim=1)
diff --git a/inference.py b/inference.py
new file mode 100644
--- /dev/null
+++ b/inference.py
@@ -0,0 +1,102 @@
+"""Inference demo of directional point detector."""
+import math
+import cv2 as cv
+import numpy as np
+import torch
+from torchvision.transforms import ToTensor
+import config
+from detector import DirectionalPointDetector
+from utils import get_marking_points, Timer
+
+
+def plot_marking_points(image, marking_points):
+    """Draw marking points as arrows on the image and show it.
+
+    Each marking point is (x, y, angle, ...) with x and y relative to the
+    image width and height and angle in radians.
+    """
+    height = image.shape[0]
+    width = image.shape[1]
+    for marking_point in marking_points:
+        p0_x = width * marking_point[0]
+        p0_y = height * marking_point[1]
+        p1_x = p0_x + 50 * math.cos(marking_point[2])
+        p1_y = p0_y + 50 * math.sin(marking_point[2])
+        p0_x = int(round(p0_x))
+        p0_y = int(round(p0_y))
+        p1_x = int(round(p1_x))
+        p1_y = int(round(p1_y))
+        cv.arrowedLine(image, (p0_x, p0_y), (p1_x, p1_y), (0, 0, 255))
+    cv.imshow('demo', image)
+    cv.waitKey(1)
+
+
+def preprocess_image(image):
+    """Preprocess a BGR numpy image from OpenCV to an RGB torch tensor."""
+    if image.shape[0] != 512 or image.shape[1] != 512:
+        image = cv.resize(image, (512, 512))
+    # OpenCV decodes to BGR but the training data is loaded by PIL as RGB.
+    image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
+    return torch.unsqueeze(ToTensor()(image), 0)
+
+
+def detect_video(detector, device, args):
+    """Demo for detecting video."""
+    timer = Timer()
+    input_video = cv.VideoCapture(args.video)
+    if not input_video.isOpened():
+        raise IOError("Cannot open video: %s" % args.video)
+    frame_width = int(input_video.get(cv.CAP_PROP_FRAME_WIDTH))
+    frame_height = int(input_video.get(cv.CAP_PROP_FRAME_HEIGHT))
+    output_video = cv.VideoWriter()
+    if args.save:
+        output_video.open('record.avi', cv.VideoWriter_fourcc(* 'MJPG'),
+                          input_video.get(cv.CAP_PROP_FPS),
+                          (frame_width, frame_height))
+    frame = np.empty([frame_height, frame_width, 3], dtype=np.uint8)
+    while input_video.read(frame)[0]:
+        if args.timing:
+            timer.tic()
+        prediction = detector(preprocess_image(frame).to(device))
+        if args.timing:
+            timer.toc()
+        pred_points = get_marking_points(prediction[0], args.thresh)
+        plot_marking_points(frame, pred_points)
+        if args.save:
+            output_video.write(frame)
+    input_video.release()
+    output_video.release()
+
+
+def detect_image(detector, device, args):
+    """Demo for detecting images."""
+    image_file = input('Enter image file path: ')
+    image = cv.imread(image_file)
+    if image is None:  # cv.imread returns None instead of raising.
+        raise IOError("Cannot read image: %s" % image_file)
+    prediction = detector(preprocess_image(image).to(device))
+    pred_points = get_marking_points(prediction[0], args.thresh)
+    plot_marking_points(image, pred_points)
+    # Keep the window open until a key is pressed.
+    cv.waitKey(0)
+
+
+def inference_detector(args):
+    """Inference demo of directional point detector."""
+    args.cuda = not args.disable_cuda and torch.cuda.is_available()
+    device = torch.device("cuda:" + str(args.gpu_id) if args.cuda else "cpu")
+    dp_detector = DirectionalPointDetector(3, args.depth_factor, 5).to(device)
+    # map_location lets weights saved on a GPU be loaded on any device.
+    dp_detector.load_state_dict(torch.load(args.detector_weights,
+                                           map_location=device))
+    # Use the running statistics of batch norm and skip autograd bookkeeping.
+    dp_detector.eval()
+    with torch.no_grad():
+        if args.mode == "image":
+            detect_image(dp_detector, device, args)
+        elif args.mode == "video":
+            detect_video(dp_detector, device, args)
+
+
+if __name__ == '__main__':
+    inference_detector(config.get_parser_for_inference().parse_args())
diff --git a/log.py b/log.py
new file mode 100644
--- /dev/null
+++ b/log.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""Console and Visdom logging helpers for training."""
+import math
+import numpy as np
+from visdom import Visdom
+from PIL import ImageDraw
+
+
+class Logger():
+    """Logger for training.
+
+    Values are always printed; when `curve_names` is given those values are
+    also plotted as curves on a Visdom server, which must be reachable.
+    """
+    def __init__(self, curve_names=None):
+        self.curve_names = curve_names
+        self.vis = None
+        self.curve_y = None
+        self.curve_x_start = 0
+        self.curve_x_end = 0
+        if curve_names:
+            self.vis = Visdom()
+            # Not an assert: the check must survive `python -O`.
+            if not self.vis.check_connection():
+                raise RuntimeError("Cannot connect to the Visdom server.")
+
+    def log(self, **kwargs):
+        """Print the keyword values and record the ones that are curves."""
+        print("##############################################################")
+        for key, value in kwargs.items():
+            print(key, value, sep='\t')
+        if not self.curve_names:
+            return
+        curve_step = np.array([kwargs[cn] for cn in self.curve_names])
+        if self.curve_y is None:
+            self.curve_y = curve_step
+        else:
+            # np.row_stack is a deprecated alias (NumPy 2.0) of np.vstack.
+            self.curve_y = np.vstack((self.curve_y, curve_step))
+        self.curve_x_end = self.curve_x_end + 1
+
+    def plot_curve(self):
+        """Plot the values recorded since the last plot on Visdom.
+
+        Nothing is plotted until at least two steps have been recorded.
+        """
+        if (self.curve_x_end - self.curve_x_start < 2 or not self.curve_names):
+            return
+        if self.curve_x_start == 0:
+            update_opt = None
+        else:
+            update_opt = 'append'
+        curve_x = np.arange(self.curve_x_start, self.curve_x_end)
+        curve_x = np.transpose(np.tile(curve_x, (len(self.curve_names), 1)))
+        self.vis.line(Y=self.curve_y, X=curve_x, win='loss', update=update_opt,
+                      opts=dict(showlegend=True, legend=self.curve_names))
+        self.curve_x_start = self.curve_x_end
+        self.curve_y = None
+
+    def plot_marking_points(self, image, marking_points, win_name='mk_points'):
+        """Draw marking points on a PIL image and show it on Visdom."""
+        if self.vis is None:
+            return
+        width, height = image.size
+        draw = ImageDraw.Draw(image)
+        for point in marking_points:
+            p0_x = width * point[0]
+            p0_y = height * point[1]
+            p1_x = p0_x + 50*math.cos(point[2])
+            p1_y = p0_y + 50*math.sin(point[2])
+            draw.line((p0_x, p0_y, p1_x, p1_y), fill=(255, 0, 0))
+        image = np.asarray(image, dtype="uint8")
+        image = np.transpose(image, (2, 0, 1))
+        self.vis.image(image, win=win_name)
diff --git a/network.py b/network.py
new file mode 100644
--- /dev/null
+++ b/network.py
@@ -0,0 +1,40 @@
+"""Common network structure unit definition."""
+from torch import nn
+
+
+def define_squeeze_unit(basic_channel_size):
+    """Define a 1x1 squeeze convolution with norm and activation."""
+    conv = nn.Conv2d(2 * basic_channel_size, basic_channel_size, kernel_size=1,
+                     stride=1, padding=0, bias=False)
+    norm = nn.BatchNorm2d(basic_channel_size)
+    relu = nn.LeakyReLU(0.1)
+    layers = [conv, norm, relu]
+    return layers
+
+
+def define_expand_unit(basic_channel_size):
+    """Define a 3x3 expand convolution with norm and activation."""
+    conv = nn.Conv2d(basic_channel_size, 2 * basic_channel_size, kernel_size=3,
+                     stride=1, padding=1, bias=False)
+    norm = nn.BatchNorm2d(2 * basic_channel_size)
+    relu = nn.LeakyReLU(0.1)
+    layers = [conv, norm, relu]
+    return layers
+
+
+def define_halve_unit(basic_channel_size):
+    """Define a 4x4 expand stride 2 convolution with norm and activation."""
+    conv = nn.Conv2d(basic_channel_size, 2 * basic_channel_size, kernel_size=4,
+                     stride=2, padding=1, bias=False)
+    norm = nn.BatchNorm2d(2 * basic_channel_size)
+    relu = nn.LeakyReLU(0.1)
+    layers = [conv, norm, relu]
+    return layers
+
+
+def define_detector_block(basic_channel_size):
+    """Define a unit composed of a squeeze unit and an expand unit."""
+    layers = []
+    layers += define_squeeze_unit(basic_channel_size)
+    layers += define_expand_unit(basic_channel_size)
+    return layers
diff --git a/train.py b/train.py
new file mode 100644
--- /dev/null
+++ b/train.py
@@ -0,0 +1,112 @@
+"""Train directional point detector."""
+import math
+import os
+import random
+import torch
+from torch.utils.data import DataLoader
+import config
+from data import ParkingSlotDataset
+from detector import DirectionalPointDetector
+from log import Logger
+from utils import tensor2im, get_marking_points
+
+
+def get_objective_from_labels(marking_points_batch, device):
+    """Get regression objective and gradient for directional point detector.
+
+    Each marking point (x, y, direction, ...), x and y assumed to be in
+    [0, 1], fills the feature map cell it falls into. The gradient is a mask
+    that is 1 for the confidence of every cell but only for the other
+    channels of cells that hold a marking point.
+    """
+    batch_size = len(marking_points_batch)
+    map_size = config.FEATURE_MAP_SIZE
+    objective = torch.zeros(batch_size, config.NUM_FEATURE_MAP_CHANNEL,
+                            map_size, map_size, device=device)
+    gradient = torch.zeros_like(objective)
+    gradient[:, 0].fill_(1.)
+    for batch_idx, marking_points in enumerate(marking_points_batch):
+        for marking_point in marking_points:
+            # Clamp so that a coordinate of exactly 1.0 stays inside the map.
+            col = min(math.floor(marking_point[0] * map_size), map_size - 1)
+            row = min(math.floor(marking_point[1] * map_size), map_size - 1)
+            # Confidence Regression
+            objective[batch_idx, 0, row, col] = 1.
+            # Offset Regression
+            offset_x = marking_point[0]*map_size - col
+            offset_y = marking_point[1]*map_size - row
+            objective[batch_idx, 1, row, col] = offset_x
+            objective[batch_idx, 2, row, col] = offset_y
+            # Direction Regression
+            direction = marking_point[2]
+            objective[batch_idx, 3, row, col] = math.cos(direction)
+            objective[batch_idx, 4, row, col] = math.sin(direction)
+            # Assign Gradient
+            gradient[batch_idx, 1:5, row, col].fill_(1.)
+    return objective, gradient
+
+
+def plot_random_prediction(logger, image, marking_points, prediction):
+    """Plot the ground truth and prediction of a random sample in a batch."""
+    rand_sample = random.randint(0, image.size(0)-1)
+    sampled_image = tensor2im(image[rand_sample])
+    logger.plot_marking_points(sampled_image, marking_points[rand_sample],
+                               win_name='gt_marking_points')
+    sampled_image = tensor2im(image[rand_sample])
+    pred_points = get_marking_points(prediction[rand_sample], 0.01)
+    logger.plot_marking_points(sampled_image, pred_points,
+                               win_name='pred_marking_points')
+
+
+def train_detector(args):
+    """Train directional point detector."""
+    args.cuda = not args.disable_cuda and torch.cuda.is_available()
+    device = torch.device("cuda:"+str(args.gpu_id) if args.cuda else "cpu")
+
+    dp_detector = DirectionalPointDetector(3, args.depth_factor, 5).to(device)
+    if args.detector_weights is not None:
+        dp_detector.load_state_dict(torch.load(args.detector_weights,
+                                               map_location=device))
+
+    optimizer = torch.optim.Adam(dp_detector.parameters(), lr=args.lr)
+    if args.optimizer_weights is not None:
+        optimizer.load_state_dict(torch.load(args.optimizer_weights,
+                                             map_location=device))
+
+    if args.enable_visdom:
+        logger = Logger(['loss'])
+    else:
+        logger = Logger()
+
+    # Create the output directory up front, torch.save does not create it.
+    os.makedirs('weights', exist_ok=True)
+    data_loader = DataLoader(ParkingSlotDataset(args.dataset_directory),
+                             batch_size=args.batch_size, shuffle=True,
+                             collate_fn=lambda x: list(zip(*x)))
+    for epoch_idx in range(args.num_epochs):
+        for iter_idx, (image, marking_points) in enumerate(data_loader):
+            image = torch.stack(image)
+            image = image.to(device)
+
+            optimizer.zero_grad()
+            prediction = dp_detector(image)
+            objective, gradient = get_objective_from_labels(marking_points,
+                                                            device)
+            loss = (prediction - objective) ** 2
+            # The mask as upstream gradient back propagates sum(loss * mask).
+            loss.backward(gradient)
+            optimizer.step()
+
+            logger.log(epoch=epoch_idx, iter=iter_idx,
+                       loss=torch.sum(loss * gradient).item())
+            if args.enable_visdom:
+                logger.plot_curve()
+                plot_random_prediction(logger, image, marking_points,
+                                       prediction)
+        torch.save(dp_detector.state_dict(),
+                   'weights/dp_detector_%d.pth' % epoch_idx)
+        torch.save(optimizer.state_dict(), 'weights/optimizer.pth')
+
+
+if __name__ == '__main__':
+    train_detector(config.get_parser_for_training().parse_args())
diff --git a/utils.py b/utils.py
new file mode 100644
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+"""Timing, post-processing and image conversion utilities."""
+import math
+import time
+import torch
+import numpy as np
+from PIL import Image
+
+
+class Timer(object):
+    """Timer."""
+    def __init__(self):
+        self.start_ticking = False
+        self.start = 0.
+
+    def tic(self):
+        """Start timer."""
+        self.start = time.time()
+        self.start_ticking = True
+
+    def toc(self):
+        """End timer and print the seconds elapsed since the last tic."""
+        duration = time.time() - self.start
+        self.start_ticking = False
+        print("Time elapsed:", duration, "s.")
+
+
+def non_maximum_suppression(marking_points):
+    """Perform non-maximum suppression on marking points.
+
+    Of two points closer than the distance threshold, the one with the
+    lower confidence (index 3) is dropped.
+    """
+    suppressed = [False] * len(marking_points)
+    for i in range(len(marking_points) - 1):
+        for j in range(i + 1, len(marking_points)):
+            distx = marking_points[i][0] - marking_points[j][0]
+            disty = marking_points[i][1] - marking_points[j][1]
+            dist_square = distx ** 2 + disty ** 2
+            # minimum distance in training set: 40.309
+            # (40.309 / 600)^2 = 0.004513376
+            if dist_square < 0.0045:
+                idx = i if marking_points[i][3] < marking_points[j][3] else j
+                suppressed[idx] = True
+    if any(suppressed):
+        new_marking_points = []
+        for i, supres in enumerate(suppressed):
+            if not supres:
+                new_marking_points.append(marking_points[i])
+        return new_marking_points
+    return marking_points
+
+
+def get_marking_points(prediction, thresh):
+    """Get marking points from the predicted feature map of one image.
+
+    Returns a list of [x, y, angle, confidence] with x and y relative to the
+    image size, for the cells whose confidence exceeds `thresh`.
+    """
+    assert isinstance(prediction, torch.Tensor)
+    marking_points = []
+    prediction = prediction.detach().cpu().numpy()
+    for i in range(prediction.shape[1]):
+        for j in range(prediction.shape[2]):
+            if prediction[0, i, j] > thresh:
+                xval = (j + prediction[1, i, j]) / prediction.shape[2]
+                yval = (i + prediction[2, i, j]) / prediction.shape[1]
+                cos_value = prediction[3, i, j]
+                sin_value = prediction[4, i, j]
+                angle = math.atan2(sin_value, cos_value)
+                marking_points.append([xval, yval, angle, prediction[0, i, j]])
+    return non_maximum_suppression(marking_points)
+
+
+def tensor2array(image_tensor, imtype=np.uint8):
+    """Convert float image tensor to numpy ndarray"""
+    assert isinstance(image_tensor, torch.Tensor)
+    image_numpy = (image_tensor.detach().cpu().numpy()) * 255.0
+    return image_numpy.astype(imtype)
+
+
+def tensor2im(image_tensor, imtype=np.uint8):
+    """Convert float image tensor to PIL Image"""
+    image_numpy = np.transpose(tensor2array(image_tensor, imtype), (1, 2, 0))
+    return Image.fromarray(image_numpy)