@@ -0,0 +1,4 @@
.vscode
.pylintrc
__pycache__
weights/*
@@ -0,0 +1,57 @@
"""Configure arguments."""
import argparse

INPUT_IMAGE_SIZE = 512
# 0: confidence, 1: offset_x, 2: offset_y, 3: cos(direction), 4: sin(direction)
NUM_FEATURE_MAP_CHANNEL = 5
# image_size / 2^5 = 512 / 32 = 16
FEATURE_MAP_SIZE = 16


def add_common_arguments(parser):
    """Add common arguments for training and inference."""
    parser.add_argument('--detector_weights',
                        help="The weights of pretrained detector.")
    parser.add_argument('--depth_factor', type=int, default=32,
                        help="Depth factor.")
    parser.add_argument('--disable_cuda', action='store_true',
                        help="Disable CUDA.")
    parser.add_argument('--gpu_id', type=int, default=1,
                        help="Select which GPU to use.")


def get_parser_for_training():
    """Return argument parser for training."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_directory', required=True,
                        help="The location of the dataset.")
    parser.add_argument('--optimizer_weights',
                        help="The weights of the optimizer.")
    parser.add_argument('--batch_size', type=int, default=16,
                        help="Batch size.")
    parser.add_argument('--num_epochs', type=int, default=100,
                        help="Number of epochs to train for.")
    parser.add_argument('--lr', type=float, default=1e-3,
                        help="The learning rate of back propagation.")
    parser.add_argument('--enable_visdom', action='store_true',
                        help="Enable Visdom to visualize training progress.")
    add_common_arguments(parser)
    return parser


def get_parser_for_inference():
    """Return argument parser for inference."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', required=True, choices=['image', 'video'],
                        help="Run inference on an image or a video.")
    parser.add_argument('--video',
                        help="Video path when running inference on a video.")
    parser.add_argument('--thresh', type=float, default=0.5,
                        help="Detection threshold.")
    parser.add_argument('--timing', action='store_true',
                        help="Perform timing during inference.")
    parser.add_argument('--save', action='store_true',
                        help="Save detection result to file.")
    add_common_arguments(parser)
    return parser
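For reference, a minimal sketch of how these parsers are meant to be consumed; the dataset path below is a made-up placeholder, and `parse_args` is given an explicit list instead of reading `sys.argv`:

```python
import config

# Training arguments with one default overridden.
args = config.get_parser_for_training().parse_args(
    ['--dataset_directory', '/path/to/ps_dataset', '--batch_size', '8'])
print(args.batch_size)  # 8
print(args.lr)          # 0.001 (default)
```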
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
import os
import os.path
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


class ParkingSlotDataset(Dataset):
    """Parking slot dataset."""

    def __init__(self, root):
        super(ParkingSlotDataset, self).__init__()
        self.root = root
        self.sample_names = []
        self.image_transform = transforms.Compose([
            transforms.Resize((512, 512)),
            transforms.ToTensor(),
        ])
        # Every .txt label file names one sample; the image shares the
        # basename with a .bmp extension.
        for file in os.listdir(root):
            if file.endswith(".txt"):
                self.sample_names.append(os.path.splitext(file)[0])

    def __getitem__(self, index):
        name = self.sample_names[index]
        image = Image.open(os.path.join(self.root, name + '.bmp'))
        image = self.image_transform(image)
        marking_points = []
        with open(os.path.join(self.root, name + '.txt'), 'r') as file:
            # Each line holds one marking point: x, y, direction.
            for line in file:
                marking_point = tuple(float(n) for n in line.split())
                marking_points.append(marking_point)
        return image, marking_points

    def __len__(self):
        return len(self.sample_names)
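Because each image can contain a different number of marking points, the default batch collation cannot stack the labels; train.py works around this with a zip-based `collate_fn`. A minimal sketch (the dataset path is a placeholder):

```python
from torch.utils.data import DataLoader
from data import ParkingSlotDataset

# Variable-length label lists can't be stacked into one tensor, so the
# collate_fn simply transposes the batch into (images, marking_points).
data_loader = DataLoader(ParkingSlotDataset('/path/to/ps_dataset'),
                         batch_size=4, shuffle=True,
                         collate_fn=lambda x: list(zip(*x)))
images, marking_points = next(iter(data_loader))
# images: tuple of 4 tensors, each of shape (3, 512, 512)
# marking_points: tuple of 4 lists of (x, y, direction) tuples
```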
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
import torch
from torch import nn
from network import define_halve_unit, define_detector_block


class YetAnotherDarknet(nn.Module):
    """Yet another darknet, imitating darknet-53 with depth of darknet-19."""

    def __init__(self, input_channel_size, depth_factor):
        super(YetAnotherDarknet, self).__init__()
        layers = []
        # 0
        layers += [nn.Conv2d(input_channel_size, depth_factor, kernel_size=3,
                             stride=1, padding=1, bias=False)]
        layers += [nn.BatchNorm2d(depth_factor)]
        layers += [nn.LeakyReLU(0.1)]
        # 1
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        # 2
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        # 3
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        layers += define_detector_block(depth_factor)
        # 4
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        layers += define_detector_block(depth_factor)
        # 5
        depth_factor *= 2
        layers += define_halve_unit(depth_factor)
        layers += define_detector_block(depth_factor)
        self.model = nn.Sequential(*layers)

    def forward(self, *x):
        return self.model(x[0])


class DirectionalPointDetector(nn.Module):
    """Detector for point with direction."""

    def __init__(self, input_channel_size, depth_factor, output_channel_size):
        super(DirectionalPointDetector, self).__init__()
        self.extract_feature = YetAnotherDarknet(input_channel_size,
                                                 depth_factor)
        layers = []
        layers += define_detector_block(16 * depth_factor)
        layers += define_detector_block(16 * depth_factor)
        layers += [nn.Conv2d(32 * depth_factor, output_channel_size,
                             kernel_size=1, stride=1, padding=0, bias=False)]
        self.predict = nn.Sequential(*layers)

    def forward(self, *x):
        feature = self.extract_feature(x[0])
        prediction = self.predict(feature)
        # Confidence and offsets live in [0, 1]; cos/sin live in [-1, 1].
        point_pred, angle_pred = torch.split(prediction, 3, dim=1)
        point_pred = torch.sigmoid(point_pred)
        angle_pred = torch.tanh(angle_pred)
        return torch.cat((point_pred, angle_pred), dim=1)
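A quick shape check, as a sketch: with the values used elsewhere in this PR (3 input channels, depth_factor 32, 5 output channels), a 512x512 input should come out as a 16x16 feature map after the five stride-2 halve units.

```python
import torch
from detector import DirectionalPointDetector

detector = DirectionalPointDetector(3, 32, 5)
detector.eval()  # keep BatchNorm out of train mode for a batch of one
dummy = torch.zeros(1, 3, 512, 512)
prediction = detector(dummy)
print(prediction.shape)  # torch.Size([1, 5, 16, 16]) -- 512 / 2^5 = 16
```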
@@ -0,0 +1,85 @@
"""Inference demo of directional point detector."""
import math
import cv2 as cv
import numpy as np
import torch
from torchvision.transforms import ToTensor
import config
from detector import DirectionalPointDetector
from utils import get_marking_points, Timer


def plot_marking_points(image, marking_points):
    """Plot marking points on the image and show."""
    height = image.shape[0]
    width = image.shape[1]
    for marking_point in marking_points:
        # Marking points are normalized to [0, 1]; scale back to pixels.
        p0_x = width * marking_point[0]
        p0_y = height * marking_point[1]
        p1_x = p0_x + 50 * math.cos(marking_point[2])
        p1_y = p0_y + 50 * math.sin(marking_point[2])
        p0_x = int(round(p0_x))
        p0_y = int(round(p0_y))
        p1_x = int(round(p1_x))
        p1_y = int(round(p1_y))
        cv.arrowedLine(image, (p0_x, p0_y), (p1_x, p1_y), (0, 0, 255))
    cv.imshow('demo', image)
    cv.waitKey(1)


def preprocess_image(image):
    """Preprocess numpy image to torch tensor."""
    if image.shape[0] != 512 or image.shape[1] != 512:
        image = cv.resize(image, (512, 512))
    return torch.unsqueeze(ToTensor()(image), 0)


def detect_video(detector, device, args):
    """Demo for detecting video."""
    timer = Timer()
    input_video = cv.VideoCapture(args.video)
    frame_width = int(input_video.get(cv.CAP_PROP_FRAME_WIDTH))
    frame_height = int(input_video.get(cv.CAP_PROP_FRAME_HEIGHT))
    output_video = cv.VideoWriter()
    if args.save:
        output_video.open('record.avi', cv.VideoWriter_fourcc(*'MJPG'),
                          input_video.get(cv.CAP_PROP_FPS),
                          (frame_width, frame_height))
    # Preallocate the frame buffer; VideoCapture.read fills it in place.
    frame = np.empty([frame_height, frame_width, 3], dtype=np.uint8)
    while input_video.read(frame)[0]:
        if args.timing:
            timer.tic()
        prediction = detector(preprocess_image(frame).to(device))
        if args.timing:
            timer.toc()
        pred_points = get_marking_points(prediction[0], args.thresh)
        plot_marking_points(frame, pred_points)
        if args.save:
            output_video.write(frame)
    input_video.release()
    output_video.release()


def detect_image(detector, device, args):
    """Demo for detecting images."""
    image_file = input('Enter image file path: ')
    image = cv.imread(image_file)
    prediction = detector(preprocess_image(image).to(device))
    pred_points = get_marking_points(prediction[0], args.thresh)
    plot_marking_points(image, pred_points)


def inference_detector(args):
    """Inference demo of directional point detector."""
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    device = torch.device("cuda:" + str(args.gpu_id) if args.cuda else "cpu")
    dp_detector = DirectionalPointDetector(3, args.depth_factor, 5).to(device)
    # map_location lets GPU-trained weights load on a CPU-only machine.
    dp_detector.load_state_dict(torch.load(args.detector_weights,
                                           map_location=device))
    # Switch BatchNorm layers to evaluation mode before inference.
    dp_detector.eval()
    if args.mode == "image":
        detect_image(dp_detector, device, args)
    elif args.mode == "video":
        detect_video(dp_detector, device, args)


if __name__ == '__main__':
    inference_detector(config.get_parser_for_inference().parse_args())
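As a usage sketch, the demo can also be driven programmatically by handing the parser an explicit argument list. This assumes the module above is saved as inference.py, and the weight path is only a plausible placeholder (train.py with the default 100 epochs would last write dp_detector_99.pth):

```python
import config
from inference import inference_detector

# Equivalent to: python inference.py --mode image --detector_weights ...
args = config.get_parser_for_inference().parse_args(
    ['--mode', 'image', '--detector_weights', 'weights/dp_detector_99.pth',
     '--thresh', '0.5', '--disable_cuda'])
inference_detector(args)
```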
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
import math
import numpy as np
from visdom import Visdom
from PIL import ImageDraw


class Logger():
    """Logger for training."""

    def __init__(self, curve_names=None):
        self.curve_names = curve_names
        if curve_names:
            self.vis = Visdom()
            assert self.vis.check_connection()
        self.curve_y = None
        self.curve_x_start = 0
        self.curve_x_end = 0

    def log(self, **kwargs):
        """Log and print the information."""
        print("##############################################################")
        for key, value in kwargs.items():
            print(key, value, sep='\t')
        if not self.curve_names:
            return
        # Collect one row of curve values per logging step.
        curve_step = np.array([kwargs[cn] for cn in self.curve_names])
        if self.curve_y is None:
            self.curve_y = curve_step
        else:
            self.curve_y = np.row_stack((self.curve_y, curve_step))
        self.curve_x_end = self.curve_x_end + 1

    def plot_curve(self):
        """Plot curve on Visdom."""
        if self.curve_x_end - self.curve_x_start < 2 or not self.curve_names:
            return
        # The first call creates the window; later calls append to it.
        update_opt = None if self.curve_x_start == 0 else 'append'
        curve_x = np.arange(self.curve_x_start, self.curve_x_end)
        curve_x = np.transpose(np.tile(curve_x, (len(self.curve_names), 1)))
        self.vis.line(Y=self.curve_y, X=curve_x, win='loss', update=update_opt,
                      opts=dict(showlegend=True, legend=self.curve_names))
        self.curve_x_start = self.curve_x_end
        self.curve_y = None

    def plot_marking_points(self, image, marking_points, win_name='mk_points'):
        """Plot marking points on Visdom."""
        width, height = image.size
        draw = ImageDraw.Draw(image)
        for point in marking_points:
            p0_x = width * point[0]
            p0_y = height * point[1]
            p1_x = p0_x + 50 * math.cos(point[2])
            p1_y = p0_y + 50 * math.sin(point[2])
            draw.line((p0_x, p0_y, p1_x, p1_y), fill=(255, 0, 0))
        # Visdom expects CHW uint8 images, PIL gives HWC.
        image = np.asarray(image, dtype="uint8")
        image = np.transpose(image, (2, 0, 1))
        self.vis.image(image, win=win_name)
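A minimal sketch of the intended flow, assuming a Visdom server is already running (python -m visdom.server) and using made-up loss values:

```python
from log import Logger

# Passing curve names switches on Visdom plotting; Logger() alone just prints.
logger = Logger(['loss'])
for step in range(10):
    logger.log(epoch=0, iter=step, loss=1.0 / (step + 1))
logger.plot_curve()  # pushes the accumulated points to the 'loss' window
```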
@@ -0,0 +1,40 @@
"""Common network structure unit definition."""
from torch import nn


def define_squeeze_unit(basic_channel_size):
    """Define a 1x1 squeeze convolution with norm and activation."""
    conv = nn.Conv2d(2 * basic_channel_size, basic_channel_size, kernel_size=1,
                     stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(basic_channel_size)
    relu = nn.LeakyReLU(0.1)
    layers = [conv, norm, relu]
    return layers


def define_expand_unit(basic_channel_size):
    """Define a 3x3 expand convolution with norm and activation."""
    conv = nn.Conv2d(basic_channel_size, 2 * basic_channel_size, kernel_size=3,
                     stride=1, padding=1, bias=False)
    norm = nn.BatchNorm2d(2 * basic_channel_size)
    relu = nn.LeakyReLU(0.1)
    layers = [conv, norm, relu]
    return layers


def define_halve_unit(basic_channel_size):
    """Define a 4x4 stride-2 expand convolution with norm and activation."""
    conv = nn.Conv2d(basic_channel_size, 2 * basic_channel_size, kernel_size=4,
                     stride=2, padding=1, bias=False)
    norm = nn.BatchNorm2d(2 * basic_channel_size)
    relu = nn.LeakyReLU(0.1)
    layers = [conv, norm, relu]
    return layers


def define_detector_block(basic_channel_size):
    """Define a unit composed of a squeeze unit followed by an expand unit."""
    layers = []
    layers += define_squeeze_unit(basic_channel_size)
    layers += define_expand_unit(basic_channel_size)
    return layers
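A sketch to confirm the halving arithmetic: with kernel 4, stride 2, padding 1, an even input size H maps to (H + 2 - 4) / 2 + 1 = H / 2, while the channel count doubles.

```python
import torch
from torch import nn
from network import define_halve_unit

halve = nn.Sequential(*define_halve_unit(32))
halve.eval()  # avoid BatchNorm train-mode statistics on a single sample
feature = torch.zeros(1, 32, 64, 64)
print(halve(feature).shape)  # torch.Size([1, 64, 32, 32])
```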
@@ -0,0 +1,99 @@
"""Train directional point detector."""
import math
import random
import torch
from torch.utils.data import DataLoader
import config
from data import ParkingSlotDataset
from detector import DirectionalPointDetector
from log import Logger
from utils import tensor2im, get_marking_points


def get_objective_from_labels(marking_points_batch, device):
    """Get regression objective and gradient for directional point detector."""
    batch_size = len(marking_points_batch)
    objective = torch.zeros(batch_size, config.NUM_FEATURE_MAP_CHANNEL,
                            config.FEATURE_MAP_SIZE, config.FEATURE_MAP_SIZE,
                            device=device)
    gradient = torch.zeros_like(objective)
    # Confidence is supervised at every cell; the remaining channels only
    # at cells that actually contain a marking point.
    gradient[:, 0].fill_(1.)
    for batch_idx, marking_points in enumerate(marking_points_batch):
        for marking_point in marking_points:
            col = math.floor(marking_point[0] * config.FEATURE_MAP_SIZE)
            row = math.floor(marking_point[1] * config.FEATURE_MAP_SIZE)
            # Confidence Regression
            objective[batch_idx, 0, row, col] = 1.
            # Offset Regression
            offset_x = marking_point[0] * config.FEATURE_MAP_SIZE - col
            offset_y = marking_point[1] * config.FEATURE_MAP_SIZE - row
            objective[batch_idx, 1, row, col] = offset_x
            objective[batch_idx, 2, row, col] = offset_y
            # Direction Regression
            direction = marking_point[2]
            objective[batch_idx, 3, row, col] = math.cos(direction)
            objective[batch_idx, 4, row, col] = math.sin(direction)
            # Assign Gradient
            gradient[batch_idx, 1:5, row, col].fill_(1.)
    return objective, gradient


def plot_random_prediction(logger, image, marking_points, prediction):
    """Plot the ground truth and prediction of a random sample in a batch."""
    rand_sample = random.randint(0, image.size(0) - 1)
    sampled_image = tensor2im(image[rand_sample])
    logger.plot_marking_points(sampled_image, marking_points[rand_sample],
                               win_name='gt_marking_points')
    sampled_image = tensor2im(image[rand_sample])
    pred_points = get_marking_points(prediction[rand_sample], 0.01)
    logger.plot_marking_points(sampled_image, pred_points,
                               win_name='pred_marking_points')


def train_detector(args):
    """Train directional point detector."""
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    device = torch.device("cuda:" + str(args.gpu_id) if args.cuda else "cpu")
    dp_detector = DirectionalPointDetector(3, args.depth_factor, 5).to(device)
    if args.detector_weights is not None:
        dp_detector.load_state_dict(torch.load(args.detector_weights))
    optimizer = torch.optim.Adam(dp_detector.parameters(), lr=args.lr)
    if args.optimizer_weights is not None:
        optimizer.load_state_dict(torch.load(args.optimizer_weights))
    logger = Logger(['loss']) if args.enable_visdom else Logger()
    data_loader = DataLoader(ParkingSlotDataset(args.dataset_directory),
                             batch_size=args.batch_size, shuffle=True,
                             collate_fn=lambda x: list(zip(*x)))
    for epoch_idx in range(args.num_epochs):
        for iter_idx, (image, marking_points) in enumerate(data_loader):
            image = torch.stack(image)
            image = image.to(device)
            optimizer.zero_grad()
            prediction = dp_detector(image)
            objective, gradient = get_objective_from_labels(marking_points,
                                                            device)
            # Element-wise squared error; backward() with the gradient tensor
            # masks out the unsupervised cells.
            loss = (prediction - objective) ** 2
            loss.backward(gradient)
            optimizer.step()
            logger.log(epoch=epoch_idx, iter=iter_idx,
                       loss=torch.sum(loss * gradient).item())
            if args.enable_visdom:
                logger.plot_curve()
                plot_random_prediction(logger, image, marking_points,
                                       prediction)
        torch.save(dp_detector.state_dict(),
                   'weights/dp_detector_%d.pth' % epoch_idx)
        torch.save(optimizer.state_dict(), 'weights/optimizer.pth')


if __name__ == '__main__':
    train_detector(config.get_parser_for_training().parse_args())
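A note on the loss: calling loss.backward(gradient) on the non-scalar element-wise loss computes the vector-Jacobian product, which is equivalent to back-propagating sum(loss * gradient). The gradient tensor therefore acts as a per-element supervision mask. A self-contained sketch of the equivalence, with random stand-in tensors:

```python
import torch

prediction = torch.rand(2, 5, 16, 16, requires_grad=True)
objective = torch.rand(2, 5, 16, 16)
mask = (torch.rand(2, 5, 16, 16) > 0.5).float()

# Variant 1: vector-Jacobian product with the mask, as in train_detector.
loss = (prediction - objective) ** 2
loss.backward(mask)
grad_a = prediction.grad.clone()

# Variant 2: explicit masked scalar loss.
prediction.grad = None
((prediction - objective) ** 2 * mask).sum().backward()
grad_b = prediction.grad

print(torch.allclose(grad_a, grad_b))  # True
```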
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
import math
import time
import torch
import numpy as np
from PIL import Image


class Timer(object):
    """Timer."""

    def __init__(self):
        self.start_ticking = False
        self.start = 0.

    def tic(self):
        """Start timer."""
        self.start = time.time()
        self.start_ticking = True

    def toc(self):
        """End timer."""
        duration = time.time() - self.start
        self.start_ticking = False
        print("Time elapsed:", duration, "s.")


def non_maximum_suppression(marking_points):
    """Perform non-maximum suppression on marking points."""
    suppressed = [False] * len(marking_points)
    for i in range(len(marking_points) - 1):
        for j in range(i + 1, len(marking_points)):
            distx = marking_points[i][0] - marking_points[j][0]
            disty = marking_points[i][1] - marking_points[j][1]
            dist_square = distx ** 2 + disty ** 2
            # minimum distance between marking points in the training set:
            # 40.309, so (40.309 / 600)^2 = 0.004513376 after normalization
            if dist_square < 0.0045:
                # Suppress whichever point has the lower confidence.
                idx = i if marking_points[i][3] < marking_points[j][3] else j
                suppressed[idx] = True
    if any(suppressed):
        new_marking_points = []
        for i, supres in enumerate(suppressed):
            if not supres:
                new_marking_points.append(marking_points[i])
        return new_marking_points
    return marking_points


def get_marking_points(prediction, thresh):
    """Get marking points from the predicted feature map."""
    assert isinstance(prediction, torch.Tensor)
    marking_points = []
    prediction = prediction.detach().cpu().numpy()
    for i in range(prediction.shape[1]):
        for j in range(prediction.shape[2]):
            if prediction[0, i, j] > thresh:
                # Cell index plus predicted offset, normalized to [0, 1].
                xval = (j + prediction[1, i, j]) / prediction.shape[2]
                yval = (i + prediction[2, i, j]) / prediction.shape[1]
                cos_value = prediction[3, i, j]
                sin_value = prediction[4, i, j]
                angle = math.atan2(sin_value, cos_value)
                marking_points.append([xval, yval, angle, prediction[0, i, j]])
    return non_maximum_suppression(marking_points)


def tensor2array(image_tensor, imtype=np.uint8):
    """Convert float image tensor to numpy ndarray."""
    assert isinstance(image_tensor, torch.Tensor)
    image_numpy = (image_tensor.detach().cpu().numpy()) * 255.0
    return image_numpy.astype(imtype)


def tensor2im(image_tensor, imtype=np.uint8):
    """Convert float image tensor to PIL Image."""
    image_numpy = np.transpose(tensor2array(image_tensor, imtype), (1, 2, 0))
    return Image.fromarray(image_numpy)
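A small decoding sketch using a hand-built feature map with a single confident cell (all values made up):

```python
import math
import torch
from utils import get_marking_points

# One 5x16x16 map with a single detection at cell (row=4, col=7).
prediction = torch.zeros(5, 16, 16)
prediction[0, 4, 7] = 0.9             # confidence
prediction[1, 4, 7] = 0.5             # offset_x within the cell
prediction[2, 4, 7] = 0.5             # offset_y within the cell
prediction[3, 4, 7] = math.cos(0.25)  # direction as a cos/sin pair
prediction[4, 4, 7] = math.sin(0.25)

points = get_marking_points(prediction, thresh=0.5)
# Approximately [[0.46875, 0.28125, 0.25, 0.9]]:
# x = (7 + 0.5) / 16, y = (4 + 0.5) / 16, angle = atan2(sin, cos).
print(points)
```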