import argparse
import logging
import math
import os
import random
import time
from pathlib import Path

import numpy as np
import torch  # torch.load, torch.save, torch.cuda, etc. are used throughout
import torch.distributed as dist
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.utils.data
import yaml
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

import test  # import test.py to get mAP after each epoch
from models.yolo import Model
from utils.datasets import create_dataloader
from utils.general import (
    torch_distributed_zero_first, labels_to_class_weights, plot_labels, check_anchors, labels_to_image_weights,
    compute_loss, plot_images, fitness, strip_optimizer, plot_results, get_latest_run, check_dataset, check_file,
    check_git_status, check_img_size, increment_dir, print_mutation, plot_evolution, set_logging)
from utils.google_utils import attempt_download
from utils.torch_utils import init_seeds, ModelEMA, select_device, intersect_dicts

logger = logging.getLogger(__name__)


def train(hyp, opt, device, tb_writer=None):
    logger.info(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve'  # logging directory
    wdir = str(log_dir / 'weights') + os.sep  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir + 'last.pt'
    best = wdir + 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank
    # TODO: Use DDP logging. Only the first process is allowed to log.

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg else []  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze
    freeze = ['', ]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False
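
    # Note on batch-size scaling: gradients are accumulated over `accumulate` batches so the effective
    # batch size stays near the nominal nbs=64 regardless of --batch-size, and weight decay is rescaled
    # in proportion so that regularization per optimizer step stays comparable.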

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2
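
    # The LambdaLR below applies a cosine schedule: the multiplier starts at 1.0 (lr = lr0) at epoch 0
    # and decays to 0.2 (lr = 0.2 * lr0) at the final epoch.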

    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # plot_lr_scheduler(optimizer, scheduler, epochs)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if epochs < start_epoch:
            logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                        (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]  # verify imgsz are gs-multiples
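
    # Multi-GPU setup: rank == -1 means a single process (optionally wrapped in DataParallel below);
    # rank != -1 means this process is one worker in a DistributedDataParallel (DDP) run.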

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        logger.info('Using SyncBatchNorm()')

    # Exponential moving average
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # Trainloader
    dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True,
                                            cache=opt.cache_images, rect=opt.rect, rank=rank,
                                            world_size=opt.world_size, workers=opt.workers)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        # rank is passed as -1 because only the first process is expected to do evaluation
        testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False,
                                       cache=opt.cache_images, rect=True, rank=-1, world_size=opt.world_size,
                                       workers=opt.workers)[0]

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        if tb_writer:
            # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
            tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        if not opt.noautoanchor:
            check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    logger.info('Image sizes %g train, %g test' % (imgsz, imgsz_test))
    logger.info('Using %g dataloader workers' % dataloader.num_workers)
    logger.info('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)
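
    # When the dataset uses image weights, images containing classes with low mAP are sampled more often
    # each epoch (weights ~ class_weights * (1 - maps) ** 2), and the sampled indices are broadcast from
    # rank 0 so that all DDP workers train on the same epoch composition.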

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
                image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
                dataset.indices = random.choices(range(dataset.n), weights=image_weights,
                                                 k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.tensor(dataset.indices, dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5 + gs)) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
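
            # Mixed precision: the forward pass and loss are computed under autocast, the scaled loss is
            # backpropagated, and the optimizer only steps every `accumulate` batches (gradient accumulation).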

            # Autocast
            with amp.autocast(enabled=cuda):
                # Forward
                pred = model(imgs)

                # Loss
                loss, loss_items = compute_loss(pred, targets.to(device), model)  # scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode
                # if not torch.isfinite(loss):
                #     logger.info('WARNING: non-finite loss, ending training ', loss_items)
                #     return results

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema is not None:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 6) % (
                    '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1])
                pbar.set_description(s)

                # Plot
                if ni < 3:
                    f = str(log_dir / ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs, targets=targets, paths=paths, fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema is not None:
                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                        'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                        'val/giou_loss', 'val/obj_loss', 'val/cls_loss']
                for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                    tb_writer.add_scalar(tag, x, epoch)
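
            # Checkpointing: fitness() reduces the validation metrics to a single scalar; last.pt is written
            # every epoch, while best.pt is only rewritten when that scalar matches the best seen so far.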

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {'epoch': epoch,
                            'best_fitness': best_fitness,
                            'training_results': f.read(),
                            'model': ema.ema.module if hasattr(ema, 'module') else ema.ema,
                            'optimizer': None if final_epoch else optimizer.state_dict()}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload

        # Finish
        if not opt.evolve:
            plot_results(save_dir=log_dir)  # save as results.png
        logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='', help='model.yaml path')
    parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
    parser.add_argument('--hyp', type=str, default='', help='hyperparameters path, i.e. data/hyp.scratch.yaml')
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    parser.add_argument('--resume', nargs='?', const='get_last', default=False,
                        help='resume from given path/last.pt, or most recent run if blank')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--notest', action='store_true', help='only test final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check')
    parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    parser.add_argument('--cache-images', action='store_true', help='cache images for faster training')
    parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
    parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
    parser.add_argument('--logdir', type=str, default='runs/', help='logging directory')
    parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
    opt = parser.parse_args()
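
    # WORLD_SIZE and RANK are set by the distributed launcher (e.g. python -m torch.distributed.launch);
    # in a plain single-process run they are absent and the defaults below (1 and -1) apply.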
    # Set DDP variables
    opt.total_batch_size = opt.batch_size
    opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1
    opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1
    set_logging(opt.global_rank)

    # Resume
    if opt.resume:
        last = get_latest_run() if opt.resume == 'get_last' else opt.resume  # resume from most recent run
        if last and not opt.weights:
            logger.info(f'Resuming training from {last}')
        opt.weights = last if opt.resume and not opt.weights else opt.weights

    if opt.global_rank in [-1, 0]:
        check_git_status()

    opt.hyp = opt.hyp or ('data/hyp.finetune.yaml' if opt.weights else 'data/hyp.scratch.yaml')
    opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
    assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
    opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
    device = select_device(opt.device, batch_size=opt.batch_size)

    # DDP mode
    if opt.local_rank != -1:
        assert torch.cuda.device_count() > opt.local_rank
        torch.cuda.set_device(opt.local_rank)
        device = torch.device('cuda', opt.local_rank)
        dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
        assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
        opt.batch_size = opt.total_batch_size // opt.world_size

    logger.info(opt)
    with open(opt.hyp) as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps

    # Train
    if not opt.evolve:
        tb_writer = None
        if opt.global_rank in [-1, 0]:
            logger.info('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
            tb_writer = SummaryWriter(log_dir=increment_dir(Path(opt.logdir) / 'exp', opt.name))  # runs/exp

        train(hyp, opt, device, tb_writer)
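
    # Hyperparameter evolution: each entry of the meta dict below is (mutation scale, lower limit, upper limit).
    # Every generation picks a parent from the fittest rows of evolve.txt (if any), perturbs it multiplicatively,
    # clips it to the limits, retrains, and logs the outcome via print_mutation().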

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'momentum': (0.1, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'giou': (1, 0.02, 0.2),  # GIoU loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (1, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (0, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (1, 0.0, 1.0),  # image flip left-right (probability)
                'mixup': (1, 0.0, 1.0)}  # image mixup (probability)

        assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
        opt.notest, opt.nosave = True, True  # only test/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        yaml_file = Path('runs/evolve/hyp_evolved.yaml')  # save best result here
        if opt.bucket:
            os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket)  # download evolve.txt if exists

        for _ in range(100):  # generations to evolve
            if os.path.exists('evolve.txt'):  # if evolve.txt exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt('evolve.txt', ndmin=2)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min()  # weights
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate
                mp, s = 0.9, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([x[0] for x in meta.values()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate

            # Constrain to limits
            for k, v in meta.items():
                hyp[k] = max(hyp[k], v[1])  # lower limit
                hyp[k] = min(hyp[k], v[2])  # upper limit
                hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation
            results = train(hyp.copy(), opt, device)

            # Write mutation results
            print_mutation(hyp.copy(), results, yaml_file, opt.bucket)

        # Plot results
        plot_evolution(yaml_file)
        print('Hyperparameter evolution complete. Best results saved as: %s\nCommand to train a new model with these '
              'hyperparameters: $ python train.py --hyp %s' % (yaml_file, yaml_file))