Improves docs and handling of entities and resuming by WandbLogger (#3264)

* adds latest tag to match wandb defaults

* adds entity handling, 'last' tag

* fixes bug causing finished runs to resume

* removes redundant "last" tag for wandb artifact
This commit is contained in:
Charles Frye 2021-05-21 14:42:53 -07:00 committed by GitHub
parent dd7f0b7e05
commit 19100ba007
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 10 deletions

View File

@@ -443,7 +443,7 @@ def train(hyp, opt, device, tb_writer=None):
if wandb_logger.wandb and not opt.evolve: # Log the stripped model if wandb_logger.wandb and not opt.evolve: # Log the stripped model
wandb_logger.wandb.log_artifact(str(final), type='model', wandb_logger.wandb.log_artifact(str(final), type='model',
name='run_' + wandb_logger.wandb_run.id + '_model', name='run_' + wandb_logger.wandb_run.id + '_model',
aliases=['last', 'best', 'stripped']) aliases=['latest', 'best', 'stripped'])
wandb_logger.finish_run() wandb_logger.finish_run()
else: else:
dist.destroy_process_group() dist.destroy_process_group()

View File

@@ -1,3 +1,4 @@
"""Utilities and tools for tracking runs with Weights & Biases."""
import json import json
import sys import sys
from pathlib import Path from pathlib import Path
@@ -35,8 +36,9 @@ def get_run_info(run_path):
run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX)) run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
run_id = run_path.stem run_id = run_path.stem
project = run_path.parent.stem project = run_path.parent.stem
entity = run_path.parent.parent.stem
model_artifact_name = 'run_' + run_id + '_model' model_artifact_name = 'run_' + run_id + '_model'
return run_id, project, model_artifact_name return entity, project, run_id, model_artifact_name
def check_wandb_resume(opt): def check_wandb_resume(opt):
@@ -44,9 +46,9 @@ def check_wandb_resume(opt):
if isinstance(opt.resume, str): if isinstance(opt.resume, str):
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
if opt.global_rank not in [-1, 0]: # For resuming DDP runs if opt.global_rank not in [-1, 0]: # For resuming DDP runs
run_id, project, model_artifact_name = get_run_info(opt.resume) entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
api = wandb.Api() api = wandb.Api()
artifact = api.artifact(project + '/' + model_artifact_name + ':latest') artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest')
modeldir = artifact.download() modeldir = artifact.download()
opt.weights = str(Path(modeldir) / "last.pt") opt.weights = str(Path(modeldir) / "last.pt")
return True return True
@@ -78,6 +80,18 @@ def process_wandb_config_ddp_mode(opt):
class WandbLogger(): class WandbLogger():
"""Log training runs, datasets, models, and predictions to Weights & Biases.
This logger sends information to W&B at wandb.ai. By default, this information
includes hyperparameters, system configuration and metrics, model metrics,
and basic data metrics and analyses.
By providing additional command line arguments to train.py, datasets,
models and predictions can also be logged.
For more on how this logger is used, see the Weights & Biases documentation:
https://docs.wandb.com/guides/integrations/yolov5
"""
def __init__(self, opt, name, run_id, data_dict, job_type='Training'): def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
# Pre-training routine -- # Pre-training routine --
self.job_type = job_type self.job_type = job_type
@@ -85,16 +99,17 @@ class WandbLogger():
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call # It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
if isinstance(opt.resume, str): # checks resume from artifact if isinstance(opt.resume, str): # checks resume from artifact
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
run_id, project, model_artifact_name = get_run_info(opt.resume) entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
assert wandb, 'install wandb to resume wandb runs' assert wandb, 'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config # Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
self.wandb_run = wandb.init(id=run_id, project=project, resume='allow') self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow')
opt.resume = model_artifact_name opt.resume = model_artifact_name
elif self.wandb: elif self.wandb:
self.wandb_run = wandb.init(config=opt, self.wandb_run = wandb.init(config=opt,
resume="allow", resume="allow",
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
entity=opt.entity,
name=name, name=name,
job_type=job_type, job_type=job_type,
id=run_id) if not wandb.run else wandb.run id=run_id) if not wandb.run else wandb.run
@@ -172,8 +187,8 @@ class WandbLogger():
modeldir = model_artifact.download() modeldir = model_artifact.download()
epochs_trained = model_artifact.metadata.get('epochs_trained') epochs_trained = model_artifact.metadata.get('epochs_trained')
total_epochs = model_artifact.metadata.get('total_epochs') total_epochs = model_artifact.metadata.get('total_epochs')
assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % ( is_finished = total_epochs is None
total_epochs) assert not is_finished, 'training is finished, can only resume incomplete runs.'
return modeldir, model_artifact return modeldir, model_artifact
return None, None return None, None
@@ -188,7 +203,7 @@ class WandbLogger():
}) })
model_artifact.add_file(str(path / 'last.pt'), name='last.pt') model_artifact.add_file(str(path / 'last.pt'), name='last.pt')
wandb.log_artifact(model_artifact, wandb.log_artifact(model_artifact,
aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else '']) aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
print("Saving model artifact on epoch ", epoch + 1) print("Saving model artifact on epoch ", epoch + 1)
def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False): def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
@@ -291,7 +306,7 @@ class WandbLogger():
if self.result_artifact: if self.result_artifact:
train_results = wandb.JoinedTable(self.val_table, self.result_table, "id") train_results = wandb.JoinedTable(self.val_table, self.result_table, "id")
self.result_artifact.add(train_results, 'result') self.result_artifact.add(train_results, 'result')
wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch), wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch),
('best' if best_result else '')]) ('best' if best_result else '')])
self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"]) self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"])
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation") self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")