Improves docs and handling of entities and resuming by WandbLogger (#3264)

* adds latest tag to match wandb defaults
* adds entity handling, 'last' tag
* fixes bug causing finished runs to resume
* removes redundant "last" tag for wandb artifact
2021-05-21 14:42:53 -07:00 · 2021-05-21 14:42:53 -07:00 · 19100ba007
parent dd7f0b7e05
commit 19100ba007
2 changed files with 25 additions and 10 deletions
--- a/train.py
+++ b/train.py
@ -443,7 +443,7 @@ def train(hyp, opt, device, tb_writer=None):
        if wandb_logger.wandb and not opt.evolve:  # Log the stripped model
            wandb_logger.wandb.log_artifact(str(final), type='model',
                                            name='run_' + wandb_logger.wandb_run.id + '_model',
-                                            aliases=['last', 'best', 'stripped'])
+                                            aliases=['latest', 'best', 'stripped'])
        wandb_logger.finish_run()
    else:
        dist.destroy_process_group()
--- a/utils/wandb_logging/wandb_utils.py
+++ b/utils/wandb_logging/wandb_utils.py
@ -1,3 +1,4 @@
 """Utilities and tools for tracking runs with Weights & Biases."""
 import json
 import sys
 from pathlib import Path
@ -35,8 +36,9 @@ def get_run_info(run_path):
    run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
    run_id = run_path.stem
    project = run_path.parent.stem
    entity = run_path.parent.parent.stem
    model_artifact_name = 'run_' + run_id + '_model'
-    return run_id, project, model_artifact_name
+    return entity, project, run_id, model_artifact_name
 def check_wandb_resume(opt):
@ -44,9 +46,9 @@ def check_wandb_resume(opt):
    if isinstance(opt.resume, str):
        if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
            if opt.global_rank not in [-1, 0]:  # For resuming DDP runs
-                run_id, project, model_artifact_name = get_run_info(opt.resume)
+                entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
                api = wandb.Api()
-                artifact = api.artifact(project + '/' + model_artifact_name + ':latest')
+                artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest')
                modeldir = artifact.download()
                opt.weights = str(Path(modeldir) / "last.pt")
            return True
@ -78,6 +80,18 @@ def process_wandb_config_ddp_mode(opt):
 class WandbLogger():
    """Log training runs, datasets, models, and predictions to Weights & Biases.
    This logger sends information to W&B at wandb.ai. By default, this information
    includes hyperparameters, system configuration and metrics, model metrics,
    and basic data metrics and analyses.
    By providing additional command line arguments to train.py, datasets,
    models and predictions can also be logged.
    For more on how this logger is used, see the Weights & Biases documentation:
    https://docs.wandb.com/guides/integrations/yolov5
    """
    def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
        # Pre-training routine --
        self.job_type = job_type
@ -85,16 +99,17 @@ class WandbLogger():
        # It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
        if isinstance(opt.resume, str):  # checks resume from artifact
            if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
-                run_id, project, model_artifact_name = get_run_info(opt.resume)
+                entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
                model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
                assert wandb, 'install wandb to resume wandb runs'
                # Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
-                self.wandb_run = wandb.init(id=run_id, project=project, resume='allow')
+                self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow')
                opt.resume = model_artifact_name
        elif self.wandb:
            self.wandb_run = wandb.init(config=opt,
                                        resume="allow",
                                        project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
                                        entity=opt.entity,
                                        name=name,
                                        job_type=job_type,
                                        id=run_id) if not wandb.run else wandb.run
@ -172,8 +187,8 @@ class WandbLogger():
            modeldir = model_artifact.download()
            epochs_trained = model_artifact.metadata.get('epochs_trained')
            total_epochs = model_artifact.metadata.get('total_epochs')
-            assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % (
+            is_finished = total_epochs is None
-                total_epochs)
+            assert not is_finished, 'training is finished, can only resume incomplete runs.'
            return modeldir, model_artifact
        return None, None
@ -188,7 +203,7 @@ class WandbLogger():
        })
        model_artifact.add_file(str(path / 'last.pt'), name='last.pt')
        wandb.log_artifact(model_artifact,
-                           aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
+                           aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
        print("Saving model artifact on epoch ", epoch + 1)
    def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
@ -291,7 +306,7 @@ class WandbLogger():
            if self.result_artifact:
                train_results = wandb.JoinedTable(self.val_table, self.result_table, "id")
                self.result_artifact.add(train_results, 'result')
-                wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch),
+                wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch),
                                                                  ('best' if best_result else '')])
                self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"])
                self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")