Browse Source

Improves docs and handling of entities and resuming by WandbLogger (#3264)

* adds latest tag to match wandb defaults

* adds entity handling, 'last' tag

* fixes bug causing finished runs to resume

* removes redundant "last" tag for wandb artifact
modifyDataloader
Charles Frye GitHub 3 years ago
parent
commit
19100ba007
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 10 deletions
  1. +1
    -1
      train.py
  2. +24
    -9
      utils/wandb_logging/wandb_utils.py

+ 1
- 1
train.py View File

if wandb_logger.wandb and not opt.evolve: # Log the stripped model if wandb_logger.wandb and not opt.evolve: # Log the stripped model
wandb_logger.wandb.log_artifact(str(final), type='model', wandb_logger.wandb.log_artifact(str(final), type='model',
name='run_' + wandb_logger.wandb_run.id + '_model', name='run_' + wandb_logger.wandb_run.id + '_model',
aliases=['last', 'best', 'stripped'])
aliases=['latest', 'best', 'stripped'])
wandb_logger.finish_run() wandb_logger.finish_run()
else: else:
dist.destroy_process_group() dist.destroy_process_group()

+ 24
- 9
utils/wandb_logging/wandb_utils.py View File

"""Utilities and tools for tracking runs with Weights & Biases."""
import json import json
import sys import sys
from pathlib import Path from pathlib import Path
run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX)) run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
run_id = run_path.stem run_id = run_path.stem
project = run_path.parent.stem project = run_path.parent.stem
entity = run_path.parent.parent.stem
model_artifact_name = 'run_' + run_id + '_model' model_artifact_name = 'run_' + run_id + '_model'
return run_id, project, model_artifact_name
return entity, project, run_id, model_artifact_name




def check_wandb_resume(opt): def check_wandb_resume(opt):
if isinstance(opt.resume, str): if isinstance(opt.resume, str):
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
if opt.global_rank not in [-1, 0]: # For resuming DDP runs if opt.global_rank not in [-1, 0]: # For resuming DDP runs
run_id, project, model_artifact_name = get_run_info(opt.resume)
entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
api = wandb.Api() api = wandb.Api()
artifact = api.artifact(project + '/' + model_artifact_name + ':latest')
artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest')
modeldir = artifact.download() modeldir = artifact.download()
opt.weights = str(Path(modeldir) / "last.pt") opt.weights = str(Path(modeldir) / "last.pt")
return True return True




class WandbLogger(): class WandbLogger():
"""Log training runs, datasets, models, and predictions to Weights & Biases.

This logger sends information to W&B at wandb.ai. By default, this information
includes hyperparameters, system configuration and metrics, model metrics,
and basic data metrics and analyses.

By providing additional command line arguments to train.py, datasets,
models and predictions can also be logged.

For more on how this logger is used, see the Weights & Biases documentation:
https://docs.wandb.com/guides/integrations/yolov5
"""
def __init__(self, opt, name, run_id, data_dict, job_type='Training'): def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
# Pre-training routine -- # Pre-training routine --
self.job_type = job_type self.job_type = job_type
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call # It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
if isinstance(opt.resume, str): # checks resume from artifact if isinstance(opt.resume, str): # checks resume from artifact
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
run_id, project, model_artifact_name = get_run_info(opt.resume)
entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
assert wandb, 'install wandb to resume wandb runs' assert wandb, 'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config # Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
self.wandb_run = wandb.init(id=run_id, project=project, resume='allow')
self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow')
opt.resume = model_artifact_name opt.resume = model_artifact_name
elif self.wandb: elif self.wandb:
self.wandb_run = wandb.init(config=opt, self.wandb_run = wandb.init(config=opt,
resume="allow", resume="allow",
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
entity=opt.entity,
name=name, name=name,
job_type=job_type, job_type=job_type,
id=run_id) if not wandb.run else wandb.run id=run_id) if not wandb.run else wandb.run
modeldir = model_artifact.download() modeldir = model_artifact.download()
epochs_trained = model_artifact.metadata.get('epochs_trained') epochs_trained = model_artifact.metadata.get('epochs_trained')
total_epochs = model_artifact.metadata.get('total_epochs') total_epochs = model_artifact.metadata.get('total_epochs')
assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % (
total_epochs)
is_finished = total_epochs is None
assert not is_finished, 'training is finished, can only resume incomplete runs.'
return modeldir, model_artifact return modeldir, model_artifact
return None, None return None, None


}) })
model_artifact.add_file(str(path / 'last.pt'), name='last.pt') model_artifact.add_file(str(path / 'last.pt'), name='last.pt')
wandb.log_artifact(model_artifact, wandb.log_artifact(model_artifact,
aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
print("Saving model artifact on epoch ", epoch + 1) print("Saving model artifact on epoch ", epoch + 1)


def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False): def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
if self.result_artifact: if self.result_artifact:
train_results = wandb.JoinedTable(self.val_table, self.result_table, "id") train_results = wandb.JoinedTable(self.val_table, self.result_table, "id")
self.result_artifact.add(train_results, 'result') self.result_artifact.add(train_results, 'result')
wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch),
wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch),
('best' if best_result else '')]) ('best' if best_result else '')])
self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"]) self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"])
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation") self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")

Loading…
Cancel
Save