Преглед на файлове

Improves docs and handling of entities and resuming by WandbLogger (#3264)

* adds latest tag to match wandb defaults

* adds entity handling, 'last' tag

* fixes bug causing finished runs to resume

* removes redundant "last" tag for wandb artifact
modifyDataloader
Charles Frye GitHub преди 3 години
родител
ревизия
19100ba007
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
променени са 2 файла, в които са добавени 25 реда и са изтрити 10 реда
  1. +1
    -1
      train.py
  2. +24
    -9
      utils/wandb_logging/wandb_utils.py

+ 1
- 1
train.py Целия файл

@@ -443,7 +443,7 @@ def train(hyp, opt, device, tb_writer=None):
if wandb_logger.wandb and not opt.evolve: # Log the stripped model
wandb_logger.wandb.log_artifact(str(final), type='model',
name='run_' + wandb_logger.wandb_run.id + '_model',
aliases=['last', 'best', 'stripped'])
aliases=['latest', 'best', 'stripped'])
wandb_logger.finish_run()
else:
dist.destroy_process_group()

+ 24
- 9
utils/wandb_logging/wandb_utils.py Целия файл

@@ -1,3 +1,4 @@
"""Utilities and tools for tracking runs with Weights & Biases."""
import json
import sys
from pathlib import Path
@@ -35,8 +36,9 @@ def get_run_info(run_path):
run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
run_id = run_path.stem
project = run_path.parent.stem
entity = run_path.parent.parent.stem
model_artifact_name = 'run_' + run_id + '_model'
return run_id, project, model_artifact_name
return entity, project, run_id, model_artifact_name


def check_wandb_resume(opt):
@@ -44,9 +46,9 @@ def check_wandb_resume(opt):
if isinstance(opt.resume, str):
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
if opt.global_rank not in [-1, 0]: # For resuming DDP runs
run_id, project, model_artifact_name = get_run_info(opt.resume)
entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
api = wandb.Api()
artifact = api.artifact(project + '/' + model_artifact_name + ':latest')
artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest')
modeldir = artifact.download()
opt.weights = str(Path(modeldir) / "last.pt")
return True
@@ -78,6 +80,18 @@ def process_wandb_config_ddp_mode(opt):


class WandbLogger():
"""Log training runs, datasets, models, and predictions to Weights & Biases.

This logger sends information to W&B at wandb.ai. By default, this information
includes hyperparameters, system configuration and metrics, model metrics,
and basic data metrics and analyses.

By providing additional command line arguments to train.py, datasets,
models and predictions can also be logged.

For more on how this logger is used, see the Weights & Biases documentation:
https://docs.wandb.com/guides/integrations/yolov5
"""
def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
# Pre-training routine --
self.job_type = job_type
@@ -85,16 +99,17 @@ class WandbLogger():
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
if isinstance(opt.resume, str): # checks resume from artifact
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
run_id, project, model_artifact_name = get_run_info(opt.resume)
entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
assert wandb, 'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
self.wandb_run = wandb.init(id=run_id, project=project, resume='allow')
self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow')
opt.resume = model_artifact_name
elif self.wandb:
self.wandb_run = wandb.init(config=opt,
resume="allow",
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
entity=opt.entity,
name=name,
job_type=job_type,
id=run_id) if not wandb.run else wandb.run
@@ -172,8 +187,8 @@ class WandbLogger():
modeldir = model_artifact.download()
epochs_trained = model_artifact.metadata.get('epochs_trained')
total_epochs = model_artifact.metadata.get('total_epochs')
assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % (
total_epochs)
is_finished = total_epochs is None
assert not is_finished, 'training is finished, can only resume incomplete runs.'
return modeldir, model_artifact
return None, None

@@ -188,7 +203,7 @@ class WandbLogger():
})
model_artifact.add_file(str(path / 'last.pt'), name='last.pt')
wandb.log_artifact(model_artifact,
aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
print("Saving model artifact on epoch ", epoch + 1)

def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
@@ -291,7 +306,7 @@ class WandbLogger():
if self.result_artifact:
train_results = wandb.JoinedTable(self.val_table, self.result_table, "id")
self.result_artifact.add(train_results, 'result')
wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch),
wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch),
('best' if best_result else '')])
self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"])
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")

Loading…
Отказ
Запис