ソースを参照

Improves docs and handling of entities and resuming by WandbLogger (#3264)

* adds latest tag to match wandb defaults

* adds entity handling, 'last' tag

* fixes bug causing finished runs to resume

* removes redundant "last" tag for wandb artifact
modifyDataloader
Charles Frye GitHub 3年前
コミット
19100ba007
この署名に対応する既知のキーがデータベースに存在しません GPGキーID: 4AEE18F83AFDEB23
2個のファイルの変更、25行の追加、10行の削除
  1. train.py (+1, -1)
  2. utils/wandb_logging/wandb_utils.py (+24, -9)

+ 1
- 1
train.py ファイルの表示

@@ -443,7 +443,7 @@ def train(hyp, opt, device, tb_writer=None):
if wandb_logger.wandb and not opt.evolve: # Log the stripped model
wandb_logger.wandb.log_artifact(str(final), type='model',
name='run_' + wandb_logger.wandb_run.id + '_model',
-aliases=['last', 'best', 'stripped'])
+aliases=['latest', 'best', 'stripped'])
wandb_logger.finish_run()
else:
dist.destroy_process_group()

+ 24
- 9
utils/wandb_logging/wandb_utils.py ファイルの表示

@@ -1,3 +1,4 @@
+"""Utilities and tools for tracking runs with Weights & Biases."""
import json
import sys
from pathlib import Path
@@ -35,8 +36,9 @@ def get_run_info(run_path):
run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX))
run_id = run_path.stem
project = run_path.parent.stem
+entity = run_path.parent.parent.stem
model_artifact_name = 'run_' + run_id + '_model'
-return run_id, project, model_artifact_name
+return entity, project, run_id, model_artifact_name


def check_wandb_resume(opt):
@@ -44,9 +46,9 @@ def check_wandb_resume(opt):
if isinstance(opt.resume, str):
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
if opt.global_rank not in [-1, 0]: # For resuming DDP runs
-run_id, project, model_artifact_name = get_run_info(opt.resume)
+entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
api = wandb.Api()
-artifact = api.artifact(project + '/' + model_artifact_name + ':latest')
+artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest')
modeldir = artifact.download()
opt.weights = str(Path(modeldir) / "last.pt")
return True
@@ -78,6 +80,18 @@ def process_wandb_config_ddp_mode(opt):


class WandbLogger():
+"""Log training runs, datasets, models, and predictions to Weights & Biases.
+
+This logger sends information to W&B at wandb.ai. By default, this information
+includes hyperparameters, system configuration and metrics, model metrics,
+and basic data metrics and analyses.
+
+By providing additional command line arguments to train.py, datasets,
+models and predictions can also be logged.
+
+For more on how this logger is used, see the Weights & Biases documentation:
+https://docs.wandb.com/guides/integrations/yolov5
+"""
def __init__(self, opt, name, run_id, data_dict, job_type='Training'):
# Pre-training routine --
self.job_type = job_type
@@ -85,16 +99,17 @@ class WandbLogger():
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
if isinstance(opt.resume, str): # checks resume from artifact
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
-run_id, project, model_artifact_name = get_run_info(opt.resume)
+entity, project, run_id, model_artifact_name = get_run_info(opt.resume)
model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name
assert wandb, 'install wandb to resume wandb runs'
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config
-self.wandb_run = wandb.init(id=run_id, project=project, resume='allow')
+self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow')
opt.resume = model_artifact_name
elif self.wandb:
self.wandb_run = wandb.init(config=opt,
resume="allow",
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem,
+entity=opt.entity,
name=name,
job_type=job_type,
id=run_id) if not wandb.run else wandb.run
@@ -172,8 +187,8 @@ class WandbLogger():
modeldir = model_artifact.download()
epochs_trained = model_artifact.metadata.get('epochs_trained')
total_epochs = model_artifact.metadata.get('total_epochs')
-assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % (
-total_epochs)
+is_finished = total_epochs is None
+assert not is_finished, 'training is finished, can only resume incomplete runs.'
return modeldir, model_artifact
return None, None

@@ -188,7 +203,7 @@ class WandbLogger():
})
model_artifact.add_file(str(path / 'last.pt'), name='last.pt')
wandb.log_artifact(model_artifact,
-aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
+aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else ''])
print("Saving model artifact on epoch ", epoch + 1)

def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False):
@@ -291,7 +306,7 @@ class WandbLogger():
if self.result_artifact:
train_results = wandb.JoinedTable(self.val_table, self.result_table, "id")
self.result_artifact.add(train_results, 'result')
-wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch),
+wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch),
('best' if best_result else '')])
self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"])
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")

読み込み中…
キャンセル
保存