|
|
|
|
|
|
|
|
|
|
|
"""Utilities and tools for tracking runs with Weights & Biases.""" |
|
|
import json |
|
|
import json |
|
|
import sys |
|
|
import sys |
|
|
from pathlib import Path |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX)) |
|
|
run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX)) |
|
|
run_id = run_path.stem |
|
|
run_id = run_path.stem |
|
|
project = run_path.parent.stem |
|
|
project = run_path.parent.stem |
|
|
|
|
|
entity = run_path.parent.parent.stem |
|
|
model_artifact_name = 'run_' + run_id + '_model' |
|
|
model_artifact_name = 'run_' + run_id + '_model' |
|
|
return run_id, project, model_artifact_name |
|
|
|
|
|
|
|
|
return entity, project, run_id, model_artifact_name |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_wandb_resume(opt): |
|
|
def check_wandb_resume(opt): |
|
|
|
|
|
|
|
|
if isinstance(opt.resume, str): |
|
|
if isinstance(opt.resume, str): |
|
|
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): |
|
|
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): |
|
|
if opt.global_rank not in [-1, 0]: # For resuming DDP runs |
|
|
if opt.global_rank not in [-1, 0]: # For resuming DDP runs |
|
|
run_id, project, model_artifact_name = get_run_info(opt.resume) |
|
|
|
|
|
|
|
|
entity, project, run_id, model_artifact_name = get_run_info(opt.resume) |
|
|
api = wandb.Api() |
|
|
api = wandb.Api() |
|
|
artifact = api.artifact(project + '/' + model_artifact_name + ':latest') |
|
|
|
|
|
|
|
|
artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest') |
|
|
modeldir = artifact.download() |
|
|
modeldir = artifact.download() |
|
|
opt.weights = str(Path(modeldir) / "last.pt") |
|
|
opt.weights = str(Path(modeldir) / "last.pt") |
|
|
return True |
|
|
return True |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WandbLogger(): |
|
|
class WandbLogger(): |
|
|
|
|
|
"""Log training runs, datasets, models, and predictions to Weights & Biases. |
|
|
|
|
|
|
|
|
|
|
|
This logger sends information to W&B at wandb.ai. By default, this information |
|
|
|
|
|
includes hyperparameters, system configuration and metrics, model metrics, |
|
|
|
|
|
and basic data metrics and analyses. |
|
|
|
|
|
|
|
|
|
|
|
By providing additional command line arguments to train.py, datasets, |
|
|
|
|
|
models and predictions can also be logged. |
|
|
|
|
|
|
|
|
|
|
|
For more on how this logger is used, see the Weights & Biases documentation: |
|
|
|
|
|
https://docs.wandb.com/guides/integrations/yolov5 |
|
|
|
|
|
""" |
|
|
def __init__(self, opt, name, run_id, data_dict, job_type='Training'): |
|
|
def __init__(self, opt, name, run_id, data_dict, job_type='Training'): |
|
|
# Pre-training routine -- |
|
|
# Pre-training routine -- |
|
|
self.job_type = job_type |
|
|
self.job_type = job_type |
|
|
|
|
|
|
|
|
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call |
|
|
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call |
|
|
if isinstance(opt.resume, str): # checks resume from artifact |
|
|
if isinstance(opt.resume, str): # checks resume from artifact |
|
|
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): |
|
|
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): |
|
|
run_id, project, model_artifact_name = get_run_info(opt.resume) |
|
|
|
|
|
|
|
|
entity, project, run_id, model_artifact_name = get_run_info(opt.resume) |
|
|
model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name |
|
|
model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name |
|
|
assert wandb, 'install wandb to resume wandb runs' |
|
|
assert wandb, 'install wandb to resume wandb runs' |
|
|
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config |
|
|
# Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config |
|
|
self.wandb_run = wandb.init(id=run_id, project=project, resume='allow') |
|
|
|
|
|
|
|
|
self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow') |
|
|
opt.resume = model_artifact_name |
|
|
opt.resume = model_artifact_name |
|
|
elif self.wandb: |
|
|
elif self.wandb: |
|
|
self.wandb_run = wandb.init(config=opt, |
|
|
self.wandb_run = wandb.init(config=opt, |
|
|
resume="allow", |
|
|
resume="allow", |
|
|
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, |
|
|
project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, |
|
|
|
|
|
entity=opt.entity, |
|
|
name=name, |
|
|
name=name, |
|
|
job_type=job_type, |
|
|
job_type=job_type, |
|
|
id=run_id) if not wandb.run else wandb.run |
|
|
id=run_id) if not wandb.run else wandb.run |
|
|
|
|
|
|
|
|
modeldir = model_artifact.download() |
|
|
modeldir = model_artifact.download() |
|
|
epochs_trained = model_artifact.metadata.get('epochs_trained') |
|
|
epochs_trained = model_artifact.metadata.get('epochs_trained') |
|
|
total_epochs = model_artifact.metadata.get('total_epochs') |
|
|
total_epochs = model_artifact.metadata.get('total_epochs') |
|
|
assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % ( |
|
|
|
|
|
total_epochs) |
|
|
|
|
|
|
|
|
is_finished = total_epochs is None |
|
|
|
|
|
assert not is_finished, 'training is finished, can only resume incomplete runs.' |
|
|
return modeldir, model_artifact |
|
|
return modeldir, model_artifact |
|
|
return None, None |
|
|
return None, None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}) |
|
|
}) |
|
|
model_artifact.add_file(str(path / 'last.pt'), name='last.pt') |
|
|
model_artifact.add_file(str(path / 'last.pt'), name='last.pt') |
|
|
wandb.log_artifact(model_artifact, |
|
|
wandb.log_artifact(model_artifact, |
|
|
aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else '']) |
|
|
|
|
|
|
|
|
aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else '']) |
|
|
print("Saving model artifact on epoch ", epoch + 1) |
|
|
print("Saving model artifact on epoch ", epoch + 1) |
|
|
|
|
|
|
|
|
def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False): |
|
|
def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False): |
|
|
|
|
|
|
|
|
if self.result_artifact: |
|
|
if self.result_artifact: |
|
|
train_results = wandb.JoinedTable(self.val_table, self.result_table, "id") |
|
|
train_results = wandb.JoinedTable(self.val_table, self.result_table, "id") |
|
|
self.result_artifact.add(train_results, 'result') |
|
|
self.result_artifact.add(train_results, 'result') |
|
|
wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch), |
|
|
|
|
|
|
|
|
wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), |
|
|
('best' if best_result else '')]) |
|
|
('best' if best_result else '')]) |
|
|
self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"]) |
|
|
self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"]) |
|
|
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation") |
|
|
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation") |