Преглед на файлове

Remove DDP process group timeout (#4422)

modifyDataloader
Glenn Jocher GitHub преди 3 години
родител
ревизия
19d03a955c
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
променени са 2 файла, в които са добавени 3 реда и са изтрити 3 реда
  1. +1
    -1
      train.py
  2. +2
    -2
      utils/torch_utils.py

+ 1
- 1
train.py Целия файл

@@ -493,7 +493,7 @@ def main(opt):
assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
torch.cuda.set_device(LOCAL_RANK)
device = torch.device('cuda', LOCAL_RANK)
- dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+ dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

# Train
if not opt.evolve:

+ 2
- 2
utils/torch_utils.py Целия файл

@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
Decorator to make all processes in distributed training wait for each local_master to do something.
"""
if local_rank not in [-1, 0]:
- dist.barrier()
+ dist.barrier(device_ids=[local_rank])
yield
if local_rank == 0:
- dist.barrier()
+ dist.barrier(device_ids=[0])


def init_torch_seeds(seed=0):

Loading…
Отказ
Запис