Remove DDP process group timeout (#4422)

2021-08-15 18:32:41 +02:00 · 2021-08-15 18:32:41 +02:00 · 19d03a955c
parent 4e65052f28
commit 19d03a955c
2 changed files with 3 additions and 3 deletions
--- a/train.py
+++ b/train.py
@@ -493,7 +493,7 @@ def main(opt):
        assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

    # Train
    if not opt.evolve:
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
    Decorator to make all processes in distributed training wait for each local_master to do something.
    """
    if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
    yield
    if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])


 def init_torch_seeds(seed=0):