Remove DDP process group timeout (#4422)
parent 4e65052f28
commit 19d03a955c
train.py

@@ -493,7 +493,7 @@ def main(opt):
         assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
 
     # Train
     if not opt.evolve:
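Note on the train.py hunk: with the timeout argument removed, dist.init_process_group falls back to torch.distributed's default process-group timeout (30 minutes) instead of the 60-second window deleted here, so long-running collectives such as the barriers below are no longer subject to a one-minute limit, e.g. while rank 0 caches a large dataset. A minimal standalone sketch of the same setup pattern, assuming the job is started with a launcher (e.g. python -m torch.distributed.run) that sets the LOCAL_RANK environment variable:

    # Sketch only, not the repository's code: the DDP setup pattern from the hunk above.
    import os

    import torch
    import torch.distributed as dist

    LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # -1 means single-GPU / CPU, no DDP

    if LOCAL_RANK != -1:
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)
        # No explicit timeout: torch.distributed's default (timedelta(minutes=30)) applies.
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")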
utils/torch_utils.py

@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
     """
     if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
     yield
     if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])
 
 
 def init_torch_seeds(seed=0):
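Note on the torch_distributed_zero_first hunk: the context manager makes every rank execute the guarded block, but non-master ranks only start after the local master (rank 0) has finished it, so one-time work such as building a dataset cache happens exactly once; the new device_ids argument tells the NCCL barrier which GPU each process should use. A usage sketch under those assumptions; prepare_dataset and the path are hypothetical stand-ins for the guarded work:

    # Sketch only, not the repository's code.
    import os

    from utils.torch_utils import torch_distributed_zero_first  # the context manager edited above

    LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))

    def prepare_dataset(path):
        """Hypothetical one-time work: download, scan images, write a label cache."""
        ...

    # Ranks != 0 block at the first barrier inside the context manager; rank 0 runs the body,
    # and its barrier after `yield` releases the others, which then reuse the warm cache.
    with torch_distributed_zero_first(LOCAL_RANK):
        dataset = prepare_dataset('path/to/dataset')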