* Amazon AWS EC2 startup and re-startup scripts * Create resume.py * cleanup5.0
# AWS EC2 instance startup 'MIME' script https://aws.amazon.com/premiumsupport/knowledge-center/execute-user-data-ec2/ | |||||
# This script will run on every instance restart, not only on first start | |||||
# --- DO NOT COPY ABOVE COMMENTS WHEN PASTING INTO USERDATA --- | |||||
Content-Type: multipart/mixed; boundary="//" | |||||
MIME-Version: 1.0 | |||||
--// | |||||
Content-Type: text/cloud-config; charset="us-ascii" | |||||
MIME-Version: 1.0 | |||||
Content-Transfer-Encoding: 7bit | |||||
Content-Disposition: attachment; filename="cloud-config.txt" | |||||
#cloud-config | |||||
cloud_final_modules: | |||||
- [scripts-user, always] | |||||
--// | |||||
Content-Type: text/x-shellscript; charset="us-ascii" | |||||
MIME-Version: 1.0 | |||||
Content-Transfer-Encoding: 7bit | |||||
Content-Disposition: attachment; filename="userdata.txt" | |||||
#!/bin/bash | |||||
# --- paste contents of userdata.sh here --- | |||||
--// |
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py | |||||
import os | |||||
from pathlib import Path | |||||
import torch | |||||
import yaml | |||||
# Scan the current working directory tree for 'last.pt' checkpoints and
# relaunch training for every one that still carries optimizer state.
# Each relaunch is a detached background shell command.
port = 0  # --master_port, incremented so every DDP job gets its own port
path = Path('').resolve()  # search root: the current working directory
for last in path.rglob('*/**/last.pt'):
    # NOTE: torch.load unpickles the file; only run this on checkpoints you trust.
    # map_location='cpu' avoids allocating GPU memory just to inspect one key.
    ckpt = torch.load(last, map_location='cpu')
    if ckpt['optimizer'] is None:  # no optimizer state (presumably a finished run), skip
        continue

    # Load opt.yaml (expected two directory levels above last.pt)
    with open(last.parent.parent / 'opt.yaml') as f:
        opt = yaml.load(f, Loader=yaml.SafeLoader)

    # Get device count
    # ''.split(',') returns [''], so empty entries are dropped; otherwise an
    # unspecified device would be counted as one device.
    d = [x for x in str(opt['device']).split(',') if x.strip()]  # devices
    nd = len(d)  # number of devices
    if nd == 0:  # no device specified: fall back to every visible GPU
        nd = torch.cuda.device_count()
    ddp = nd > 1  # distributed data parallel

    if ddp:  # multi-GPU
        port += 1
        cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
    else:  # single-GPU
        cmd = f'python train.py --resume {last}'

    cmd += ' > /dev/null 2>&1 &'  # redirect output to dev/null and run in the background
    print(cmd)
    os.system(cmd)
#!/bin/bash
# AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html
# This script will run only once on first instance start (for a re-start script see mime.sh)
# /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir
# Use >300 GB SSD

# Absolute path so the script does not depend on being launched from /.
# Abort if the directory is missing rather than cloning into the wrong place.
cd /home/ubuntu || exit 1
if [ ! -d yolov5 ]; then
  echo "Running first-time script." # install dependencies, download COCO, pull Docker
  git clone https://github.com/ultralytics/yolov5 && sudo chmod -R 777 yolov5
  cd yolov5 || exit 1
  # Dataset download and Docker pull run in the background, in parallel
  bash data/scripts/get_coco.sh && echo "Data done." &
  sudo docker pull ultralytics/yolov5:latest && echo "Docker done." &
  # python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." &
else
  echo "Running re-start script." # resume interrupted runs
  i=0
  list=$(docker ps -qa) # container list i.e. $'one\ntwo\nthree\nfour'
  while IFS= read -r id; do
    # The here-string yields one empty line when there are no containers
    [ -z "$id" ] && continue
    ((i++))
    echo "restarting container $i: $id"
    docker start "$id"
    # docker exec -it $id python train.py --resume # single-GPU
    docker exec -d "$id" python utils/aws/resume.py
  done <<<"$list"
fi