|
|
@@ -859,7 +859,7 @@ def flatten_recursive(path=DATASETS_DIR / 'coco128'): |
|
|
|
shutil.copyfile(file, new_path / Path(file).name) |
|
|
|
|
|
|
|
|
|
|
|
def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import *; extract_boxes() |
|
|
|
def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.dataloaders import *; extract_boxes() |
|
|
|
# Convert detection dataset into classification dataset, with one directory per class |
|
|
|
path = Path(path) # images dir |
|
|
|
shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing |
|
|
@@ -895,7 +895,7 @@ def extract_boxes(path=DATASETS_DIR / 'coco128'): # from utils.datasets import |
|
|
|
|
|
|
|
def autosplit(path=DATASETS_DIR / 'coco128/images', weights=(0.9, 0.1, 0.0), annotated_only=False): |
|
|
|
""" Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files |
|
|
|
Usage: from utils.datasets import *; autosplit() |
|
|
|
Usage: from utils.dataloaders import *; autosplit() |
|
|
|
Arguments |
|
|
|
path: Path to images directory |
|
|
|
weights: Train, val, test weights (list, tuple) |
|
|
@@ -972,29 +972,40 @@ def verify_image_label(args): |
|
|
|
def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profile=False, hub=False): |
|
|
|
""" Return dataset statistics dictionary with images and instances counts per split per class |
|
|
|
To run in parent directory: export PYTHONPATH="$PWD/yolov5" |
|
|
|
Usage1: from utils.datasets import *; dataset_stats('coco128.yaml', autodownload=True) |
|
|
|
Usage2: from utils.datasets import *; dataset_stats('path/to/coco128_with_yaml.zip') |
|
|
|
Usage1: from utils.dataloaders import *; dataset_stats('coco128.yaml', autodownload=True) |
|
|
|
Usage2: from utils.dataloaders import *; dataset_stats('path/to/coco128_with_yaml.zip') |
|
|
|
Arguments |
|
|
|
path: Path to data.yaml or data.zip (with data.yaml inside data.zip) |
|
|
|
autodownload: Attempt to download dataset if not found locally |
|
|
|
verbose: Print stats dictionary |
|
|
|
""" |
|
|
|
|
|
|
|
def round_labels(labels): |
|
|
|
def _round_labels(labels):
    # Update labels to integer class and 4 decimal place floats
    # Each row of 'labels' is unpacked as [class, *values]: the class is cast to int and every
    # remaining value is rounded with round(x, 4). Returns a new list of lists; input is not mutated.
    return [[int(c), *(round(x, 4) for x in points)] for c, *points in labels]
|
|
|
|
|
|
|
def unzip(path): |
|
|
|
# Unzip data.zip TODO: CONSTRAINT: path/to/abc.zip MUST unzip to 'path/to/abc/' |
|
|
|
def _find_yaml(dir):
    # Return the single *.yaml file belonging to dataset directory 'dir' (str or Path)
    # Raises AssertionError if no *.yaml is found, or if several are found and exactly one cannot be selected
    dir = Path(dir)  # accept plain strings as well as Path objects
    files = list(dir.glob('*.yaml')) or list(dir.rglob('*.yaml'))  # try root level first and then recursive
    assert files, f'No *.yaml file found in {dir}'
    if len(files) > 1:
        files = [f for f in files if f.stem == dir.stem]  # prefer *.yaml files that match dir name
        assert files, f'Multiple *.yaml files found in {dir}, only 1 *.yaml file allowed'
    # the name filter can still leave several matches (same stem in different sub-directories)
    assert len(files) == 1, f'Multiple *.yaml files found: {files}, only 1 *.yaml file allowed in {dir}'
    return files[0]
|
|
|
|
|
|
|
def _unzip(path):
    # Unzip data.zip. CONSTRAINT: path/to/abc.zip MUST unzip to path/to/abc/
    # 'path' is a Path to either a *.zip archive or a data.yaml file
    # Returns a tuple (zipped, data_dir, yaml_path); data_dir is None when 'path' is not a zip
    if str(path).endswith('.zip'):  # path is data.zip
        assert Path(path).is_file(), f'Error unzipping {path}, file not found'
        with ZipFile(path) as zf:  # close the archive handle once extraction is done
            zf.extractall(path=path.parent)  # unzip
        dir = path.with_suffix('')  # dataset directory == zip name
        # Verify the extracted directory before searching it, so a wrongly structured archive
        # produces a descriptive error rather than a bare StopIteration from an empty search
        assert dir.is_dir(), f'Error unzipping {path}, {dir} not found. path/to/abc.zip MUST unzip to path/to/abc/'
        return True, str(dir), _find_yaml(dir)  # zipped, data_dir, yaml_path
    else:  # path is data.yaml
        return False, None, path
|
|
|
|
|
|
|
def hub_ops(f, max_dim=1920): |
|
|
|
def _hub_ops(f, max_dim=1920): |
|
|
|
# HUB ops for 1 image 'f': resize and save at reduced quality in /dataset-hub for web/app viewing |
|
|
|
f_new = im_dir / Path(f).name # dataset-hub image filename |
|
|
|
try: # use PIL |
|
|
@@ -1012,7 +1023,7 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil |
|
|
|
im = cv2.resize(im, (int(im_width * r), int(im_height * r)), interpolation=cv2.INTER_AREA) |
|
|
|
cv2.imwrite(str(f_new), im) |
|
|
|
|
|
|
|
zipped, data_dir, yaml_path = unzip(Path(path)) |
|
|
|
zipped, data_dir, yaml_path = _unzip(Path(path)) |
|
|
|
with open(check_yaml(yaml_path), errors='ignore') as f: |
|
|
|
data = yaml.safe_load(f) # data dict |
|
|
|
if zipped: |
|
|
@@ -1038,12 +1049,12 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil |
|
|
|
'unlabelled': int(np.all(x == 0, 1).sum()), |
|
|
|
'per_class': (x > 0).sum(0).tolist()}, |
|
|
|
'labels': [{ |
|
|
|
str(Path(k).name): round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]} |
|
|
|
str(Path(k).name): _round_labels(v.tolist())} for k, v in zip(dataset.im_files, dataset.labels)]} |
|
|
|
|
|
|
|
if hub: |
|
|
|
im_dir = hub_dir / 'images' |
|
|
|
im_dir.mkdir(parents=True, exist_ok=True) |
|
|
|
for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'): |
|
|
|
for _ in tqdm(ThreadPool(NUM_THREADS).imap(_hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'): |
|
|
|
pass |
|
|
|
|
|
|
|
# Profile |