Browse Source

Autofix duplicate label handling (#5210)

* Autofix duplicate labels

This PR changes duplicate label handling: instead of reporting an error and ignoring the image-label pair, it now reports a warning and automatically fixes the image-label pair by removing the duplicate rows.

This should fix this common issue for users and allow everyone to get started and get a model trained faster and easier than before.

* sign fix

* Cleanup

* Increment cache version

* all to any fix
modifyDataloader
Glenn Jocher GitHub 3 years ago
parent
commit
991c654e81
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 12 additions and 8 deletions
  1. +12
    -8
      utils/datasets.py

+ 12
- 8
utils/datasets.py View File



class LoadImagesAndLabels(Dataset): class LoadImagesAndLabels(Dataset):
# YOLOv5 train_loader/val_loader, loads images and labels for training and validation # YOLOv5 train_loader/val_loader, loads images and labels for training and validation
cache_version = 0.5 # dataset labels *.cache version
cache_version = 0.6 # dataset labels *.cache version


def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False, def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
cache_images=False, single_cls=False, stride=32, pad=0.0, prefix=''): cache_images=False, single_cls=False, stride=32, pad=0.0, prefix=''):
f.seek(-2, 2) f.seek(-2, 2)
if f.read() != b'\xff\xd9': # corrupt JPEG if f.read() != b'\xff\xd9': # corrupt JPEG
Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image Image.open(im_file).save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'
msg = f'{prefix}WARNING: {im_file}: corrupt JPEG restored and saved'


# verify labels # verify labels
if os.path.isfile(lb_file): if os.path.isfile(lb_file):
segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...) segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...)
l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh) l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh)
l = np.array(l, dtype=np.float32) l = np.array(l, dtype=np.float32)
if len(l):
assert l.shape[1] == 5, 'labels require 5 columns each'
assert (l >= 0).all(), 'negative labels'
assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels'
assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels'
nl = len(l)
if nl:
assert l.shape[1] == 5, f'labels require 5 columns, {l.shape[1]} columns detected'
assert (l >= 0).all(), f'negative label values {l[l < 0]}'
assert (l[:, 1:] <= 1).all(), f'non-normalized or out of bounds coordinates {l[:, 1:][l[:, 1:] > 1]}'
l = np.unique(l, axis=0) # remove duplicate rows
if len(l) < nl:
segments = np.unique(segments, axis=0)
msg = f'{prefix}WARNING: {im_file}: {nl - len(l)} duplicate labels removed'
else: else:
ne = 1 # label empty ne = 1 # label empty
l = np.zeros((0, 5), dtype=np.float32) l = np.zeros((0, 5), dtype=np.float32)
return im_file, l, shape, segments, nm, nf, ne, nc, msg return im_file, l, shape, segments, nm, nf, ne, nc, msg
except Exception as e: except Exception as e:
nc = 1 nc = 1
msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
msg = f'{prefix}WARNING: {im_file}: ignoring corrupt image/label: {e}'
return [None, None, None, None, nm, nf, ne, nc, msg] return [None, None, None, None, nm, nf, ne, nc, msg]





Loading…
Cancel
Save