Browse Source

Auto-fix corrupt JPEGs (#4548)

* Autofix corrupt JPEGs

This PR automatically re-saves corrupt JPEGs and trains with the resaved images. WARNING: this will overwrite the existing corrupt JPEGs in a dataset and replace them with correct JPEGs, though the filesize may increase and the image contents may not be exactly the same due to lossy JPEG compression schemes. Results may vary by JPEG decoder and hardware.

Current behavior is to exclude corrupt JPEGs from training with a warning to the user, but many users have been complaining about large parts of their dataset being excluded from training.

* Clarify re-save reason
modifyDataloader
Glenn Jocher GitHub 3 years ago
parent
commit
11f85e7e71
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 7 additions and 6 deletions
  1. +7
    -6
      utils/datasets.py

+ 7
- 6
utils/datasets.py View File

@@ -314,7 +314,7 @@ class LoadStreams: # multiple IP or RTSP cameras
print('') # newline

# check for common shapes
s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs], 0) # shapes
s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs])
self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
if not self.rect:
print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
@@ -568,7 +568,7 @@ class LoadImagesAndLabels(Dataset): # for training/testing
if self.augment:
# Albumentations
img, labels = self.albumentations(img, labels)
nl = len(labels) # update after albumentations
nl = len(labels) # update after albumentations

# HSV color-space
augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])
@@ -861,7 +861,7 @@ def autosplit(path='../datasets/coco128/images', weights=(0.9, 0.1, 0.0), annota
def verify_image_label(args):
# Verify one image-label pair
im_file, lb_file, prefix = args
nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, corrupt
nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', [] # number (missing, found, empty, corrupt), message, segments
try:
# verify images
im = Image.open(im_file)
@@ -872,10 +872,11 @@ def verify_image_label(args):
if im.format.lower() in ('jpg', 'jpeg'):
with open(im_file, 'rb') as f:
f.seek(-2, 2)
assert f.read() == b'\xff\xd9', 'corrupted JPEG'
if f.read() != b'\xff\xd9': # corrupt JPEG
im.save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'

# verify labels
segments = [] # instance segments
if os.path.isfile(lb_file):
nf = 1 # label found
with open(lb_file, 'r') as f:
@@ -896,7 +897,7 @@ def verify_image_label(args):
else:
nm = 1 # label missing
l = np.zeros((0, 5), dtype=np.float32)
return im_file, l, shape, segments, nm, nf, ne, nc, ''
return im_file, l, shape, segments, nm, nf, ne, nc, msg
except Exception as e:
nc = 1
msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'

Loading…
Cancel
Save