|
| 1 | +import os |
| 2 | +import cv2 |
| 3 | +import copy |
| 4 | +import numpy as np |
| 5 | +import imgaug as ia |
| 6 | +from imgaug import augmenters as iaa |
| 7 | +from keras.utils import Sequence |
| 8 | +import xml.etree.ElementTree as ET |
| 9 | +from utils import BoundBox, bbox_iou |
| 10 | + |
def parse_annotation(ann_dir, img_dir, labels=None):
    """Parse every XML annotation file in ``ann_dir`` into image records.

    Each file is read with ``xml.etree.ElementTree``. Tags are matched by
    substring (``'filename' in elem.tag`` and so on), so any tag whose name
    contains the keyword is picked up.

    Args:
        ann_dir: Directory holding the annotation files. Every entry in it
            is parsed as XML, in sorted filename order.
        img_dir: Directory that is joined with the ``filename`` value found
            in each annotation to build the image path.
        labels: Optional collection of object names to keep. When empty or
            ``None`` every object is kept.

    Returns:
        A tuple ``(all_imgs, seen_labels)``:

        * ``all_imgs`` is a list of dicts with the keys ``'filename'``,
          ``'width'``, ``'height'`` (when present in the XML) and
          ``'object'``, a list of dicts holding ``'name'``, ``'xmin'``,
          ``'ymin'``, ``'xmax'`` and ``'ymax'``. Images without any kept
          object are omitted.
        * ``seen_labels`` maps every object name encountered (kept or not)
          to the number of times it occurred.
    """
    # Avoid a shared mutable default; None behaves like "no filter".
    if labels is None:
        labels = []
    filter_by_label = len(labels) > 0

    all_imgs = []
    seen_labels = {}

    for ann in sorted(os.listdir(ann_dir)):
        img = {'object': []}

        # os.path.join works with or without a trailing separator on the
        # directory, unlike plain string concatenation.
        tree = ET.parse(os.path.join(ann_dir, ann))

        for elem in tree.iter():
            if 'filename' in elem.tag:
                img['filename'] = os.path.join(img_dir, elem.text)
            if 'width' in elem.tag:
                img['width'] = int(elem.text)
            if 'height' in elem.tag:
                img['height'] = int(elem.text)
            if 'object' in elem.tag or 'part' in elem.tag:
                obj = {}

                for attr in list(elem):
                    if 'name' in attr.tag:
                        name = attr.text
                        obj['name'] = name

                        # Count every label, including ones filtered out below.
                        seen_labels[name] = seen_labels.get(name, 0) + 1

                        if filter_by_label and name not in labels:
                            # Unwanted class: stop reading this object.
                            break

                        # The dict is registered now and filled in place
                        # when the bounding box child is reached.
                        img['object'] += [obj]

                    if 'bndbox' in attr.tag:
                        for dim in list(attr):
                            # Coordinates may be written as floats; round to
                            # the nearest integer pixel.
                            if 'xmin' in dim.tag:
                                obj['xmin'] = int(round(float(dim.text)))
                            if 'ymin' in dim.tag:
                                obj['ymin'] = int(round(float(dim.text)))
                            if 'xmax' in dim.tag:
                                obj['xmax'] = int(round(float(dim.text)))
                            if 'ymax' in dim.tag:
                                obj['ymax'] = int(round(float(dim.text)))

        if len(img['object']) > 0:
            all_imgs += [img]

    return all_imgs, seen_labels
| 59 | + |
class BatchGenerator(Sequence):
    """Keras ``Sequence`` producing augmented image batches with box targets.

    ``images`` is a list of dicts carrying a ``'filename'`` and an
    ``'object'`` list (dicts with ``'name'``, ``'xmin'``, ``'ymin'``,
    ``'xmax'``, ``'ymax'``). ``config`` is a dict read for the keys
    ``'ANCHORS'``, ``'BATCH_SIZE'``, ``'LABELS'``, ``'IMAGE_H'``,
    ``'IMAGE_W'``, ``'GRID_H'``, ``'GRID_W'``, ``'BOX'`` and
    ``'TRUE_BOX_BUFFER'``.
    """

    def __init__(self, images,
                 config,
                 shuffle=True,
                 jitter=True,
                 norm=None):
        """Store the dataset and build the anchor list and augmentation pipeline.

        Args:
            images: List of image records; shuffled in place when ``shuffle``.
            config: Configuration dict (see class docstring for keys read).
            shuffle: Shuffle ``images`` now and at the end of every epoch.
            jitter: Apply random scale/translate/flip plus the imgaug
                pipeline in ``aug_image``.
            norm: Optional callable applied to each image before it is put
                into the batch. When ``None`` the raw image is used and the
                boxes are drawn onto it as a visual sanity check.
        """
        self.generator = None

        self.images = images
        self.config = config

        self.shuffle = shuffle
        self.jitter = jitter
        self.norm = norm

        # ANCHORS is read as a flat list of consecutive value pairs; each
        # pair becomes the 3rd and 4th argument of a BoundBox placed at (0, 0).
        anchors = config['ANCHORS']
        self.anchors = [BoundBox(0, 0, anchors[2*i], anchors[2*i+1])
                        for i in range(len(anchors)//2)]

        ### augmentors by https://github.com/aleju/imgaug
        sometimes = lambda aug: iaa.Sometimes(0.5, aug)

        # Define our sequence of augmentation steps that will be applied to every image
        # All augmenters with per_channel=0.5 will sample one value _per image_
        # in 50% of all cases. In all other cases they will sample new values
        # _per channel_.
        #
        # Only the image goes through this pipeline (see aug_image); box
        # coordinates do not, so no geometric parameters are passed to Affine.
        self.aug_pipe = iaa.Sequential(
            [
                # Affine is constructed with its default arguments only
                sometimes(iaa.Affine()),
                # execute 0 to 5 of the following (less important) augmenters per image
                # don't execute all of them, as that would often be way too strong
                iaa.SomeOf((0, 5),
                    [
                        iaa.OneOf([
                            iaa.GaussianBlur((0, 3.0)),  # blur images with a sigma between 0 and 3.0
                            iaa.AverageBlur(k=(2, 7)),  # blur image using local means with kernel sizes between 2 and 7
                            iaa.MedianBlur(k=(3, 11)),  # blur image using local medians with kernel sizes between 3 and 11
                        ]),
                        iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)),  # sharpen images
                        iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5),  # add gaussian noise to images
                        iaa.OneOf([
                            iaa.Dropout((0.01, 0.1), per_channel=0.5),  # randomly remove up to 10% of the pixels
                        ]),
                        iaa.Add((-10, 10), per_channel=0.5),  # change brightness of images (by -10 to 10 of original value)
                        iaa.Multiply((0.5, 1.5), per_channel=0.5),  # change brightness of images (50-150% of original value)
                        iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5),  # improve or worsen the contrast
                    ],
                    random_order=True
                )
            ],
            random_order=True
        )

        if shuffle:
            np.random.shuffle(self.images)

    def __len__(self):
        """Return the number of batches per epoch (last one counted if partial)."""
        return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE']))

    def num_classes(self):
        """Return the number of entries in ``config['LABELS']``."""
        return len(self.config['LABELS'])

    def size(self):
        """Return the number of image records."""
        return len(self.images)

    def load_annotation(self, i):
        """Return the boxes of image ``i`` as an array of
        ``[xmin, ymin, xmax, ymax, label_index]`` rows.

        An image without objects yields ``np.array([[]])``.
        """
        annots = []

        for obj in self.images[i]['object']:
            annot = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax'],
                     self.config['LABELS'].index(obj['name'])]
            annots += [annot]

        if len(annots) == 0:
            annots = [[]]

        return np.array(annots)

    def load_image(self, i):
        """Read image ``i`` from disk with ``cv2.imread`` (no augmentation)."""
        return cv2.imread(self.images[i]['filename'])

    def __getitem__(self, idx):
        """Build batch number ``idx``.

        Returns:
            ``([x_batch, b_batch], y_batch)`` where

            * ``x_batch`` has shape ``(BATCH_SIZE, IMAGE_H, IMAGE_W, 3)``,
            * ``b_batch`` has shape ``(BATCH_SIZE, 1, 1, 1, TRUE_BOX_BUFFER, 4)``
              and holds the ground-truth boxes of each image,
            * ``y_batch`` has shape
              ``(BATCH_SIZE, GRID_H, GRID_W, BOX, 4 + 1 + len(LABELS))``.

            Boxes are expressed in grid-cell units as
            ``[center_x, center_y, width, height]``.
        """
        cfg = self.config
        batch_size = cfg['BATCH_SIZE']

        l_bound = idx*batch_size
        r_bound = (idx+1)*batch_size

        if r_bound > len(self.images):
            # Shift the window back so the final batch is still full-size;
            # it then overlaps the previous batch.
            r_bound = len(self.images)
            l_bound = r_bound - batch_size

        # With fewer images than BATCH_SIZE the start would be negative and
        # the slice would wrap around, silently dropping images. Clamp the
        # slice start; the arrays keep BATCH_SIZE rows and any rows left
        # unfilled stay zero.
        l_bound = max(l_bound, 0)

        instance_count = 0

        x_batch = np.zeros((batch_size, cfg['IMAGE_H'], cfg['IMAGE_W'], 3))  # input images
        b_batch = np.zeros((batch_size, 1, 1, 1, cfg['TRUE_BOX_BUFFER'], 4))  # up to TRUE_BOX_BUFFER ground-truth boxes per image
        y_batch = np.zeros((batch_size, cfg['GRID_H'], cfg['GRID_W'], cfg['BOX'], 4+1+len(cfg['LABELS'])))  # desired network output

        # Size of one grid cell in pixels along each axis.
        cell_w = float(cfg['IMAGE_W']) / cfg['GRID_W']
        cell_h = float(cfg['IMAGE_H']) / cfg['GRID_H']

        for train_instance in self.images[l_bound:r_bound]:
            # augment input image and fix object's position and size
            img, all_objs = self.aug_image(train_instance, jitter=self.jitter)

            # construct output from object's x, y, w, h
            true_box_index = 0

            for obj in all_objs:
                # Skip boxes that collapsed after clamping and unknown labels.
                if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in cfg['LABELS']:
                    center_x = .5*(obj['xmin'] + obj['xmax']) / cell_w
                    center_y = .5*(obj['ymin'] + obj['ymax']) / cell_h

                    grid_x = int(np.floor(center_x))
                    grid_y = int(np.floor(center_y))

                    if grid_x < cfg['GRID_W'] and grid_y < cfg['GRID_H']:
                        obj_indx = cfg['LABELS'].index(obj['name'])

                        center_w = (obj['xmax'] - obj['xmin']) / cell_w  # unit: grid cell
                        center_h = (obj['ymax'] - obj['ymin']) / cell_h  # unit: grid cell

                        box = [center_x, center_y, center_w, center_h]

                        # find the anchor that best predicts this box;
                        # both are placed at the origin so only their
                        # sizes are compared
                        best_anchor = -1
                        max_iou = -1

                        shifted_box = BoundBox(0,
                                               0,
                                               center_w,
                                               center_h)

                        for i, anchor in enumerate(self.anchors):
                            iou = bbox_iou(shifted_box, anchor)

                            if max_iou < iou:
                                best_anchor = i
                                max_iou = iou

                        # assign ground truth x, y, w, h, confidence and class probs to y_batch
                        y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box
                        y_batch[instance_count, grid_y, grid_x, best_anchor, 4] = 1.
                        y_batch[instance_count, grid_y, grid_x, best_anchor, 5+obj_indx] = 1

                        # assign the true box to b_batch
                        b_batch[instance_count, 0, 0, 0, true_box_index] = box

                        # Wrap around: once the buffer is full, the oldest
                        # slots are overwritten.
                        true_box_index += 1
                        true_box_index = true_box_index % cfg['TRUE_BOX_BUFFER']

            # assign input image to x_batch
            if self.norm is not None:
                x_batch[instance_count] = self.norm(img)
            else:
                # plot image and bounding boxes for sanity check
                for obj in all_objs:
                    if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
                        cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3)
                        cv2.putText(img[:,:,::-1], obj['name'],
                                    (obj['xmin']+2, obj['ymin']+12),
                                    0, 1.2e-3 * img.shape[0],
                                    (0,255,0), 2)

                x_batch[instance_count] = img

            # increase instance counter in current batch
            instance_count += 1

        return [x_batch, b_batch], y_batch

    def on_epoch_end(self):
        """Reshuffle the image list in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.images)

    def aug_image(self, train_instance, jitter):
        """Load one image, optionally augment it, and rescale its boxes.

        Args:
            train_instance: Image record with ``'filename'`` and ``'object'``.
            jitter: When true, apply a random up-scale (factor in [1, 1.1)),
                a random crop back to the original size, a random horizontal
                flip and the imgaug pipeline.

        Returns:
            ``(image, all_objs)``: the image resized to
            ``IMAGE_W`` x ``IMAGE_H`` with its channel order reversed, and a
            deep copy of the objects with coordinates mapped into that
            resized image and clamped to its bounds.

        Raises:
            IOError: If ``cv2.imread`` could not load the file.
        """
        image_name = train_instance['filename']
        image = cv2.imread(image_name)

        if image is None:
            # cv2.imread returns None instead of raising; fail here with the
            # file name rather than later on ``image.shape``.
            raise IOError('Cannot find ' + str(image_name))

        h, w = image.shape[:2]
        # Work on a copy so the stored annotations are never modified.
        all_objs = copy.deepcopy(train_instance['object'])

        if jitter:
            ### scale the image
            scale = np.random.uniform() / 10. + 1.
            image = cv2.resize(image, (0,0), fx = scale, fy = scale)

            ### translate the image
            max_offx = (scale-1.) * w
            max_offy = (scale-1.) * h
            offx = int(np.random.uniform() * max_offx)
            offy = int(np.random.uniform() * max_offy)

            # Crop a window of the original size out of the enlarged image.
            image = image[offy : (offy + h), offx : (offx + w)]

            ### flip the image
            flip = np.random.binomial(1, .5)
            if flip > 0.5:
                image = cv2.flip(image, 1)

            image = self.aug_pipe.augment_image(image)

        # resize the image to standard size; cv2.resize takes dsize as
        # (width, height)
        image = cv2.resize(image, (self.config['IMAGE_W'], self.config['IMAGE_H']))
        # reverse the channel axis (cv2.imread loads BGR, so this gives RGB)
        image = image[:,:,::-1]

        # fix object's position and size
        for obj in all_objs:
            for attr in ['xmin', 'xmax']:
                if jitter:
                    obj[attr] = int(obj[attr] * scale - offx)

                obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)
                obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)

            for attr in ['ymin', 'ymax']:
                if jitter:
                    obj[attr] = int(obj[attr] * scale - offy)

                obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)
                obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)

            if jitter and flip > 0.5:
                # Mirror the box horizontally; min and max swap roles.
                xmin = obj['xmin']
                obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']
                obj['xmax'] = self.config['IMAGE_W'] - xmin

        return image, all_objs
0 commit comments