Commit d081b58

modifications

(1) add the letterbox resize method (2) kmeans script updated (3) annotation format changed (4) add gradient clipping

1 parent: e7a314e

11 files changed: +157 −83 lines
README.md (+11 −11)

````diff
@@ -86,13 +86,13 @@ For better understanding of the model architecture, you can refer to the following
 
 (1) annotation file
 
-Generate `train.txt/val.txt/test.txt` files under `./data/my_data/` directory. One line for one image, in the format like `image_index image_absolute_path box_1 box_2 ... box_n`. Box_x format: `label_index x_min y_min x_max y_max`. (The origin of coordinates is at the left top corner, left top => (xmin, ymin), right bottom => (xmax, ymax).) `image_index` is the line index which starts from zero. `label_index` is in range [0, class_num - 1].
+Generate `train.txt/val.txt/test.txt` files under `./data/my_data/` directory. One line for one image, in the format like `image_index image_absolute_path img_width img_height box_1 box_2 ... box_n`. Box_x format: `label_index x_min y_min x_max y_max`. (The origin of coordinates is at the left top corner, left top => (xmin, ymin), right bottom => (xmax, ymax).) `image_index` is the line index which starts from zero. `label_index` is in range [0, class_num - 1].
 
 For example:
 
 ```
-0 xxx/xxx/a.jpg 0 453 369 473 391 1 588 245 608 268
-1 xxx/xxx/b.jpg 1 466 403 485 422 2 793 300 809 320
+0 xxx/xxx/a.jpg 300 400 0 453 369 473 391 1 588 245 608 268
+1 xxx/xxx/b.jpg 300 400 1 466 403 485 422 2 793 300 809 320
 ...
 ```
 
````
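The new format prepends `img_width img_height` after the image path. To make the layout concrete, here is a minimal parsing sketch; `parse_annotation_line` is a hypothetical helper, not a function from this commit:

```python
# Hypothetical helper: parse one line of the new annotation format
# `image_index image_absolute_path img_width img_height box_1 ... box_n`,
# where each box is `label_index x_min y_min x_max y_max`.
def parse_annotation_line(line):
    s = line.strip().split(' ')
    img_index, img_path = int(s[0]), s[1]
    img_width, img_height = int(s[2]), int(s[3])
    boxes = []
    for i in range(4, len(s), 5):
        label_index = int(s[i])
        x_min, y_min, x_max, y_max = map(float, s[i + 1:i + 5])
        boxes.append((label_index, x_min, y_min, x_max, y_max))
    return img_index, img_path, (img_width, img_height), boxes

# parse_annotation_line('0 xxx/xxx/a.jpg 300 400 0 453 369 473 391')
# -> (0, 'xxx/xxx/a.jpg', (300, 400), [(0, 453.0, 369.0, 473.0, 391.0)])
```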

@@ -123,7 +123,7 @@ Then you will get 9 anchors and the average IoU. Save the anchors to a txt file.
123123

124124
The COCO dataset anchors offered by YOLO's author is placed at `./data/yolo_anchors.txt`, you can use that one too.
125125

126-
**NOTE: The yolo anchors computed by the kmeans script is on the original image scale. You may need to resize the anchors to your target training image size before training and write them to the anchors txt file. Then you should not modify the anchors later.**
126+
The yolo anchors computed by the kmeans script is on the resized image scale. The default resize method is the letterbox resize, i.e., keep the original aspect ratio in the resized image.
127127

128128
#### 7.2 Training
129129

@@ -164,19 +164,19 @@ For higher mAP, you should set score_threshold to a small number.
164164
165165
Here are some training tricks in my experiment:
166166
167-
(1) Apply the two-stage training strategy:
167+
(1) Apply the two-stage training strategy or the one-stage training strategy:
168+
169+
Two-stage training:
168170
169171
First stage: Restore `darknet53_body` part weights from COCO checkpoints, train the `yolov3_head` with big learning rate like 1e-3 until the loss reaches to a low level.
170172
171173
Second stage: Restore the weights from the first stage, then train the whole model with small learning rate like 1e-4 or smaller. At this stage remember to restore the optimizer parameters if you use optimizers like adam.
172174
173-
Or just restore the whole weight file except the last three convolution layers.
174-
175-
(2) Quick train:
175+
One-stage training:
176176
177-
If you want to obtain acceptable results in a short time like in 10 minutes. You can use the coco names but substitute several with real class names in your dataset. In this way you restore the whole pretrained COCO model and get a 80 class classification model, but you only care about the class names from your dataset.
177+
Just restore the whole weight file except the last three convolution layers (Conv_6, Conv_14, Conv_22). In this condition, be careful about the possible nan loss value.
178178
179-
(3) I've included many useful training strategies in `args.py`:
179+
(2) I've included many useful training strategies in `args.py`:
180180
181181
- Cosine decay of lr (SGDR)
182182
- Multi-scale training
@@ -188,7 +188,7 @@ These are all good strategies but it does **not** mean they will definitely impr
188188
189189
This [paper](https://arxiv.org/abs/1902.04103) from gluon-cv has proved that data augmentation is critical to YOLO v3, which is completely in consistent with my own experiments. Some data augmentation strategies that seems reasonable may lead to poor performance. For example, after introducing random color jittering, the mAP on my own dataset drops heavily. Thus I hope you pay extra attention to the data augmentation.
190190
191-
(4) Loss nan? Setting a bigger warm_up_epoch number or less learning rate and try several more times. If you fine-tune the whole model, using adam may cause nan value sometimes. You can try choosing momentum optimizer.
191+
(4) Loss nan? Setting a bigger warm_up_epoch number or smaller learning rate and try several more times. If you fine-tune the whole model, using adam may cause nan value sometimes. You can try choosing momentum optimizer.
192192
193193
### 10. TODO
194194
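For the one-stage strategy above, restoring everything except the three detection-head convolutions might look like the sketch below. The keywords `Conv_6`, `Conv_14`, `Conv_22` come from the README text; treating them as substrings of the checkpoint variable names is an assumption, so verify against the actual variable names first:

```python
import tensorflow as tf

# Sketch (assumed variable naming): skip the three detection convolutions,
# whose output channels depend on class_num and cannot be reused across
# datasets with a different number of classes.
exclude_keywords = ['Conv_6', 'Conv_14', 'Conv_22']
restore_vars = [v for v in tf.global_variables()
                if not any(k in v.name for k in exclude_keywords)]
saver_to_restore = tf.train.Saver(var_list=restore_vars)
# later, inside a session:
# saver_to_restore.restore(sess, './data/darknet_weights/yolov3.ckpt')
```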

args.py (+5 −3)

```diff
@@ -19,9 +19,10 @@
 ### Training related numbers
 batch_size = 20
 img_size = [416, 416]  # Images will be resized to `img_size` and fed to the network, size format: [width, height]
+letterbox_resize = False  # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
 total_epoches = 200
 train_evaluation_step = 100  # Evaluate on the training batch after some steps.
-val_evaluation_epoch = 1  # Evaluate on the whole validation dataset after some steps. Set to None to evaluate every epoch.
+val_evaluation_epoch = 1  # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch.
 save_epoch = 10  # Save the model after some epochs.
 batch_norm_decay = 0.99  # decay in bn ops
 weight_decay = 5e-4  # l2 weight decay
@@ -32,10 +33,10 @@
 prefetech_buffer = 5  # Prefetech_buffer used in tf.data pipeline.
 
 ### Learning rate and optimizer
-optimizer_name = 'adam'  # Chosen from [sgd, momentum, adam, rmsprop]
+optimizer_name = 'momentum'  # Chosen from [sgd, momentum, adam, rmsprop]
 save_optimizer = True  # Whether to save the optimizer parameters into the checkpoint file.
 learning_rate_init = 1e-3
-lr_type = 'exponential'  # Chosen from [fixed, exponential, cosine_decay, cosine_decay_restart, piecewise]
+lr_type = 'piecewise'  # Chosen from [fixed, exponential, cosine_decay, cosine_decay_restart, piecewise]
 lr_decay_epoch = 5  # Epochs after which learning rate decays. Int or float. Used when chosen `exponential` and `cosine_decay_restart` lr_type.
 lr_decay_factor = 0.96  # The learning rate decay factor. Used when chosen `exponential` lr_type.
 lr_lower_bound = 1e-6  # The minimum learning rate.
@@ -73,6 +74,7 @@
 nms_topk = 150  # keep at most nms_topk outputs after nms
 # mAP eval
 eval_threshold = 0.5  # the iou threshold applied in mAP evaluation
+use_voc_07_metric = True  # whether to use voc 2007 evaluation metric, i.e. the 11-point metric
 
 ### parse some params
 anchors = parse_anchors(anchor_path)
```
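For context on the new defaults: `lr_type = 'piecewise'` usually means a piecewise-constant schedule, which in TF1 can be built as below. The boundaries and values here are illustrative only, not the settings from `args.py`:

```python
import tensorflow as tf

global_step = tf.Variable(0, trainable=False, name='global_step')
boundaries = [40000, 60000]      # illustrative: global steps at which the lr drops
lr_values = [1e-3, 3e-4, 1e-4]   # illustrative: lr used inside each interval
learning_rate = tf.train.piecewise_constant(global_step, boundaries, lr_values)
optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
```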

eval.py (+10 −4)

```diff
@@ -22,7 +22,7 @@
 parser.add_argument("--eval_file", type=str, default="./data/my_data/val.txt",
                     help="The path of the validation or test txt file.")
 
-parser.add_argument("--restore_path", type=str, default="/home/user/Documents/chenyang_projects/yolo_v3_voc/YOLOv3_TensorFlow_old/No_data_aug_bn_0.9/best_model_Epoch_52_step_43200.0_mAP_0.6752_loss_20.220579_lr_0.0005882013",
+parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt",
                     help="The path of the weights to restore.")
 
 parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt",
@@ -35,6 +35,9 @@
 parser.add_argument("--img_size", nargs='*', type=int, default=[416, 416],
                     help="Resize the input image to `img_size`, size format: [width, height]")
 
+parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=False,
+                    help="Whether to use the letterbox resize.")
+
 parser.add_argument("--num_threads", type=int, default=10,
                     help="Number of threads for image processing used in tf.data pipeline.")
 
@@ -50,6 +53,9 @@
 parser.add_argument("--nms_topk", type=int, default=400,
                     help="Keep at most nms_topk outputs after nms.")
 
+parser.add_argument("--use_voc_07_metric", type=lambda x: (str(x).lower() == 'true'), default=True,
+                    help="Whether to use the voc 2007 mAP metric.")
+
 args = parser.parse_args()
 
 # args params
@@ -71,7 +77,7 @@
 val_dataset = tf.data.TextLineDataset(args.eval_file)
 val_dataset = val_dataset.batch(1)
 val_dataset = val_dataset.map(
-    lambda x: tf.py_func(get_batch_data, [x, args.class_num, args.img_size, args.anchors, 'val'], [tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
+    lambda x: tf.py_func(get_batch_data, [x, args.class_num, args.img_size, args.anchors, 'val', False, False, args.letterbox_resize], [tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
     num_parallel_calls=args.num_threads
 )
 val_dataset.prefetch(args.prefetech_buffer)
@@ -117,10 +123,10 @@
         val_loss_class.update(__loss[4])
 
     rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()
-    gt_dict = parse_gt_rec(args.eval_file, args.img_size)
+    gt_dict = parse_gt_rec(args.eval_file, args.img_size, args.letterbox_resize)
     print('mAP eval:')
     for ii in range(args.class_num):
-        npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=0.5, use_07_metric=False)
+        npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=0.5, use_07_metric=args.use_voc_07_metric)
         rec_total.update(rec, npos)
         prec_total.update(prec, nd)
         ap_total.update(ap, 1)
```
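The new `use_voc_07_metric` flag switches `voc_eval` to the PASCAL VOC 2007 11-point interpolated AP. For reference, the metric itself reduces to the following (a standalone sketch, independent of the repo's `voc_eval` implementation):

```python
import numpy as np

def voc_07_ap(rec, prec):
    # 11-point metric from the VOC 2007 devkit: average, over the recall
    # thresholds 0.0, 0.1, ..., 1.0, the maximum precision achieved at
    # recall >= threshold (0 if that recall level is never reached).
    ap = 0.
    for t in np.arange(0., 1.1, 0.1):
        p = np.max(prec[rec >= t]) if np.any(rec >= t) else 0.
        ap += p / 11.
    return ap
```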

get_kmeans.py (+22 −6)

```diff
@@ -23,7 +23,8 @@ def iou(box, clusters):
     box_area = box[0] * box[1]
     cluster_area = clusters[:, 0] * clusters[:, 1]
 
-    iou_ = intersection / (box_area + cluster_area - intersection + 1e-10)
+    iou_ = np.true_divide(intersection, box_area + cluster_area - intersection + 1e-10)
+    # iou_ = intersection / (box_area + cluster_area - intersection + 1e-10)
 
     return iou_
 
@@ -92,20 +93,31 @@ def kmeans(boxes, k, dist=np.median):
     return clusters
 
 
-def parse_anno(annotation_path):
+def parse_anno(annotation_path, target_size=None):
     anno = open(annotation_path, 'r')
     result = []
     for line in anno:
         s = line.strip().split(' ')
-        s = s[2:]
+        img_w = int(s[2])
+        img_h = int(s[3])
+        s = s[4:]
         box_cnt = len(s) // 5
         for i in range(box_cnt):
             x_min, y_min, x_max, y_max = float(s[i*5+1]), float(s[i*5+2]), float(s[i*5+3]), float(s[i*5+4])
             width = x_max - x_min
             height = y_max - y_min
             assert width > 0
             assert height > 0
-            result.append([width, height])
+            # use letterbox resize, i.e. keep the original aspect ratio
+            # get k-means anchors on the resized target image size
+            if target_size is not None:
+                resize_ratio = min(target_size[0] / img_w, target_size[1] / img_h)
+                width *= resize_ratio
+                height *= resize_ratio
+                result.append([width, height])
+            # get k-means anchors on the original image size
+            else:
+                result.append([width, height])
     result = np.asarray(result)
     return result
 
@@ -123,8 +135,12 @@ def get_kmeans(anno, cluster_num=9):
 
 
 if __name__ == '__main__':
-    annotation_path = "./data/my_data/train.txt"
-    anno_result = parse_anno(annotation_path)
+    # target_size format: [width, height]
+    # if target_size is specified, the anchors are on the resized image scale
+    # if target_size is set to None, the anchors are on the original image scale
+    target_size = [416, 416]
+    annotation_path = "train.txt"
+    anno_result = parse_anno(annotation_path, target_size=target_size)
     anchors, ave_iou = get_kmeans(anno_result, 9)
 
     anchor_string = ''
```
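With letterbox resizing, every box in an image is scaled by the single ratio `min(target_w / img_w, target_h / img_h)`, so the clustered anchors land on the network input scale. A quick numeric check with made-up values:

```python
# A 300x400 (width x height) image letterboxed into 416x416:
img_w, img_h = 300, 400
target_w, target_h = 416, 416
resize_ratio = min(target_w / img_w, target_h / img_h)  # min(1.3867, 1.04) = 1.04

# so a 20x22 ground-truth box contributes a 20.8x22.88 sample to k-means
print(20 * resize_ratio, 22 * resize_ratio)
```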

model.py (+2 −0)

```diff
@@ -215,6 +215,8 @@ def loss_layer(self, feature_map_i, y_true, anchors):
         # [N, 13, 13, 3, 1]
         object_mask = y_true[..., 4:5]
 
+        # the calculation of the ignore mask is referred from
+        # https://github.com/pjreddie/darknet/blob/master/src/yolo_layer.c#L179
         ignore_mask = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
         def loop_cond(idx, ignore_mask):
             return tf.less(idx, tf.cast(N, tf.int32))
```
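For readers unfamiliar with the referenced darknet code: a predicted box whose best IoU against any ground-truth box exceeds a threshold is excluded from the no-object confidence loss. A NumPy sketch of that idea follows; the repo computes it per image inside a `tf.while_loop`, and the 0.5 threshold and corner-format boxes here are illustrative assumptions:

```python
import numpy as np

def iou_matrix(a, b):
    # a: [N, 4], b: [M, 4], boxes given as (x_min, y_min, x_max, y_max)
    tl = np.maximum(a[:, None, :2], b[None, :, :2])  # intersection top-left
    br = np.minimum(a[:, None, 2:], b[None, :, 2:])  # intersection bottom-right
    inter = np.prod(np.clip(br - tl, 0, None), axis=2)
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    return inter / (area_a[:, None] + area_b[None, :] - inter + 1e-10)

def ignore_mask(pred_boxes, gt_boxes, thresh=0.5):
    # 1.0 where a prediction overlaps no ground truth strongly enough,
    # i.e. where the no-object confidence loss still applies
    best_iou = iou_matrix(pred_boxes, gt_boxes).max(axis=1)
    return (best_iou < thresh).astype(np.float32)
```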

test_single_image.py (+14 −6)

```diff
@@ -10,6 +10,7 @@
 from utils.misc_utils import parse_anchors, read_class_names
 from utils.nms_utils import gpu_nms
 from utils.plot_utils import get_color_table, plot_one_box
+from utils.data_aug import letterbox_resize
 
 from model import yolov3
 
@@ -20,6 +21,8 @@
                     help="The path of the anchor txt file.")
 parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416],
                     help="Resize the input image with `new_size`, size format: [width, height]")
+parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=False,
+                    help="Whether to use the letterbox resize.")
 parser.add_argument("--class_name_path", type=str, default="./data/coco.names",
                     help="The path of the class names.")
 parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt",
@@ -33,8 +36,11 @@
 color_table = get_color_table(args.num_class)
 
 img_ori = cv2.imread(args.input_image)
-height_ori, width_ori = img_ori.shape[:2]
-img = cv2.resize(img_ori, tuple(args.new_size))
+if args.letterbox_resize:
+    img, resize_ratio, dw, dh = letterbox_resize(img_ori, args.new_size[0], args.new_size[1])
+else:
+    height_ori, width_ori = img_ori.shape[:2]
+    img = cv2.resize(img_ori, tuple(args.new_size))
 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 img = np.asarray(img, np.float32)
 img = img[np.newaxis, :] / 255.
@@ -56,10 +62,12 @@
     boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})
 
     # rescale the coordinates to the original image
-    boxes_[:, 0] *= (width_ori/float(args.new_size[0]))
-    boxes_[:, 2] *= (width_ori/float(args.new_size[0]))
-    boxes_[:, 1] *= (height_ori/float(args.new_size[1]))
-    boxes_[:, 3] *= (height_ori/float(args.new_size[1]))
+    if args.letterbox_resize:
+        boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio
+        boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio
+    else:
+        boxes_[:, [0, 2]] *= (width_ori/float(args.new_size[0]))
+        boxes_[:, [1, 3]] *= (height_ori/float(args.new_size[1]))
 
     print("box coords:")
     print(boxes_)
```
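The letterbox branch simply inverts the forward transform: subtract the padding offsets, then divide by the scale. A numeric check under assumed values:

```python
# Forward letterbox geometry for a 300x400 (width x height) image into 416x416:
resize_ratio = min(416 / 300, 416 / 400)               # 1.04
resize_w, resize_h = int(1.04 * 300), int(1.04 * 400)  # 312, 416
dw, dh = (416 - 312) // 2, (416 - 416) // 2            # 52, 0

# A predicted x-coordinate of 260 on the 416x416 canvas maps back to:
x_original = (260 - dw) / resize_ratio                 # (260 - 52) / 1.04 = 200.0
```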

train.py (+11 −6)

```diff
@@ -36,7 +36,7 @@
 train_dataset = train_dataset.batch(args.batch_size)
 train_dataset = train_dataset.map(
     lambda x: tf.py_func(get_batch_data,
-                         inp=[x, args.class_num, args.img_size, args.anchors, 'train', args.multi_scale_train, args.use_mix_up],
+                         inp=[x, args.class_num, args.img_size, args.anchors, 'train', args.multi_scale_train, args.use_mix_up, args.letterbox_resize],
                          Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
     num_parallel_calls=args.num_threads
 )
@@ -46,7 +46,7 @@
 val_dataset = val_dataset.batch(1)
 val_dataset = val_dataset.map(
     lambda x: tf.py_func(get_batch_data,
-                         inp=[x, args.class_num, args.img_size, args.anchors, 'val', False, False],
+                         inp=[x, args.class_num, args.img_size, args.anchors, 'val', False, False, args.letterbox_resize],
                          Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
     num_parallel_calls=args.num_threads
 )
@@ -107,7 +107,12 @@
 # set dependencies for BN ops
 update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
 with tf.control_dependencies(update_ops):
-    train_op = optimizer.minimize(loss[0] + l2_loss, var_list=update_vars, global_step=global_step)
+    # train_op = optimizer.minimize(loss[0] + l2_loss, var_list=update_vars, global_step=global_step)
+    # apply gradient clipping to avoid gradient explosion
+    gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars)
+    clip_grad_var = [gv if gv[0] is None else [
+        tf.clip_by_norm(gv[0], 50.), gv[1]] for gv in gvs]
+    train_op = optimizer.apply_gradients(clip_grad_var, global_step=global_step)
 
 if args.save_optimizer:
     print('Saving optimizer parameters to checkpoint! Remember to restore the global_step in the fine-tuning afterwards.')
@@ -166,7 +171,7 @@
             saver_to_save.save(sess, args.save_dir + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format(epoch, int(__global_step), loss_total.average, __lr))
 
         # switch to validation dataset for evaluation
-        if epoch % args.val_evaluation_epoch == 0 and epoch > 0:
+        if epoch % args.val_evaluation_epoch == 0 and epoch >= args.warm_up_epoch:
             sess.run(val_init_op)
 
             val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \
@@ -187,12 +192,12 @@
 
             # calc mAP
             rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()
-            gt_dict = parse_gt_rec(args.val_file, args.img_size)
+            gt_dict = parse_gt_rec(args.val_file, args.img_size, args.letterbox_resize)
 
             info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr)
 
             for ii in range(args.class_num):
-                npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=args.eval_threshold, use_07_metric=False)
+                npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=args.eval_threshold, use_07_metric=args.use_voc_07_metric)
                 info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\n'.format(ii, rec, prec, ap)
                 rec_total.update(rec, npos)
                 prec_total.update(prec, nd)
```
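The commit clips each gradient tensor to norm 50 with `tf.clip_by_norm`. An alternative worth knowing is global-norm clipping, which rescales all gradients jointly and preserves their relative magnitudes; the sketch below reuses the names `loss`, `l2_loss`, `update_vars`, `optimizer`, and `global_step` from `train.py` and is not what the commit does:

```python
import tensorflow as tf

# Sketch: global-norm clipping instead of per-tensor tf.clip_by_norm.
gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars)
grads, variables = zip(*gvs)
clipped_grads, _ = tf.clip_by_global_norm(grads, 50.)  # None entries pass through
train_op = optimizer.apply_gradients(zip(clipped_grads, variables),
                                     global_step=global_step)
```

Either form addresses the exploding-gradient / nan-loss issue the README mentions.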

utils/data_aug.py (+24 −18)

```diff
@@ -271,24 +271,35 @@ def random_brightness(img, brightness_delta, p=0.5):
     return img
 
 
-def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False):
+def letterbox_resize(img, new_width, new_height, interp=0):
     '''
-    Resize the image and correct the bbox accordingly.
+    Letterbox resize. Keep the original aspect ratio in the resized image.
     '''
     ori_height, ori_width = img.shape[:2]
 
-    if letterbox:
-        resize_ratio = min(new_width / ori_width, new_height / ori_height)
-        resize_w = int(resize_ratio * ori_width)
-        resize_h = int(resize_ratio * ori_height)
-        img = cv2.resize(img, (resize_w, resize_h), interpolation=interp)
+    resize_ratio = min(new_width / ori_width, new_height / ori_height)
+
+    resize_w = int(resize_ratio * ori_width)
+    resize_h = int(resize_ratio * ori_height)
+
+    img = cv2.resize(img, (resize_w, resize_h), interpolation=interp)
+    image_padded = np.full((new_height, new_width, 3), 128, np.uint8)
+
+    dw = int((new_width - resize_w) / 2)
+    dh = int((new_height - resize_h) / 2)
 
-        image_padded = np.full((new_height, new_width, 3), 128, np.uint8)
+    image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img
 
-        dw = int((new_width - resize_w) / 2)
-        dh = int((new_height - resize_h) / 2)
+    return image_padded, resize_ratio, dw, dh
 
-        image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img
+
+def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False):
+    '''
+    Resize the image and correct the bbox accordingly.
+    '''
+
+    if letterbox:
+        image_padded, resize_ratio, dw, dh = letterbox_resize(img, new_width, new_height, interp)
 
         # xmin, xmax
         bbox[:, [0, 2]] = bbox[:, [0, 2]] * resize_ratio + dw
@@ -297,6 +308,8 @@ def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False):
 
         return image_padded, bbox
     else:
+        ori_height, ori_width = img.shape[:2]
+
         img = cv2.resize(img, (new_width, new_height), interpolation=interp)
 
         # xmin, xmax
@@ -365,10 +378,3 @@ def random_expand(img, bbox, max_ratio=4, fill=0, keep_ratio=True):
         bbox[:, 2:4] += (off_x, off_y)
 
     return dst, bbox
-
-
-
-
-
-
-
```
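A short usage sketch of the refactored pair; the image path and box values are placeholders:

```python
import cv2
import numpy as np
from utils.data_aug import letterbox_resize, resize_with_bbox

img = cv2.imread('xxx/xxx/a.jpg')  # placeholder path
bbox = np.asarray([[453., 369., 473., 391.]])

# letterbox_resize alone returns the padded image plus the transform params
img_padded, resize_ratio, dw, dh = letterbox_resize(img, 416, 416)
assert img_padded.shape == (416, 416, 3)

# resize_with_bbox(letterbox=True) reuses it and shifts the boxes accordingly
img_padded, bbox = resize_with_bbox(img, bbox, 416, 416, interp=0, letterbox=True)
```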
