diff --git a/DisplaceNet.png b/DisplaceNet.png
new file mode 100644
index 0000000..7b59027
Binary files /dev/null and b/DisplaceNet.png differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7711cd9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+# Recognising Displaced People from Images by Exploiting Dominance Level
+
+
+
+
+
+
+**[Grigorios Kalliatakis](https://scholar.google.com/citations?user=LMY5lhwAAAAJ&hl=en&oi=ao)
+ [Shoaib Ehsan](https://scholar.google.com/citations?user=40KlWugAAAAJ&hl=en)
+ [Maria Fasli](https://scholar.google.com/citations?user=Hg2osmAAAAAJ&hl=en)
+ [Klaus McDonald-Maier](https://scholar.google.com/citations?user=xYARJTQAAAAJ&hl=en)**
+
+**To appear in the 1st CVPR Workshop on [COMPUTER VISION FOR GLOBAL CHALLENGES (CV4GC)](https://www.cv4gc.org/)**
+
+**[[arXiv preprint]]()**
+
+
+
+### Citing DisplaceNet
+
+Please cite our paper in your publications if it helps your research:
+
+ @inproceedings{
+ }
\ No newline at end of file
diff --git a/abusenet_evaluator_mini.py b/abusenet_evaluator_mini.py
new file mode 100644
index 0000000..63cc5f1
--- /dev/null
+++ b/abusenet_evaluator_mini.py
@@ -0,0 +1,141 @@
+from __future__ import print_function
+import os
+
+
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix, average_precision_score
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+class AbuseNetBaseEvaluator(object):
+ """Perfofmance metrics base class.
+ """
+
+
+ def __init__(self,
+ hra_model_backend_name,nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name,emotic_model_b_backend_name,emotic_model_c_backend_name,
+ violation_class,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ ):
+
+ self.hra_model_backend_name = hra_model_backend_name
+ self.nb_of_conv_layers_to_fine_tune = nb_of_conv_layers_to_fine_tune
+ self.emotic_model_a_backend_name = emotic_model_a_backend_name
+ self.emotic_model_b_backend_name = emotic_model_b_backend_name
+ self.emotic_model_c_backend_name = emotic_model_c_backend_name
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
+ self.y_true = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,]
+
+
+
+ def _obtain_y_pred(self,
+ prob_threshold=0.75):
+
+ y_pred = []
+ y_scores = []
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ print('Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+
+ preds = displaceNet_inference(img_path=final_img,
+ emotic_model_a_backend_name=self.emotic_model_a_backend_name,
+ emotic_model_b_backend_name=self.emotic_model_b_backend_name,
+ emotic_model_c_backend_name=self.emotic_model_c_backend_name,
+ hra_model_backend_name=self.hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers=self.nb_of_conv_layers_to_fine_tune,
+ violation_class=self.violation_class)
+
+
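+                # `preds` is assumed to be a list whose first element holds
+                # (class_index, class_label, probability) tuples, sorted by descending probability.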
+ preds = preds[0]
+
+ y_pred.append(int(preds[0][0]))
+ y_scores.append(preds[0][2])
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ print ('`' + hra_class + '/' + raw_img + '` ===> `' +
+ top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(hra_class)
+
+ total_coverage_per = (coverage_count * 100) / self.total_nb_of_test_images
+
+ return y_pred, self.y_true, y_scores, total_coverage_per
+
+
+
+if __name__ == "__main__":
+
+ violation_class = 'cl'
+ hra_model_backend_name = 'VGG16'
+ nb_of_conv_layers_to_fine_tune = 1
+
+ emotic_model_a_backend_name = 'VGG19'
+ emotic_model_b_backend_name = 'VGG16'
+ emotic_model_c_backend_name = None
+
+ model_backend_name = 'VGG16'
+
+ # server
+ # if violation_class == 'cl':
+ # main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test/ChildLabour'
+ # elif violation_class =='dp':
+ # main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test/DisplacedPopulations'
+
+ if violation_class == 'cl':
+ main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test-mini/cl-1'
+ elif violation_class =='dp':
+ main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test-mini/DisplacedPopulations'
+
+ # ---------------------------------------------------- #
+
+
+
+
+
+ base_evaluator = AbuseNetBaseEvaluator(hra_model_backend_name=hra_model_backend_name,nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name=emotic_model_a_backend_name,
+ emotic_model_b_backend_name=emotic_model_b_backend_name,
+ emotic_model_c_backend_name=emotic_model_c_backend_name,
+ violation_class=violation_class,
+ main_test_dir =main_test_dir,
+ )
+
+ y_pred, y_true, y_scores, total_coverage_per = base_evaluator._obtain_y_pred()
+
+ # print y_true
+ top1_acc = accuracy_score(y_true, y_pred)
+
+    AP = average_precision_score(y_true, y_scores, average='micro')
+
+
+ string = model_backend_name+'-'+violation_class+'-'+str(nb_of_conv_layers_to_fine_tune)+'layer(s)'
+
+ print('\n')
+ print( '============================= %s =============================' %string)
+ print(' Top-1 acc. => ' + str(top1_acc))
+ print(' Coverage => ' + str(total_coverage_per) + '%')
+ print(' Average Precision (AP) => ' + str(AP) + '%')
\ No newline at end of file
diff --git a/abusenet_evaluator_v2.py b/abusenet_evaluator_v2.py
new file mode 100644
index 0000000..634ef03
--- /dev/null
+++ b/abusenet_evaluator_v2.py
@@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+'''Evaluates AbuseNet. The whole test set cannot fit into memory, so it is split into 5 folds of 10 images each.
+
+'''
+from __future__ import print_function
+import os
+import argparse
+import time
+from utils.generic_utils import hms_string
+
+
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix, average_precision_score
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+class AbuseNetBaseEvaluator(object):
+ """Perfofmance metrics base class.
+ """
+
+
+ def __init__(self,
+ hra_model_backend_name,nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name,emotic_model_b_backend_name,emotic_model_c_backend_name,
+ violation_class,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ ):
+
+ self.hra_model_backend_name = hra_model_backend_name
+ self.nb_of_conv_layers_to_fine_tune = nb_of_conv_layers_to_fine_tune
+ self.emotic_model_a_backend_name = emotic_model_a_backend_name
+ self.emotic_model_b_backend_name = emotic_model_b_backend_name
+ self.emotic_model_c_backend_name = emotic_model_c_backend_name
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
+ self.y_true = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+
+ def _obtain_y_pred(self,
+ prob_threshold=0.75):
+
+ y_pred = []
+ y_scores = []
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ print(' Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+
+ preds = displaceNet_inference(img_path=final_img,
+ emotic_model_a_backend_name=self.emotic_model_a_backend_name,
+ emotic_model_b_backend_name=self.emotic_model_b_backend_name,
+ emotic_model_c_backend_name=self.emotic_model_c_backend_name,
+ hra_model_backend_name=self.hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers=self.nb_of_conv_layers_to_fine_tune,
+ violation_class=self.violation_class)
+
+
+ preds = preds[0]
+
+ y_pred.append(int(preds[0][0]))
+ y_scores.append(preds[0][2])
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ # print ('`' + hra_class + '/' + raw_img + '` ===> `' +
+ # top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+ print(' GT `' + hra_class + '`' + ' <--> PRED. `' +
+ top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability))
+
+ print ('\n')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(hra_class)
+
+ total_coverage_per = (coverage_count * 100) / self.total_nb_of_test_images
+
+ return y_pred, self.y_true, y_scores, total_coverage_per
+
+
+
+if __name__ == "__main__":
+
+ def get_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--violation_class", type=str,
+ help='One of `cl` or `dp`')
+
+ parser.add_argument("--fold_number", type=int, default=None,
+ help="Number of selected subset to test (1-5)")
+
+ parser.add_argument("--hra_model_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`, `VGG16_Places365`')
+
+ parser.add_argument("--nb_of_conv_layers", type=int, default=None,
+ help="Number of fine-tuned conv. layers")
+
+ parser.add_argument("--emotic_model_a_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`')
+
+ parser.add_argument("--emotic_model_b_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`', default=None)
+
+ parser.add_argument("--emotic_model_c_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`', default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+ args = get_args()
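+    # Example invocation (a sketch; the dataset paths below and the fold layout are assumptions of this script):
+    #   python abusenet_evaluator_v2.py --violation_class dp --fold_number 1 \
+    #       --hra_model_backend_name VGG16 --nb_of_conv_layers 1 \
+    #       --emotic_model_a_backend_name VGG16 --emotic_model_b_backend_name VGG19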
+
+
+
+
+ # server
+ if args.violation_class == 'cl':
+ if args.fold_number == 1:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/cl-1'
+
+ elif args.fold_number == 2:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/cl-2'
+
+ elif args.fold_number == 3:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/cl-3'
+
+ elif args.fold_number == 4:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/cl-4'
+
+ elif args.fold_number == 5:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/cl-5'
+
+
+ elif args.violation_class =='dp':
+ if args.fold_number == 1:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/dp-1'
+
+ elif args.fold_number == 2:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/dp-2'
+
+ elif args.fold_number == 3:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/dp-3'
+
+ elif args.fold_number == 4:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/dp-4'
+
+ elif args.fold_number == 5:
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test-mini/dp-5'
+
+
+ # if violation_class == 'cl':
+ # main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test-mini/ChildLabour'
+ # elif violation_class =='dp':
+ # main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test-mini/DisplacedPopulations'
+
+ # ---------------------------------------------------- #
+
+
+
+
+
+ base_evaluator = AbuseNetBaseEvaluator(hra_model_backend_name=args.hra_model_backend_name,
+ nb_of_conv_layers_to_fine_tune=args.nb_of_conv_layers,
+ emotic_model_a_backend_name=args.emotic_model_a_backend_name,
+ emotic_model_b_backend_name=args.emotic_model_b_backend_name,
+ emotic_model_c_backend_name=args.emotic_model_c_backend_name,
+ violation_class=args.violation_class,
+ main_test_dir =main_test_dir,
+ )
+
+ start_time = time.time()
+
+ y_pred, y_true, y_scores, total_coverage_per = base_evaluator._obtain_y_pred()
+
+ end_time = time.time()
+ print("[INFO] It took {} to obtain AbuseNet predictions".format(hms_string(end_time - start_time)))
+
+ # print y_true
+ top1_acc = accuracy_score(y_true, y_pred)
+
+    AP = average_precision_score(y_true, y_scores, average='micro')
+
+
+ string = args.hra_model_backend_name+'-'+args.violation_class+'-'+str(args.nb_of_conv_layers)+'layer(s)-'+\
+ str(args.fold_number) +'fold-' +str(args.emotic_model_a_backend_name) \
+ +'-' +str(args.emotic_model_b_backend_name) +'-'+str(args.emotic_model_c_backend_name)
+
+ print('\n')
+ print( '============================= %s =============================' %string)
+ print(' Top-1 acc. => ' + str(top1_acc))
+ print(' Coverage => ' + str(total_coverage_per) + '%')
+ print(' Average Precision (AP) => ' + str(AP) + '%')
+
+ text_file = open("25_March_dp_dominance_VGG16_Places365__VGG16.txt", "a+")
+ text_file.write( '============================= %s =============================\n' %string)
+ text_file.write('Acc. => ' + str(top1_acc)+'\n')
+ text_file.write('Coverage => ' + str(total_coverage_per) + '%\n')
+ text_file.write('Average Precision (AP) => ' + str(AP) + '%\n')
+ text_file.write('\n')
+
+ text_file.close()
+
+ print("[INFO] Results for fold %s were successfully saved " %str(args.fold_number))
+
diff --git a/abusenet_evaluator_v3.py b/abusenet_evaluator_v3.py
new file mode 100644
index 0000000..81802ab
--- /dev/null
+++ b/abusenet_evaluator_v3.py
@@ -0,0 +1,163 @@
+from __future__ import print_function
+import os
+import argparse
+
+
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix, average_precision_score
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+class AbuseNetBaseEvaluator(object):
+ """Perfofmance metrics base class.
+ """
+
+
+ def __init__(self,
+ hra_model_backend_name,nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name,emotic_model_b_backend_name,emotic_model_c_backend_name,
+ violation_class,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ ):
+
+ self.hra_model_backend_name = hra_model_backend_name
+ self.nb_of_conv_layers_to_fine_tune = nb_of_conv_layers_to_fine_tune
+ self.emotic_model_a_backend_name = emotic_model_a_backend_name
+ self.emotic_model_b_backend_name = emotic_model_b_backend_name
+ self.emotic_model_c_backend_name = emotic_model_c_backend_name
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
+ self.y_true = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+
+ def _obtain_y_pred(self,
+ prob_threshold=0.75):
+
+ y_pred = []
+ y_scores = []
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ print(' Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+
+ preds = displaceNet_inference(img_path=final_img,
+ emotic_model_a_backend_name=self.emotic_model_a_backend_name,
+ emotic_model_b_backend_name=self.emotic_model_b_backend_name,
+ emotic_model_c_backend_name=self.emotic_model_c_backend_name,
+ hra_model_backend_name=self.hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers=self.nb_of_conv_layers_to_fine_tune,
+ violation_class=self.violation_class)
+
+
+ preds = preds[0]
+
+ y_pred.append(int(preds[0][0]))
+ y_scores.append(preds[0][2])
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ # print ('`' + hra_class + '/' + raw_img + '` ===> `' +
+ # top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+                print(' GT `' + hra_class + '`' + ' <--> PRED. `' +
+                      top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability))
+
+ print ('\n')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(hra_class)
+
+ total_coverage_per = (coverage_count * 100) / self.total_nb_of_test_images
+
+ return y_pred, self.y_true, y_scores, total_coverage_per
+
+
+
+if __name__ == "__main__":
+
+ def get_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--violation_class", type=str,
+ help='One of `cl` or `dp`')
+
+ parser.add_argument("--hra_model_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`, `VGG16_Places365`')
+
+ parser.add_argument("--nb_of_conv_layers", type=int, default=None,
+ help="Number of fine-tuned conv. layers")
+
+ parser.add_argument("--emotic_model_a_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`')
+
+ parser.add_argument("--emotic_model_b_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`', default=None)
+
+ parser.add_argument("--emotic_model_c_backend_name", type=str,
+ help='One of `VGG16`, `VGG19`, `ResNet50`', default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+ args = get_args()
+
+
+
+
+ # server
+ if args.violation_class == 'cl':
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test/ChildLabour'
+ elif args.violation_class =='dp':
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test/DisplacedPopulations'
+
+ # ---------------------------------------------------- #
+
+
+
+
+
+ base_evaluator = AbuseNetBaseEvaluator(hra_model_backend_name=args.hra_model_backend_name,
+ nb_of_conv_layers_to_fine_tune=args.nb_of_conv_layers,
+ emotic_model_a_backend_name=args.emotic_model_a_backend_name,
+ emotic_model_b_backend_name=args.emotic_model_b_backend_name,
+ emotic_model_c_backend_name=args.emotic_model_c_backend_name,
+ violation_class=args.violation_class,
+ main_test_dir =main_test_dir,
+ )
+
+ y_pred, y_true, y_scores, total_coverage_per = base_evaluator._obtain_y_pred()
+
+ # print y_true
+ top1_acc = accuracy_score(y_true, y_pred)
+
+    AP = average_precision_score(y_true, y_scores, average='micro')
+
+
+ string = args.hra_model_backend_name+'-'+args.violation_class+'-'+str(args.nb_of_conv_layers)+'layer(s)-'
+
+ print('\n')
+ print( '============================= %s =============================' %string)
+ print(' Top-1 acc. => ' + str(top1_acc))
+ print(' Coverage => ' + str(total_coverage_per) + '%')
+ print(' Average Precision (AP) => ' + str(AP) + '%')
\ No newline at end of file
diff --git a/applications/README.md b/applications/README.md
new file mode 100644
index 0000000..d53f4ac
--- /dev/null
+++ b/applications/README.md
@@ -0,0 +1,34 @@
+## Applications
+
+Applications is the _Keras-like-applications_ module of DisplaceNet.
+It provides model definitions and fine-tuned weights for a number of popular architectures, such as VGG16, VGG19, ResNet50 and VGG16-places365.
+
+
+
+### Usage
+
+All architectures are compatible with both TensorFlow and Theano, and upon instantiation the models will be built according to the
+image data format set in your Keras configuration file at `~/.keras/keras.json`.
+For instance, if you have set `image_data_format="channels_last"`, then any model loaded from this repository will be built according to
+the TensorFlow data format convention, "Width-Height-Depth".
+
+Pre-trained weights are loaded automatically upon instantiation (`weights='HRA'` in the model constructor for
+models trained on the Human Rights Archive two-class dataset, or `weights='emotic'` for models trained on the EMOTIC dataset).
+In both cases the weight files are downloaded automatically. The input size used was 224x224 for all models.
+
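+A minimal usage sketch (assuming the released weight files referenced by these modules are reachable; they are fetched on first use):
+
+```python
+from applications.hra_resnet50 import HRA_ResNet50
+from applications.emotic_resnet50 import EMOTIC_VAD_ResNet50
+
+# HRA two-class classifier, fine-tuned weights for the `child labour` split (1 fine-tuned conv. layer)
+hra_model = HRA_ResNet50(weights='HRA', violation_class='cl', nb_of_conv_layers_to_fine_tune=1)
+
+# EMOTIC VAD regressor (two 224x224 inputs: body crop and whole image)
+emotic_model = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+```
+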
+
+### Available fine-tuned models
+**Models for image classification with weights trained on HRA subset:**
+- [VGG16](https://github.com/GKalliatakis/AbuseNet/blob/master/applications/hra_vgg16.py)
+- [VGG19](https://github.com/GKalliatakis/AbuseNet/blob/master/applications/hra_vgg19.py)
+- [ResNet50](https://github.com/GKalliatakis/AbuseNet/blob/master/applications/hra_resnet50.py)
+- [VGG16-places365](https://github.com/GKalliatakis/AbuseNet/blob/master/applications/hra_vgg16_places365.py)
+
+
+**Models for continuous emotion recognition in Valence-Arousal-Dominance space with weights trained on EMOTIC:**
+- [VGG16](https://github.com/GKalliatakis/AbuseNet/blob/master/applications/emotic_vgg16__vgg16_places365.py)
+- [VGG19](https://github.com/GKalliatakis/AbuseNet/blob/master/applications/emotic_vgg19__vgg16_places365.py)
+- [ResNet50](https://github.com/GKalliatakis/AbuseNet/blob/master/applications/emotic_resnet50__vgg16_places365.py)
+
+
diff --git a/applications/__init__.py b/applications/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/applications/emotic_resnet50.py b/applications/emotic_resnet50.py
new file mode 100644
index 0000000..a6adb14
--- /dev/null
+++ b/applications/emotic_resnet50.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+'''Emotion Recognition in Context model for Keras
+
+# Reference:
+- [Emotion Recognition in Context](http://sunai.uoc.edu/emotic/pdf/EMOTIC_cvpr2017.pdf)
+'''
+
+from __future__ import division, print_function
+import os
+
+from keras.layers import Input
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.layers import GlobalAveragePooling2D
+from keras.utils.data_utils import get_file
+from keras import regularizers
+
+from keras.applications.resnet50 import ResNet50
+from keras.layers.merge import concatenate
+from applications.vgg16_places_365 import VGG16_Places365
+from keras.optimizers import SGD
+
+from utils.generic_utils import euclidean_distance_loss, rmse
+
+
+
+WEIGHTS_PATH = 'https://github.com/GKalliatakis/ubiquitous-assets/releases/download/v0.7.0/emotic_vad_ResNet50_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = ''
+
+
+def EMOTIC_VAD_ResNet50(include_top=True,
+ weights='emotic'):
+ """Instantiates the EMOTIC_VAD_ResNet50 architecture.
+
+ Optionally loads weights pre-trained
+ on EMOTIC. Note that when using TensorFlow,
+ for best performance you should set
+ `image_data_format="channels_last"` in your Keras config
+ at ~/.keras/keras.json.
+
+ The model and the weights are compatible with both
+ TensorFlow and Theano. The data format
+ convention used by the model is the one
+ specified in your Keras config file.
+
+ # Arguments
+        include_top: whether to include the fully-connected
+            regression head at the top of the network.
+        weights: one of `None` (random initialization),
+            'emotic' (pre-training on EMOTIC),
+            or the path to the weights file to be loaded.
+ # Returns
+ A Keras model instance.
+ # Raises
+ ValueError: in case of invalid argument for `weights`
+ """
+
+ if not (weights in {'emotic', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `emotic` '
+ '(pre-training on EMOTIC dataset), '
+ 'or the path to the weights file to be loaded.')
+
+ body_inputs = Input(shape=(224, 224, 3), name='INPUT')
+ image_inputs = Input(shape=(224, 224, 3), name='INPUT')
+
+ # Body module
+ tmp_model = ResNet50(include_top=False, weights='imagenet', input_tensor=body_inputs, pooling='avg')
+
+ body_truncated_model = Model(inputs=tmp_model.input, outputs=tmp_model.get_layer(index=169).output)
+
+ # body_truncated_model = Model(inputs=tmp_model.input, outputs=tmp_model.get_layer('activation_48').output)
+
+ for layer in body_truncated_model.layers:
+ layer.name = str("body-") + layer.name
+
+ # Image module
+ image_truncated_model = VGG16_Places365(include_top=False, weights='places', input_tensor=image_inputs,
+ pooling='avg')
+
+ for layer in image_truncated_model.layers:
+ layer.name = str("image-") + layer.name
+
+
+    # retrieve the outputs
+ body_plain_model_output = body_truncated_model.output
+ image_plain_model_output = image_truncated_model.output
+
+    # When ResNet50 is selected, a global average pooling layer is needed to follow the process used for the other CNNs.
+ body_plain_model_output = GlobalAveragePooling2D(name='GAP')(body_plain_model_output)
+
+ merged = concatenate([body_plain_model_output, image_plain_model_output])
+
+ x = Dense(256, activation='relu', name='FC1', kernel_regularizer=regularizers.l2(0.01), kernel_initializer='random_normal')(merged)
+
+ x = Dropout(0.5, name='DROPOUT')(x)
+
+ vad_cont_prediction = Dense(units=3, kernel_initializer='random_normal', name='VAD')(x)
+
+ # At model instantiation, you specify the two inputs and the output.
+ model = Model(inputs=[body_inputs, image_inputs], outputs=vad_cont_prediction, name='EMOTIC-VAD-regression-ResNet50')
+
+ for layer in body_truncated_model.layers:
+ layer.trainable = False
+
+ for layer in image_truncated_model.layers:
+ layer.trainable = False
+
+ model.compile(optimizer=SGD(lr=1e-5, momentum=0.9),
+ loss=euclidean_distance_loss,
+ metrics=['mae', 'mse', rmse])
+
+ # load weights
+ if weights == 'emotic':
+ if include_top:
+ weights_path = get_file('emotic_vad_ResNet50_weights_tf_dim_ordering_tf_kernels.h5',
+ WEIGHTS_PATH,
+ cache_subdir='AbuseNet')
+ else:
+ weights_path = get_file('emotic_vad_ResNet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
+ WEIGHTS_PATH_NO_TOP,
+ cache_subdir='AbuseNet')
+
+ model.load_weights(weights_path)
+
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
+
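+
+# Minimal usage sketch (`body_batch` and `image_batch` are hypothetical (N, 224, 224, 3) arrays
+# holding the person crop and the whole image respectively):
+#   model = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+#   vad = model.predict([body_batch, image_batch])  # -> (N, 3) valence/arousal/dominance values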
diff --git a/applications/emotic_utils.py b/applications/emotic_utils.py
new file mode 100644
index 0000000..c45adf5
--- /dev/null
+++ b/applications/emotic_utils.py
@@ -0,0 +1,333 @@
+"""Generic utilities continuous-emotion-recognition-in-VAD-space-based models.
+"""
+
+from __future__ import print_function
+import numpy as np
+
+import keras.backend as K
+import itertools
+from itertools import product
+import tensorflow as tf
+import keras.backend.tensorflow_backend as tfb
+from keras.preprocessing import image
+from applications.emotic_vgg16 import EMOTIC_VAD_VGG16
+from applications.emotic_vgg19 import EMOTIC_VAD_VGG19
+from applications.emotic_resnet50 import EMOTIC_VAD_ResNet50
+from utils.generic_utils import imagenet_preprocess_input, places_preprocess_input
+
+
+
+target_size = (224, 224)
+
+CLASS_INDEX = None
+CLASS_INDEX_PATH = 'https://github.com/GKalliatakis/Keras-EMOTIC-resources/releases/download/v1.0/emotic_class_index.json'
+
+
+def _obtain_weights_CSVLogger_filenames(body_backbone_CNN, image_backbone_CNN):
+ """Obtains the polished filenames for the weights and the CSVLogger of the model.
+
+ # Arguments
+        body_backbone_CNN: String name of the body backbone CNN (used in both filenames).
+        image_backbone_CNN: String name of the image backbone CNN (currently not used in the filenames).
+
+ # Returns
+ Two strings that will serve as the filenames for the weights and the CSVLogger respectively.
+ """
+
+ prefix= 'trained_models/emotic_vad_'
+ suffix= '_weights_tf_dim_ordering_tf_kernels.h5'
+ weights_filename = prefix + body_backbone_CNN + suffix
+
+
+ CSVLogger_filename = 'emotic_vad_'+body_backbone_CNN+'_training.csv'
+
+ return weights_filename, CSVLogger_filename
+
+
+
+# ----------------------------------------------------------------------------------------------------- #
+# Obtain number of classifiers
+# ----------------------------------------------------------------------------------------------------- #
+
+def _obtain_nb_classifiers(model_a_name = None, model_b_name = None, model_c_name = None):
+ """Obtains the number of different classifiers based on given model names.
+ Note that EMOTIC model has already combined body backbone CNN features (which in this case are the `model_b_name` or `model_c_name`
+ features, with `VGG16_Places365` features at training stage, but for simplicity reasons only the body backbone CNN name is adjustable.
+
+ # Arguments
+ model_b_name: One of `VGG16`, `VGG19`, `ResNet50` or `None`.
+ model_c_name: One of `VGG16`, `VGG19`, `ResNet50` or `None`.
+
+ # Returns
+ The number of different classifiers alongside a polished file_name
+ """
+
+    if model_b_name is None and model_c_name is not None:
+        raise ValueError('The model names must be set in the correct order, starting from model_a --> model_b --> model_c.')
+
+
+ if (model_b_name in {None}) and (model_c_name in {None}):
+ nb_classifiers = 1
+ file_name = model_a_name
+
+ return nb_classifiers, file_name
+
+ if (model_b_name in {'VGG16', 'VGG19', 'ResNet50'}) and (model_c_name in {None}):
+ nb_classifiers = 2
+ file_name = model_a_name + '_' + model_b_name
+ return nb_classifiers, file_name
+
+ if (model_b_name in {'VGG16', 'VGG19', 'ResNet50'}) and (model_c_name in {'VGG16', 'VGG19', 'ResNet50'}) :
+ nb_classifiers = 3
+ file_name = model_a_name + '_' + model_b_name + '_' + model_c_name
+ return nb_classifiers, file_name
+
+
+# ----------------------------------------------------------------------------------------------------- #
+# Prepare input images
+# ----------------------------------------------------------------------------------------------------- #
+
+def prepare_input_data(body_path,
+ image_path):
+ """Prepares the raw images for the EMOTIC model.
+
+ # Arguments
+ body_path: Path to body only image file.
+ image_path: Path to entire image file.
+
+ # Returns
+ The two processed images
+ """
+
+ body_img = image.load_img(body_path, target_size=(224, 224))
+ x1 = image.img_to_array(body_img)
+ x1 = np.expand_dims(x1, axis=0)
+ x1 = imagenet_preprocess_input(x1)
+
+ entire_img = image.load_img(image_path, target_size=(224, 224))
+ x2 = image.img_to_array(entire_img)
+ x2 = np.expand_dims(x2, axis=0)
+ x2 = places_preprocess_input(x2)
+
+
+ return x1, x2
+
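+# Example (hypothetical image paths); the returned arrays feed the two-input EMOTIC models:
+#   x_body, x_image = prepare_input_data('person_crop.jpg', 'whole_scene.jpg')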
+
+# ----------------------------------------------------------------------------------------------------- #
+# Obtain ensembling weights for different classifiers
+# ----------------------------------------------------------------------------------------------------- #
+
+def _obtain_ensembling_weights(nb_classifiers,
+ model_a_name,
+ model_b_name,
+ model_c_name):
+ """Obtains the set of ensembling weights that will be used for conducting weighted average.
+
+ # Arguments
+ nb_classifiers: Integer, number of different classifiers that will be used for ensembling weights.
+ model_a_name: One of `VGG16`, `VGG19`, `ResNet50` or `None`.
+ model_b_name: One of `VGG16`, `VGG19`, `ResNet50` or `None`.
+ model_c_name: One of `VGG16`, `VGG19`, `ResNet50` or `None`.
+
+ # Returns
+ The weights (float) for every model and every dimension.
+ """
+
+ if nb_classifiers == 2:
+
+ if model_a_name == 'VGG16' and model_b_name == 'ResNet50':
+ w_model_a = 0.55
+ w_model_b = 0.45
+
+ return w_model_a, w_model_b
+
+        # ensure that giving the models in a different order will not affect the weights
+ elif model_a_name == 'ResNet50' and model_b_name == 'VGG16':
+ w_model_a = 0.45
+ w_model_b = 0.55
+
+ return w_model_a, w_model_b
+
+
+ elif model_a_name == 'VGG16' and model_b_name == 'VGG19':
+ w_model_a = 0.48
+ w_model_b = 0.52
+
+ return w_model_a, w_model_b
+
+        # ensure that giving the models in a different order will not affect the weights
+ elif model_a_name == 'VGG19' and model_b_name == 'VGG16':
+ w_model_a = 0.52
+ w_model_b = 0.48
+
+ return w_model_a, w_model_b
+
+
+ elif model_a_name == 'ResNet50' and model_b_name == 'VGG19':
+ w_model_a = 0.40
+ w_model_b = 0.60
+
+ return w_model_a, w_model_b
+
+
+        # ensure that giving the models in a different order will not affect the weights
+ elif model_a_name == 'VGG19' and model_b_name == 'ResNet50':
+
+ w_model_a = 0.60
+ w_model_b = 0.40
+
+ return w_model_a, w_model_b
+
+
+ elif nb_classifiers == 3:
+
+ if model_a_name == 'VGG16' and model_b_name == 'ResNet50' and model_c_name == 'VGG19':
+ w_model_a = 0.35
+ w_model_b = 0.28
+ w_model_c = 0.37
+
+ return w_model_a, w_model_b, w_model_c
+
+ elif model_a_name == 'VGG16' and model_b_name == 'VGG19' and model_c_name == 'ResNet50':
+ w_model_a = 0.35
+ w_model_b = 0.37
+ w_model_c = 0.28
+
+ return w_model_a, w_model_b, w_model_c
+
+
+ elif model_a_name == 'ResNet50' and model_b_name == 'VGG16' and model_c_name == 'VGG19':
+ w_model_a = 0.28
+ w_model_b = 0.35
+ w_model_c = 0.37
+
+ return w_model_a, w_model_b, w_model_c
+
+
+ elif model_a_name == 'ResNet50' and model_b_name == 'VGG19' and model_c_name == 'VGG16':
+ w_model_a = 0.28
+ w_model_b = 0.37
+ w_model_c = 0.35
+
+ return w_model_a, w_model_b, w_model_c
+
+ elif model_a_name == 'VGG19' and model_b_name == 'ResNet50' and model_c_name == 'VGG16':
+ w_model_a = 0.37
+ w_model_b = 0.28
+ w_model_c = 0.35
+
+ return w_model_a, w_model_b, w_model_c
+
+
+ elif model_a_name == 'VGG19' and model_b_name == 'VGG16' and model_c_name == 'ResNet50':
+ w_model_a = 0.37
+ w_model_b = 0.35
+ w_model_c = 0.28
+
+ return w_model_a, w_model_b, w_model_c
+
+
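+# Sketch of how these weights would be applied as a weighted average
+# (`vad_a` and `vad_b` are hypothetical per-model VAD prediction arrays):
+#   w_a, w_b = _obtain_ensembling_weights(2, 'VGG16', 'ResNet50', None)
+#   vad_ensemble = w_a * vad_a + w_b * vad_b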
+
+# ----------------------------------------------------------------------------------------------------- #
+# Obtain Keras model instances based on given model names
+# ----------------------------------------------------------------------------------------------------- #
+
+def _obtain_single_model_VAD(model_a_name):
+ """Instantiates and returns 1 Keras model instance based on the given model name.
+
+ # Arguments
+ model_a_name: String to declare the name of the 1st model
+
+ # Returns
+ Single Keras model instance.
+ """
+
+ if model_a_name == 'VGG16':
+ model_a = EMOTIC_VAD_VGG16(include_top=True, weights='emotic')
+
+ elif model_a_name == 'VGG19':
+ model_a = EMOTIC_VAD_VGG19(include_top=True, weights='emotic')
+
+ elif model_a_name == 'ResNet50':
+ model_a = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+
+ return model_a
+
+
+def _obtain_two_models_ensembling_VAD(model_a_name, model_b_name):
+ """Instantiates and returns 2 Keras model instances based on the given model names.
+
+ # Arguments
+ model_a_name: String to declare the name of the 1st model
+ model_b_name: String to declare the name of the 2nd model
+
+ # Returns
+ Two Keras model instances.
+ """
+
+ if model_a_name == 'VGG16':
+ model_a = EMOTIC_VAD_VGG16(include_top=True, weights='emotic')
+
+ elif model_a_name == 'VGG19':
+ model_a = EMOTIC_VAD_VGG19(include_top=True, weights='emotic')
+
+ elif model_a_name == 'ResNet50':
+ model_a = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+
+
+ if model_b_name == 'VGG16':
+ model_b = EMOTIC_VAD_VGG16(include_top=True, weights='emotic')
+
+ elif model_b_name == 'VGG19':
+ model_b = EMOTIC_VAD_VGG19(include_top=True, weights='emotic')
+
+ elif model_b_name == 'ResNet50':
+ model_b = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+
+ return model_a, model_b
+
+
+def _obtain_three_models_ensembling_VAD(model_a_name, model_b_name, model_c_name):
+ """Instantiates and returns 3 Keras model instances based on the given model names.
+
+ # Arguments
+ model_a_name: String to declare the name of the 1st model
+ model_b_name: String to declare the name of the 2nd model
+ model_c_name: String to declare the name of the 3rd model
+
+ # Returns
+ Three Keras model instances.
+ """
+
+ if model_a_name == 'VGG16':
+ model_a = EMOTIC_VAD_VGG16(include_top=True, weights='emotic')
+
+ elif model_a_name == 'VGG19':
+ model_a = EMOTIC_VAD_VGG19(include_top=True, weights='emotic')
+
+ elif model_a_name == 'ResNet50':
+ model_a = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+
+
+ if model_b_name == 'VGG16':
+ model_b = EMOTIC_VAD_VGG16(include_top=True, weights='emotic')
+
+ elif model_b_name == 'VGG19':
+ model_b = EMOTIC_VAD_VGG19(include_top=True, weights='emotic')
+
+ elif model_b_name == 'ResNet50':
+ model_b = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+
+
+ if model_c_name == 'VGG16':
+ model_c = EMOTIC_VAD_VGG16(include_top=True, weights='emotic')
+
+ elif model_c_name == 'VGG19':
+ model_c = EMOTIC_VAD_VGG19(include_top=True, weights='emotic')
+
+ elif model_c_name == 'ResNet50':
+ model_c = EMOTIC_VAD_ResNet50(include_top=True, weights='emotic')
+
+ return model_a, model_b, model_c
+
+
+
diff --git a/applications/emotic_vgg16.py b/applications/emotic_vgg16.py
new file mode 100644
index 0000000..0746c59
--- /dev/null
+++ b/applications/emotic_vgg16.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+'''Emotion Recognition in Context model for Keras
+
+# Reference:
+- [Emotion Recognition in Context](http://sunai.uoc.edu/emotic/pdf/EMOTIC_cvpr2017.pdf)
+'''
+
+from __future__ import division, print_function
+import os
+
+from keras.layers import Input
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.utils.data_utils import get_file
+from keras import regularizers
+
+from keras.applications.vgg16 import VGG16
+from keras.layers.merge import concatenate
+from applications.vgg16_places_365 import VGG16_Places365
+from keras.optimizers import SGD
+
+from utils.generic_utils import euclidean_distance_loss, rmse
+
+
+WEIGHTS_PATH = 'https://github.com/GKalliatakis/ubiquitous-assets/releases/download/v0.7.0/emotic_vad_VGG16_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = ''
+
+
+def EMOTIC_VAD_VGG16(include_top=True,
+ weights='emotic'):
+ """Instantiates the EMOTIC_VAD_VGG16 architecture.
+
+ Optionally loads weights pre-trained
+ on EMOTIC. Note that when using TensorFlow,
+ for best performance you should set
+ `image_data_format="channels_last"` in your Keras config
+ at ~/.keras/keras.json.
+
+ The model and the weights are compatible with both
+ TensorFlow and Theano. The data format
+ convention used by the model is the one
+ specified in your Keras config file.
+
+ # Arguments
+        include_top: whether to include the fully-connected
+            regression head at the top of the network.
+        weights: one of `None` (random initialization),
+            'emotic' (pre-training on EMOTIC),
+            or the path to the weights file to be loaded.
+ # Returns
+ A Keras model instance.
+ # Raises
+ ValueError: in case of invalid argument for `weights`
+ """
+
+ if not (weights in {'emotic', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `emotic` '
+ '(pre-training on EMOTIC dataset), '
+ 'or the path to the weights file to be loaded.')
+
+ body_inputs = Input(shape=(224, 224, 3), name='INPUT')
+ image_inputs = Input(shape=(224, 224, 3), name='INPUT')
+
+
+ body_truncated_model = VGG16(include_top=False, weights='imagenet', input_tensor=body_inputs, pooling='avg')
+ for layer in body_truncated_model.layers:
+ layer.name = str("body-") + layer.name
+
+ image_truncated_model = VGG16_Places365(include_top=False, weights='places', input_tensor=image_inputs, pooling='avg')
+ for layer in image_truncated_model.layers:
+ layer.name = str("image-") + layer.name
+
+    # retrieve the outputs
+ body_plain_model_output = body_truncated_model.output
+ image_plain_model_output = image_truncated_model.output
+
+ merged = concatenate([body_plain_model_output, image_plain_model_output])
+
+ x = Dense(256, activation='relu', name='FC1', kernel_regularizer=regularizers.l2(0.01), kernel_initializer='random_normal')(merged)
+
+ x = Dropout(0.5, name='DROPOUT')(x)
+
+ vad_cont_prediction = Dense(units=3, kernel_initializer='random_normal', name='VAD')(x)
+
+ # At model instantiation, you specify the two inputs and the output.
+    model = Model(inputs=[body_inputs, image_inputs], outputs=vad_cont_prediction, name='EMOTIC-VAD-regression-VGG16')
+
+ for layer in body_truncated_model.layers:
+ layer.trainable = False
+
+ for layer in image_truncated_model.layers:
+ layer.trainable = False
+
+ model.compile(optimizer=SGD(lr=1e-5, momentum=0.9),
+ loss=euclidean_distance_loss,
+ metrics=['mae', 'mse', rmse])
+
+ # load weights
+ if weights == 'emotic':
+ if include_top:
+ weights_path = get_file('emotic_vad_VGG16_weights_tf_dim_ordering_tf_kernels.h5',
+ WEIGHTS_PATH,
+ cache_subdir='AbuseNet')
+ else:
+ weights_path = get_file('emotic_vad_VGG16_weights_tf_dim_ordering_tf_kernels_notop.h5',
+ WEIGHTS_PATH_NO_TOP,
+ cache_subdir='AbuseNet')
+
+ model.load_weights(weights_path)
+
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
+
diff --git a/applications/emotic_vgg19.py b/applications/emotic_vgg19.py
new file mode 100644
index 0000000..185c717
--- /dev/null
+++ b/applications/emotic_vgg19.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+'''Emotion Recognition in Context model for Keras
+
+# Reference:
+- [Emotion Recognition in Context](http://sunai.uoc.edu/emotic/pdf/EMOTIC_cvpr2017.pdf)
+'''
+
+from __future__ import division, print_function
+import os
+
+from keras.layers import Input
+from keras import regularizers
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.utils.data_utils import get_file
+
+from keras.applications.vgg19 import VGG19
+from keras.layers.merge import concatenate
+from applications.vgg16_places_365 import VGG16_Places365
+from keras.optimizers import SGD
+
+from utils.generic_utils import euclidean_distance_loss, rmse
+
+
+WEIGHTS_PATH = 'https://github.com/GKalliatakis/ubiquitous-assets/releases/download/v0.7.0/emotic_vad_VGG19_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = ''
+
+
+def EMOTIC_VAD_VGG19(include_top=True,
+ weights='emotic'):
+ """Instantiates the EMOTIC_VAD_VGG19 architecture.
+
+ Optionally loads weights pre-trained
+ on EMOTIC. Note that when using TensorFlow,
+ for best performance you should set
+ `image_data_format="channels_last"` in your Keras config
+ at ~/.keras/keras.json.
+
+ The model and the weights are compatible with both
+ TensorFlow and Theano. The data format
+ convention used by the model is the one
+ specified in your Keras config file.
+
+ # Arguments
+        include_top: whether to include the fully-connected
+            regression head at the top of the network.
+        weights: one of `None` (random initialization),
+            'emotic' (pre-training on EMOTIC),
+            or the path to the weights file to be loaded.
+ # Returns
+ A Keras model instance.
+ # Raises
+ ValueError: in case of invalid argument for `weights`
+ """
+
+ if not (weights in {'emotic', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `emotic` '
+ '(pre-training on EMOTIC dataset), '
+ 'or the path to the weights file to be loaded.')
+
+
+ body_inputs = Input(shape=(224, 224, 3), name='INPUT')
+ image_inputs = Input(shape=(224, 224, 3), name='INPUT')
+
+ body_truncated_model = VGG19(include_top=False, weights='imagenet', input_tensor=body_inputs, pooling='avg')
+ for layer in body_truncated_model.layers:
+ layer.name = str("body-") + layer.name
+
+ image_truncated_model = VGG16_Places365(include_top=False, weights='places', input_tensor=image_inputs,
+ pooling='avg')
+ for layer in image_truncated_model.layers:
+ layer.name = str("image-") + layer.name
+
+    # retrieve the outputs
+ body_plain_model_output = body_truncated_model.output
+ image_plain_model_output = image_truncated_model.output
+
+ merged = concatenate([body_plain_model_output, image_plain_model_output])
+
+ x = Dense(256, activation='relu', name='FC1', kernel_regularizer=regularizers.l2(0.01), kernel_initializer='random_normal')(merged)
+
+ x = Dropout(0.5, name='DROPOUT')(x)
+
+ vad_cont_prediction = Dense(units=3, kernel_initializer='random_normal', name='VAD')(x)
+
+ # At model instantiation, you specify the two inputs and the output.
+    model = Model(inputs=[body_inputs, image_inputs], outputs=vad_cont_prediction, name='EMOTIC-VAD-regression-VGG19')
+
+ for layer in body_truncated_model.layers:
+ layer.trainable = False
+
+ for layer in image_truncated_model.layers:
+ layer.trainable = False
+
+ model.compile(optimizer=SGD(lr=1e-5, momentum=0.9),
+ loss=euclidean_distance_loss,
+ metrics=['mae', 'mse', rmse])
+
+ # load weights
+ if weights == 'emotic':
+ if include_top:
+ weights_path = get_file('emotic_vad_VGG19_weights_tf_dim_ordering_tf_kernels.h5',
+ WEIGHTS_PATH,
+ cache_subdir='AbuseNet')
+ else:
+ weights_path = get_file('emotic_vad_VGG19_weights_tf_dim_ordering_tf_kernels_notop.h5',
+ WEIGHTS_PATH_NO_TOP,
+ cache_subdir='AbuseNet')
+
+ model.load_weights(weights_path)
+
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
+
diff --git a/applications/hra_resnet50.py b/applications/hra_resnet50.py
new file mode 100644
index 0000000..0af7ad1
--- /dev/null
+++ b/applications/hra_resnet50.py
@@ -0,0 +1,376 @@
+# -*- coding: utf-8 -*-
+"""2 clss Human Rights Archive (HRA) ResNet50 model for Keras
+
+"""
+
+from __future__ import division, print_function
+import os
+
+import warnings
+import numpy as np
+
+from keras import backend as K
+from keras.utils.data_utils import get_file
+from keras.layers import Input
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.layers import GlobalAveragePooling2D
+from keras.applications.resnet50 import ResNet50
+from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.engine.topology import get_source_inputs
+from keras.optimizers import SGD
+from applications.hra_utils import _obtain_weights_path as owp
+
+from applications.hra_utils import _obtain_train_mode
+
+pre_trained_model = 'resnet50'
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_FEATURE_EXTRACTION_PATH, CL_FEATURE_EXTRACTION_FNAME = owp('cl', pre_trained_model, None, True)
+DP_WEIGHTS_FEATURE_EXTRACTION_PATH, DP_FEATURE_EXTRACTION_FNAME = owp('dp', pre_trained_model, None, True)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_PATH_ONE_CONV_LAYER, CL_PATH_ONE_CONV_LAYER_FNAME = owp('cl', pre_trained_model, 1, True)
+CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('cl', pre_trained_model, 1, False)
+
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS, CL_PATH_TWO_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 2, True)
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 2, False)
+
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS, CL_PATH_THREE_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 3, True)
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+DP_WEIGHTS_PATH_ONE_CONV_LAYER, DP_PATH_ONE_CONV_LAYER_FNAME = owp('dp', pre_trained_model, 1, True)
+DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('dp', pre_trained_model, 1, False)
+
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS, DP_PATH_TWO_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 2, True)
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 2, False)
+
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS, DP_PATH_THREE_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 3, True)
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+
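+# Typical call (a sketch; the fine-tuned weights are downloaded on first use from the release URLs resolved above):
+#   model = HRA_ResNet50(weights='HRA', violation_class='dp', nb_of_conv_layers_to_fine_tune=1)
+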
+def HRA_ResNet50(include_top=True, weights='HRA',
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=None,
+ first_phase_trained_weights=None,
+ violation_class='cl',
+ verbose=0):
+ """Instantiates the ResNet50 architecture fine-tuned (2 steps) on Human Rights Archive dataset.
+
+ Optionally loads weights pre-trained on the 2 class version of Human Rights Archive Database.
+
+ # Arguments
+        include_top: whether to include the fully-connected
+            classifier layers at the top of the network.
+ weights: one of `None` (random initialization),
+ 'HRA' (pre-training on Human Rights Archive),
+ or the path to the weights file to be loaded.
+ input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+ to use as image input for the model.
+ input_shape: optional shape tuple, only to be specified
+ if `include_top` is False (otherwise the input shape
+ has to be `(224, 224, 3)` (with `channels_last` data format)
+ or `(3, 224, 224)` (with `channels_first` data format).
+ It should have exactly 3 input channels,
+ and width and height should be no smaller than 48.
+ E.g. `(200, 200, 3)` would be one valid value.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional
+ layers to fine-tune. One of `1` (1,588,512 trainable params), `2` (4,998,432 trainable params) or `3` (6,054,176 trainable params).
+ first_phase_trained_weights: Weights of an already trained Keras model instance.
+ Only relevant when using `fine_tuning` as train_mode after `feature_extraction` weights have been saved.
+ violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ verbose: Integer. 0, or 1. Verbosity mode. 0 = silent, 1 = model summary and weights info.
+
+ # Returns
+ A Keras model instance.
+
+ # Raises
+ ValueError: in case of invalid argument for `weights`, `violation_class`, `nb_of_conv_layers_to_fine_tune` or invalid input shape
+ """
+ if not (weights in {'HRA', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `HRA` '
+ '(pre-training on Human Rights Archive two-class), '
+ 'or the path to the weights file to be loaded.')
+
+
+ if not (violation_class in {'cl', 'dp'}):
+ raise ValueError("The `violation_class` argument should be either "
+ "`cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation') "
+ "'or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')")
+
+ if nb_of_conv_layers_to_fine_tune is None and include_top is False:
+        raise ValueError('Setting the `include_top` argument as false '
+                         'is only relevant when the `nb_of_conv_layers_to_fine_tune` argument is not None (fine-tuning), '
+                         'otherwise the returned model would be exactly the default '
+                         'keras-applications model.')
+
+ if weights == 'HRA' and first_phase_trained_weights is not None:
+ raise ValueError('Setting the `first_phase_trained_weights` argument as the path to the weights file '
+ 'obtained from utilising feature_extraction '
+ 'is only relevant when the `weights` argument is `None`. '
+ 'If the `weights` argument is `HRA`, it means the model has already been trained on HRA dataset '
+ 'and there is no need to provide a path to the weights file (saved from feature_extraction) to be loaded.')
+
+ if not (nb_of_conv_layers_to_fine_tune in {1, 2, 3, None}):
+        raise ValueError('The `nb_of_conv_layers_to_fine_tune` argument should be either '
+                         '`None` (indicates feature extraction mode), '
+                         '`1`, `2` or `3`. '
+                         'More than 3 conv. layers are not supported because the more parameters we train, '
+                         'the more we risk overfitting.')
+
+ cache_subdir = 'AbuseNet'
+
+ mode = _obtain_train_mode(nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ # Determine proper input shape
+ input_shape = _obtain_input_shape(input_shape,
+ default_size=224,
+ min_size=48,
+ data_format=K.image_data_format(),
+ require_flatten=include_top,
+ weights=weights)
+
+ if input_tensor is None:
+ img_input = Input(shape=input_shape)
+ else:
+ if not K.is_keras_tensor(input_tensor):
+ img_input = Input(tensor=input_tensor, shape=input_shape)
+ else:
+ img_input = input_tensor
+
+ # Ensure that the model takes into account any potential predecessors of `input_tensor`.
+ if input_tensor is not None:
+ inputs = get_source_inputs(input_tensor)
+ else:
+ inputs = img_input
+
+
+ # create the base pre-trained model
+ base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=img_input)
+ x = base_model.output
+
+
+ losses_list = {'EMOTIONS': 'binary_crossentropy',
+ 'VALENCE': 'mse',
+ 'AROUSAL': 'mse',
+ 'DOMINANCE': 'mse',
+ 'AGE': 'categorical_crossentropy'}
+
+ losses_weights = {'EMOTIONS': 10.,
+ 'VALENCE': 0.25,
+ 'AROUSAL': 0.25,
+ 'DOMINANCE': 0.25,
+ 'AGE': 1.}
+
+ metrics = {'EMOTIONS': 'categorical_accuracy',
+ 'VALENCE': 'mse',
+ 'AROUSAL': 'mse',
+ 'DOMINANCE': 'mse',
+ 'AGE': 'categorical_accuracy'}
+
+
+ # Classification block - build a classifier model to put on top of the convolutional model
+ if include_top:
+
+ # add a global spatial pooling layer (which seems to have the best performance)
+ x = GlobalAveragePooling2D(name='GAP')(x)
+
+ # add a fully-connected layer
+ x = Dense(256, activation='relu', name='FC1')(x)
+
+ # When random init is enabled, we want to include Dropout,
+ # otherwise when loading a pre-trained HRA model we want to omit that layer,
+ # so the visualisations are done properly (there is an issue if it is included)
+ if weights is None:
+ x = Dropout(0.5,name='DROPOUT')(x)
+ # and a logistic layer with the number of classes defined by the `classes` argument
+ x = Dense(2, activation='softmax', name='PREDICTIONS')(x)
+
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-ResNet50')
+
+ else:
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-ResNet50-NO-TOP')
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ return model
+
+
+
+
+ if mode == 'feature_extraction':
+
+ print('[INFO] Feature extraction mode. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in base_model.layers:
+ layer.trainable = False
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+
+ elif mode == 'fine_tuning':
+
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last one (1) conv. layer. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:168]:
+ layer.trainable = False
+ for layer in model.layers[168:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last two (2) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:164]:
+ layer.trainable = False
+ for layer in model.layers[164:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last three (3) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:158]:
+ layer.trainable = False
+ for layer in model.layers[158:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ if verbose == 1:
+ model.summary()
+
+ # load weights
+ if weights == 'HRA':
+
+ # Child labour
+ if violation_class =='cl':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(CL_FEATURE_EXTRACTION_FNAME,
+ CL_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ # Displaced populations
+ elif violation_class == 'dp':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(DP_FEATURE_EXTRACTION_FNAME,
+ DP_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ model.load_weights(weights_path)
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
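+
+
+# Minimal usage sketch (illustrative only; assumes this module's builder is named
+# `HRA_ResNet50`, by analogy with the other HRA_* builders, and that the released
+# HRA weights are downloadable on the first call):
+#
+#     from keras.preprocessing import image
+#     from applications.hra_utils import predict
+#
+#     model = HRA_ResNet50(weights='HRA', violation_class='dp',
+#                          nb_of_conv_layers_to_fine_tune=1)
+#     img = image.load_img('path/to/img.jpg', target_size=(224, 224))
+#     raw_preds, decoded = predict('dp', model, img, target_size=(224, 224))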
diff --git a/applications/hra_utils.py b/applications/hra_utils.py
new file mode 100644
index 0000000..1526c40
--- /dev/null
+++ b/applications/hra_utils.py
@@ -0,0 +1,400 @@
+"""Utilities for HRA data preprocessing, prediction decoding and plotting.
+"""
+
+from __future__ import print_function
+import numpy as np
+import matplotlib.pyplot as plt
+
+from keras.preprocessing import image
+from keras.applications.inception_v3 import preprocess_input
+
+from keras.utils import get_file
+import json
+from utils.generic_utils import imagenet_preprocess_input, places_preprocess_input
+
+
+target_size = (224, 224)
+
+CLASS_INDEX = None
+CL_CLASS_INDEX_PATH = 'https://github.com/GKalliatakis/ubiquitous-assets/releases/download/v0.1.3/HRA_2classCL_index.json'
+DP_CLASS_INDEX_PATH = 'https://github.com/GKalliatakis/ubiquitous-assets/releases/download/v0.1.3/HRA_2classDP_index.json'
+
+
+def _obtain_train_mode(nb_of_conv_layers_to_fine_tune):
+ """Obtains the train mode string based on the provided number of conv. layers that will be fine-tuned.
+
+ # Arguments
+        nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional layers to fine-tune.
+ One of `1`, `2` or `3`.
+ # Returns
+ A string that will serve as the train mode of the model.
+ """
+
+    if nb_of_conv_layers_to_fine_tune is None:
+ return 'feature_extraction'
+ elif nb_of_conv_layers_to_fine_tune in {1, 2, 3}:
+ return 'fine_tuning'
+ else:
+ raise ValueError('The `nb_of_conv_layers_to_fine_tune` argument should be either '
+ '`None` (indicates feature extraction mode), '
+ '`1`, `2` or `3`. More than 3 conv. blocks are not included '
+ 'because the more parameters we are training (unfreezing), the more we are at risk of overfitting.')
+
+
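+# Quick sanity check of the mapping above (illustrative):
+#
+#     _obtain_train_mode(None)  # -> 'feature_extraction'
+#     _obtain_train_mode(2)     # -> 'fine_tuning'
+#     _obtain_train_mode(5)     # raises ValueError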
+
+def _obtain_first_phase_trained_weights(violation_class,
+                                        model_name):
+ """Retrieves the weights of an already trained feature extraction model.
+ Only relevant when using `fine_tuning` as train_mode after `feature_extraction` weights have been saved.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation').
+ model_name: String to declare the name of the model
+ # Returns
+ A string with the weights path.
+ """
+
+ if violation_class == 'cl':
+ first_phase_trained_weights_filename = 'trained_models/' + 'cl_' + model_name + '_weights_feature_extraction_tf_dim_ordering_tf_kernels.h5'
+ elif violation_class == 'dp':
+ first_phase_trained_weights_filename = 'trained_models/' + 'dp_' + model_name + '_weights_feature_extraction_tf_dim_ordering_tf_kernels.h5'
+
+
+ return first_phase_trained_weights_filename
+
+
+
+
+
+def _obtain_weights_CSVLogger_filenames(violation_class,
+ train_mode,
+ model_name,
+ nb_of_conv_layers_to_fine_tune):
+ """Obtains the polished filenames for the weights and the CSVLogger of the model.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation').
+ train_mode: String to declare the train mode of the model (how many layers will be frozen during training).
+ - `feature_extraction` taking the convolutional base of a previously-trained network,
+ running the new data through it, and training a new classifier on top of the output.
+ - `fine_tuning` unfreezing a few of the top layers of a frozen conv. base used for feature extraction,
+ and jointly training both the newly added part of the model and these top layers.
+ model_name: String to declare the name of the model
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional
+ layers to fine-tune. One of `1`, `2` or `3`.
+
+ # Returns
+ Two strings that will serve as the filenames for the weights and the CSVLogger respectively.
+ """
+
+ if violation_class == 'cl':
+ if train_mode == 'feature_extraction':
+
+ weights_filename = 'trained_models/' + 'cl_' + model_name + '_weights_feature_extraction_tf_dim_ordering_tf_kernels.h5'
+ CSVLogger_filename = model_name + '_cl_feature_extraction.csv'
+
+ else:
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_filename = 'trained_models/' + 'cl_' + model_name + '_weights_one_layer_tf_dim_ordering_tf_kernels.h5'
+ CSVLogger_filename = model_name + '_cl_fine_tuning_one_layer.csv'
+
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_filename = 'trained_models/' + 'cl_' + model_name + '_weights_two_layers_tf_dim_ordering_tf_kernels.h5'
+ CSVLogger_filename = model_name + '_cl_fine_tuning_two_layers.csv'
+
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_filename = 'trained_models/' + 'cl_' + model_name + '_weights_three_layers_tf_dim_ordering_tf_kernels.h5'
+ CSVLogger_filename = model_name + '_cl_fine_tuning_three_layers.csv'
+
+ else:
+ raise NotImplementedError(
+ 'The `nb_of_conv_layers_to_fine_tune` argument should be either `1`, `2` or `3`. '
+ 'More than 3 conv. blocks are not supported because the more parameters we are training, '
+ 'the more we are at risk of overfitting.')
+
+ elif violation_class == 'dp':
+ if train_mode == 'feature_extraction':
+
+ weights_filename = 'trained_models/' + 'dp_' + model_name + '_weights_feature_extraction_tf_dim_ordering_tf_kernels.h5'
+ CSVLogger_filename = model_name + '_dp_feature_extraction.csv'
+
+ else:
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_filename = 'trained_models/' + 'dp_' + model_name + '_weights_one_layer_tf_dim_ordering_tf_kernels.h5'
+ CSVLogger_filename = model_name + '_dp_fine_tuning_one_layer.csv'
+
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_filename = 'trained_models/' + 'dp_' + model_name + '_weights_two_layers_tf_dim_ordering_tf_kernels.h5'
+ CSVLogger_filename = model_name + '_dp_fine_tuning_two_layers.csv'
+
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_filename = 'trained_models/' + 'dp_' + model_name + '_weights_three_layers_tf_dim_ordering_tf_kernels.h5'
+                CSVLogger_filename = model_name + '_dp_fine_tuning_three_layers.csv'
+
+ else:
+ raise NotImplementedError(
+ 'The `nb_of_conv_layers_to_fine_tune` argument should be either `1`, `2` or `3`. '
+ 'More than 3 conv. blocks are not supported because the more parameters we are training, '
+ 'the more we are at risk of overfitting.')
+
+
+ return weights_filename, CSVLogger_filename
+
+
+
+
+def _obtain_weights_path(violation_class,
+ pre_trained_model,
+ nb_of_conv_layers_to_fine_tune,
+ include_top):
+ """Obtains the polished filenames for the weights of a trained Keras model.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+            or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation').
+        pre_trained_model: one of `vgg16`, `vgg19`, `resnet50` or `vgg16_places365`.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional
+ layers to fine-tune. One of `1`, `2` or `3`.
+ include_top: whether to include the 3 fully-connected layers at the top of the network.
+
+ # Returns
+ Two strings that will serve as the original URL of the file (origin) and name of the file (fname) for loading the weights.
+ """
+
+ # This is the only URL that must be altered in case of changing the repo where weights files are stored.
+ # The rest will be automatically inferred from the github_repo URL.
+ # Note that the structure of the releases must comply with the following structure:
+ # main_release_dir/
+ # v0.1.1(weights for feature extraction mode)/
+ # v0.1.2(weights for fine-tuning mode)/
+
+
+ github_repo = 'https://github.com/GKalliatakis/ubiquitous-assets/releases'
+
+    if nb_of_conv_layers_to_fine_tune is None:
+ fname = violation_class + '_' + pre_trained_model + '_weights_feature_extraction_tf_dim_ordering_tf_kernels.h5'
+
+ elif nb_of_conv_layers_to_fine_tune == 1:
+ if include_top:
+ fname = violation_class + '_' + pre_trained_model + '_weights_one_layer_tf_dim_ordering_tf_kernels.h5'
+ else:
+ fname = violation_class + '_' + pre_trained_model + '_weights_one_layer_tf_dim_ordering_tf_kernels_notop.h5'
+
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ if include_top:
+ fname = violation_class + '_' + pre_trained_model + '_weights_two_layers_tf_dim_ordering_tf_kernels.h5'
+ else:
+ fname = violation_class + '_' + pre_trained_model + '_weights_two_layers_tf_dim_ordering_tf_kernels_notop.h5'
+
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ if include_top:
+ fname = violation_class + '_' + pre_trained_model + '_weights_three_layers_tf_dim_ordering_tf_kernels.h5'
+ else:
+ fname = violation_class + '_' + pre_trained_model + '_weights_three_layers_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+
+    if nb_of_conv_layers_to_fine_tune is None:
+ origin = github_repo + '/download/v0.1.1/' + fname
+ else:
+ origin = github_repo + '/download/v0.1.2/' + fname
+
+
+ return origin, fname
+
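+# Example of the naming scheme produced above (derived from the patterns; illustrative):
+#
+#     origin, fname = _obtain_weights_path('cl', 'vgg16', 1, include_top=True)
+#     # fname  == 'cl_vgg16_weights_one_layer_tf_dim_ordering_tf_kernels.h5'
+#     # origin == 'https://github.com/GKalliatakis/ubiquitous-assets/releases/download/v0.1.2/' + fname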
+
+
+def decode_predictions(violation_class, preds, top=2):
+    """Decodes the predictions of an HRA-2CLASS model.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ preds: Numpy tensor encoding a batch of predictions.
+ top: integer, how many top-guesses to return.
+
+ # Returns
+ A list of lists of top class prediction tuples `(class_name, class_description, score)`.
+ One list of tuples per sample in batch input.
+
+ # Raises
+        ValueError: in case of invalid shape of the `preds` array (must be 2D).
+ """
+ global CLASS_INDEX
+ if len(preds.shape) != 2 or preds.shape[1] != 2:
+ raise ValueError('`decode_predictions` expects '
+ 'a batch of predictions '
+ '(i.e. a 2D array of shape (samples, 2)). '
+ 'Found array with shape: ' + str(preds.shape))
+ if CLASS_INDEX is None:
+ if violation_class =='cl':
+ fpath = get_file('HRA_2classCL_index.json',
+ CL_CLASS_INDEX_PATH,
+ cache_subdir='AbuseNet')
+ CLASS_INDEX = json.load(open(fpath))
+ elif violation_class =='dp':
+ fpath = get_file('HRA_2classDP_index.json',
+ DP_CLASS_INDEX_PATH,
+ cache_subdir='AbuseNet')
+ CLASS_INDEX = json.load(open(fpath))
+
+ results = []
+ for pred in preds:
+ top_indices = pred.argsort()[-top:][::-1]
+ result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices]
+ result.sort(key=lambda x: x[2], reverse=True)
+ results.append(result)
+
+ return results
+
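+# Illustrative output format (scores are made up; the actual class names come
+# from the downloaded HRA_2class*_index.json file):
+#
+#     decode_predictions('dp', preds, top=2)
+#     # -> [[(class_name, class_description, 0.87),
+#     #      (class_name, class_description, 0.13)]]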
+
+def predict(violation_class, model, img, target_size):
+ """Generates output predictions for a single PIL image.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ model: keras model
+ img: PIL format image
+ target_size: (w,h) tuple
+
+ # Returns
+ list of predicted labels and their probabilities
+ """
+ if img.size != target_size:
+ img = img.resize(target_size)
+
+ x = image.img_to_array(img)
+ x = np.expand_dims(x, axis=0)
+ x = preprocess_input(x)
+ preds = model.predict(x)
+
+ # print ('Raw preds: ',preds )
+
+ return preds, decode_predictions(violation_class = violation_class, preds = preds, top=2)[0]
+
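+# Usage sketch (illustrative; `model` is an already-instantiated HRA_* model and
+# the image path is a placeholder):
+#
+#     from keras.preprocessing import image
+#     img = image.load_img('path/to/img.jpg')
+#     raw_preds, decoded = predict('cl', model, img, target_size=(224, 224))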
+
+def predict_v2(violation_class, model, img, target_size):
+ """Generates output predictions for a single PIL image.
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ model: keras model
+ img: PIL format image
+ target_size: (w,h) tuple
+ # Returns
+ list of predicted labels and their probabilities
+ """
+ if img.size != target_size:
+ img = img.resize(target_size)
+
+ x = image.img_to_array(img)
+ x = np.expand_dims(x, axis=0)
+ x = preprocess_input(x)
+ preds = model.predict(x)
+
+ return decode_predictions(violation_class = violation_class, preds = preds, top=2)[0]
+
+
+def duo_ensemble_predict(violation_class,
+ model_a, model_b,
+ img,
+ target_size
+ ):
+ """Generates output predictions for a single PIL image for 2 different models,
+ and then puts together those predictions by averaging them at inference time.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ model_a: 1st model
+ model_b: 2nd model
+ img: PIL format image
+ target_size: (w,h) tuple
+
+ # Returns
+ list of predicted labels (which have been pooled accordingly) and their probabilities
+ """
+ if img.size != target_size:
+ img = img.resize(target_size)
+
+ x = image.img_to_array(img)
+ x = np.expand_dims(x, axis=0)
+ x = preprocess_input(x)
+
+ preds_a = model_a.predict(x)
+ preds_b = model_b.predict(x)
+ final_preds = 0.50 * (preds_a + preds_b)
+
+ return decode_predictions(violation_class = violation_class, preds = final_preds, top=2)[0]
+
+
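+# Usage sketch for averaging an object-centric and a scene-centric model
+# (illustrative; the two models are assumed to be instantiated elsewhere,
+# e.g. via HRA_VGG16 and HRA_VGG16_Places365):
+#
+#     decoded = duo_ensemble_predict('dp', object_model, scene_model,
+#                                    img, target_size=(224, 224))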
+
+def plot_preds(violation_class, image, preds):
+ """Displays image and the top-n predicted probabilities in a bar graph.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ image: PIL image
+ preds: list of predicted labels and their probabilities
+ """
+
+ if violation_class == 'cl':
+ labels = ("Child Labour", "NO Child Labour")
+
+ elif violation_class == 'dp':
+ labels = ("Displaced Populations", "NO Displaced Populations")
+
+
+
+ order = list(reversed(range(len(preds))))
+ plt.imshow(image)
+ plt.axis('off')
+
+ # fig = plt.figure(figsize=(2, 2))
+ #
+ # fig.add_subplot(1, 1, 1)
+ # plt.imshow(image)
+ #
+ # fig.add_subplot(2, 2, 2)
+ # plt.barh(order, preds, alpha=0.55)
+
+
+ plt.figure()
+ plt.barh(order, preds, alpha=0.55)
+ plt.yticks(order, labels)
+ plt.xlabel('Probability')
+ plt.xlim(0,1.01)
+ plt.tight_layout()
+ plt.show()
+
+
+def prepare_input_data(img_path,
+ objects_or_places_flag):
+ """Prepares the raw images for the EMOTIC model.
+
+ # Arguments
+ body_path: Path to body only image file.
+ image_path: Path to entire image file.
+
+ # Returns
+ The two processed images
+ """
+
+ body_img = image.load_img(img_path, target_size=(224, 224))
+ x1 = image.img_to_array(body_img)
+ x1 = np.expand_dims(x1, axis=0)
+
+ if objects_or_places_flag == 'objects':
+ x1 = imagenet_preprocess_input(x1)
+
+ elif objects_or_places_flag == 'places':
+ x1 = places_preprocess_input(x1)
+
+
+ return x1
\ No newline at end of file
diff --git a/applications/hra_vgg16.py b/applications/hra_vgg16.py
new file mode 100644
index 0000000..de538c0
--- /dev/null
+++ b/applications/hra_vgg16.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+"""2 clss Human Rights Archive (HRA) VGG16 model for Keras
+
+"""
+
+from __future__ import division, print_function
+import os
+
+import warnings
+import numpy as np
+
+from keras import backend as K
+from keras.utils.data_utils import get_file
+from keras.layers import Input
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.layers import GlobalAveragePooling2D
+from keras.applications.vgg16 import VGG16
+from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.engine.topology import get_source_inputs
+from keras.optimizers import SGD
+from applications.hra_utils import _obtain_weights_path as owp
+
+from applications.hra_utils import _obtain_train_mode
+
+pre_trained_model = 'vgg16'
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_FEATURE_EXTRACTION_PATH, CL_FEATURE_EXTRACTION_FNAME = owp('cl', pre_trained_model, None, True)
+DP_WEIGHTS_FEATURE_EXTRACTION_PATH, DP_FEATURE_EXTRACTION_FNAME = owp('dp', pre_trained_model, None, True)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_PATH_ONE_CONV_LAYER, CL_PATH_ONE_CONV_LAYER_FNAME = owp('cl', pre_trained_model, 1, True)
+CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('cl', pre_trained_model, 1, False)
+
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS, CL_PATH_TWO_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 2, True)
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 2, False)
+
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS, CL_PATH_THREE_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 3, True)
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+DP_WEIGHTS_PATH_ONE_CONV_LAYER, DP_PATH_ONE_CONV_LAYER_FNAME = owp('dp', pre_trained_model, 1, True)
+DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('dp', pre_trained_model, 1, False)
+
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS, DP_PATH_TWO_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 2, True)
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 2, False)
+
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS, DP_PATH_THREE_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 3, True)
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+
+def HRA_VGG16(include_top=True, weights='HRA',
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=None,
+ first_phase_trained_weights = None,
+ violation_class = 'cl',
+ verbose=0):
+ """Instantiates the VGG16 architecture fine-tuned (2 steps) on Human Rights Archive dataset.
+
+ Optionally loads weights pre-trained on the 2 class version of Human Rights Archive Database.
+
+ # Arguments
+ include_top: whether to include the 3 fully-connected
+ layers at the top of the network.
+ weights: one of `None` (random initialization),
+ 'HRA' (pre-training on Human Rights Archive),
+ or the path to the weights file to be loaded.
+ input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+ to use as image input for the model.
+ input_shape: optional shape tuple, only to be specified
+ if `include_top` is False (otherwise the input shape
+ has to be `(224, 224, 3)` (with `channels_last` data format)
+ or `(3, 224, 224)` (with `channels_first` data format).
+ It should have exactly 3 input channels,
+ and width and height should be no smaller than 48.
+ E.g. `(200, 200, 3)` would be one valid value.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional
+ layers to fine-tune. One of `1` (2,499,360 trainable params), `2` (4,859,168 trainable params) or `3` (7,218,976 trainable params).
+ first_phase_trained_weights: Weights of an already trained Keras model instance.
+ Only relevant when using `fine_tuning` as train_mode after `feature_extraction` weights have been saved.
+ violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ verbose: Integer. 0, or 1. Verbosity mode. 0 = silent, 1 = model summary and weights info.
+
+ # Returns
+ A Keras model instance.
+
+ # Raises
+ ValueError: in case of invalid argument for `weights`, `violation_class`, `nb_of_conv_layers_to_fine_tune` or invalid input shape
+ """
+ if not (weights in {'HRA', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `HRA` '
+ '(pre-training on Human Rights Archive two-class), '
+ 'or the path to the weights file to be loaded.')
+
+
+ if not (violation_class in {'cl', 'dp'}):
+ raise ValueError("The `violation_class` argument should be either "
+ "`cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation') "
+ "'or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')")
+
+ if nb_of_conv_layers_to_fine_tune is None and include_top is False:
+ raise ValueError('Setting the `include_top` argument as false '
+ 'is only relevant when the `nb_of_conv_layers_to_fine_tune` argument is not None (feature extraction), '
+ 'otherwise the returned model would be exactly the default '
+ 'keras-applications model.')
+
+ if weights == 'HRA' and first_phase_trained_weights is not None:
+ raise ValueError('Setting the `first_phase_trained_weights` argument as the path to the weights file '
+ 'obtained from utilising feature_extraction '
+ 'is only relevant when the `weights` argument is `None`. '
+ 'If the `weights` argument is `HRA`, it means the model has already been trained on HRA dataset '
+ 'and there is no need to provide a path to the weights file (saved from feature_extraction) to be loaded.')
+
+ if not (nb_of_conv_layers_to_fine_tune in {1, 2, 3, None}):
+ raise ValueError('The `nb_of_conv_layers_to_fine_tune` argument should be either '
+ '`None` (indicates feature extraction mode), '
+ '`1`, `2` or `3`. '
+                         'More than 3 conv. layers are not supported because the more parameters we are training, '
+ 'the more we are at risk of overfitting.')
+
+ cache_subdir = 'AbuseNet'
+
+ mode = _obtain_train_mode(nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ # Determine proper input shape
+ input_shape = _obtain_input_shape(input_shape,
+ default_size=224,
+ min_size=48,
+ data_format=K.image_data_format(),
+ require_flatten=include_top,
+ weights=weights)
+
+ if input_tensor is None:
+ img_input = Input(shape=input_shape)
+ else:
+ if not K.is_keras_tensor(input_tensor):
+ img_input = Input(tensor=input_tensor, shape=input_shape)
+ else:
+ img_input = input_tensor
+
+ # Ensure that the model takes into account any potential predecessors of `input_tensor`.
+ if input_tensor is not None:
+ inputs = get_source_inputs(input_tensor)
+ else:
+ inputs = img_input
+
+
+ # create the base pre-trained model
+ base_model = VGG16(weights='imagenet', include_top=False, input_tensor=img_input)
+ x = base_model.output
+
+ # Classification block - build a classifier model to put on top of the convolutional model
+ if include_top:
+
+ # add a global spatial pooling layer (which seems to have the best performance)
+ x = GlobalAveragePooling2D(name='GAP')(x)
+
+ # add a fully-connected layer
+ x = Dense(256, activation='relu', name='FC1')(x)
+
+ # When random init is enabled, we want to include Dropout,
+ # otherwise when loading a pre-trained HRA model we want to omit that layer,
+ # so the visualisations are done properly (there is an issue if it is included)
+ if weights is None:
+ x = Dropout(0.5,name='DROPOUT')(x)
+        # and a softmax output layer for the 2 HRA classes
+ x = Dense(2, activation='softmax', name='PREDICTIONS')(x)
+
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-VGG16')
+
+ else:
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-VGG16-NO-TOP')
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ return model
+
+
+
+
+ if mode == 'feature_extraction':
+
+ print('[INFO] Feature extraction mode. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in base_model.layers:
+ layer.trainable = False
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+
+ elif mode == 'fine_tuning':
+
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last one (1) conv. layer. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:17]:
+ layer.trainable = False
+ for layer in model.layers[17:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last two (2) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:16]:
+ layer.trainable = False
+ for layer in model.layers[16:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last three (3) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:15]:
+ layer.trainable = False
+ for layer in model.layers[15:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ if verbose == 1:
+ model.summary()
+
+ # load weights
+ if weights == 'HRA':
+
+ # Child labour
+ if violation_class =='cl':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(CL_FEATURE_EXTRACTION_FNAME,
+ CL_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ # Displaced populations
+ elif violation_class == 'dp':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(DP_FEATURE_EXTRACTION_FNAME,
+ DP_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+
+ model.load_weights(weights_path)
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
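+
+
+# Minimal usage sketch (illustrative only; downloads the released HRA weights on
+# the first call via `get_file`):
+#
+#     from applications.hra_vgg16 import HRA_VGG16
+#
+#     model = HRA_VGG16(weights='HRA', violation_class='dp',
+#                       nb_of_conv_layers_to_fine_tune=1, verbose=1)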
diff --git a/applications/hra_vgg16_places365.py b/applications/hra_vgg16_places365.py
new file mode 100644
index 0000000..f020bab
--- /dev/null
+++ b/applications/hra_vgg16_places365.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+"""2 clss Human Rights Archive (HRA) VGG16-Places365 model for Keras
+
+"""
+
+from __future__ import division, print_function
+import os
+
+import warnings
+import numpy as np
+
+from keras import backend as K
+from keras.utils.data_utils import get_file
+from keras.layers import Input
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.layers import GlobalAveragePooling2D
+from applications.vgg16_places_365 import VGG16_Places365
+from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.engine.topology import get_source_inputs
+from keras.optimizers import SGD
+from applications.hra_utils import _obtain_weights_path as owp
+
+from applications.hra_utils import _obtain_train_mode
+
+pre_trained_model = 'vgg16_places365'
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_FEATURE_EXTRACTION_PATH, CL_FEATURE_EXTRACTION_FNAME = owp('cl', pre_trained_model, None, True)
+DP_WEIGHTS_FEATURE_EXTRACTION_PATH, DP_FEATURE_EXTRACTION_FNAME = owp('dp', pre_trained_model, None, True)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_PATH_ONE_CONV_LAYER, CL_PATH_ONE_CONV_LAYER_FNAME = owp('cl', pre_trained_model, 1, True)
+CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('cl', pre_trained_model, 1, False)
+
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS, CL_PATH_TWO_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 2, True)
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 2, False)
+
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS, CL_PATH_THREE_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 3, True)
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+DP_WEIGHTS_PATH_ONE_CONV_LAYER, DP_PATH_ONE_CONV_LAYER_FNAME = owp('dp', pre_trained_model, 1, True)
+DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('dp', pre_trained_model, 1, False)
+
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS, DP_PATH_TWO_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 2, True)
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 2, False)
+
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS, DP_PATH_THREE_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 3, True)
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+
+def HRA_VGG16_Places365(include_top=True, weights='HRA',
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=None,
+ first_phase_trained_weights=None,
+ violation_class='cl',
+ verbose=0):
+ """Instantiates the VGG16-Places365 architecture fine-tuned (2 steps) on Human Rights Archive dataset.
+
+ Optionally loads weights pre-trained on the 2 class version of Human Rights Archive Database.
+
+ # Arguments
+ include_top: whether to include the 3 fully-connected
+ layers at the top of the network.
+ weights: one of `None` (random initialization),
+ 'HRA' (pre-training on Human Rights Archive),
+ or the path to the weights file to be loaded.
+ input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+ to use as image input for the model.
+ input_shape: optional shape tuple, only to be specified
+ if `include_top` is False (otherwise the input shape
+ has to be `(224, 224, 3)` (with `channels_last` data format)
+ or `(3, 224, 224)` (with `channels_first` data format).
+ It should have exactly 3 input channels,
+ and width and height should be no smaller than 48.
+ E.g. `(200, 200, 3)` would be one valid value.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional
+ layers to fine-tune. One of `1` (2,499,360 trainable params), `2` (4,859,168 trainable params) or `3` (7,218,976 trainable params).
+ first_phase_trained_weights: Weights of an already trained Keras model instance.
+ Only relevant when using `fine_tuning` as train_mode after `feature_extraction` weights have been saved.
+ violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ verbose: Integer. 0, or 1. Verbosity mode. 0 = silent, 1 = model summary and weights info.
+
+ # Returns
+ A Keras model instance.
+
+ # Raises
+ ValueError: in case of invalid argument for `weights`, `violation_class`, `nb_of_conv_layers_to_fine_tune` or invalid input shape
+ """
+ if not (weights in {'HRA', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `HRA` '
+ '(pre-training on Human Rights Archive two-class), '
+ 'or the path to the weights file to be loaded.')
+
+
+ if not (violation_class in {'cl', 'dp'}):
+ raise ValueError("The `violation_class` argument should be either "
+ "`cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation') "
+ "'or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')")
+
+ if nb_of_conv_layers_to_fine_tune is None and include_top is False:
+ raise ValueError('Setting the `include_top` argument as false '
+ 'is only relevant when the `nb_of_conv_layers_to_fine_tune` argument is not None (feature extraction), '
+ 'otherwise the returned model would be exactly the default '
+ 'keras-applications model.')
+
+ if weights == 'HRA' and first_phase_trained_weights is not None:
+ raise ValueError('Setting the `first_phase_trained_weights` argument as the path to the weights file '
+ 'obtained from utilising feature_extraction '
+ 'is only relevant when the `weights` argument is `None`. '
+ 'If the `weights` argument is `HRA`, it means the model has already been trained on HRA dataset '
+ 'and there is no need to provide a path to the weights file (saved from feature_extraction) to be loaded.')
+
+ if not (nb_of_conv_layers_to_fine_tune in {1, 2, 3, None}):
+ raise ValueError('The `nb_of_conv_layers_to_fine_tune` argument should be either '
+ '`None` (indicates feature extraction mode), '
+ '`1`, `2` or `3`. '
+                         'More than 3 conv. layers are not supported because the more parameters we are training, '
+ 'the more we are at risk of overfitting.')
+
+ cache_subdir = 'AbuseNet'
+
+ mode = _obtain_train_mode(nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ # Determine proper input shape
+ input_shape = _obtain_input_shape(input_shape,
+ default_size=224,
+ min_size=48,
+ data_format=K.image_data_format(),
+ require_flatten=include_top,
+ weights=weights)
+
+ if input_tensor is None:
+ img_input = Input(shape=input_shape)
+ else:
+ if not K.is_keras_tensor(input_tensor):
+ img_input = Input(tensor=input_tensor, shape=input_shape)
+ else:
+ img_input = input_tensor
+
+ # Ensure that the model takes into account any potential predecessors of `input_tensor`.
+ if input_tensor is not None:
+ inputs = get_source_inputs(input_tensor)
+ else:
+ inputs = img_input
+
+
+ # create the base pre-trained model
+ base_model = VGG16_Places365(weights='places', include_top=False, input_tensor=img_input)
+ x = base_model.output
+
+ # Classification block - build a classifier model to put on top of the convolutional model
+ if include_top:
+
+ # add a global spatial pooling layer (which seems to have the best performance)
+ x = GlobalAveragePooling2D(name='GAP')(x)
+
+ # add a fully-connected layer
+ x = Dense(256, activation='relu', name='FC1')(x)
+
+ # When random init is enabled, we want to include Dropout,
+ # otherwise when loading a pre-trained HRA model we want to omit that layer,
+ # so the visualisations are done properly (there is an issue if it is included)
+ if weights is None:
+ x = Dropout(0.5,name='DROPOUT')(x)
+        # and a softmax output layer for the 2 HRA classes
+ x = Dense(2, activation='softmax', name='PREDICTIONS')(x)
+
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-VGG16_Places365')
+
+ else:
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-VGG16_Places365-NO-TOP')
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ return model
+
+
+
+
+ if mode == 'feature_extraction':
+
+ print('[INFO] Feature extraction mode. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in base_model.layers:
+ layer.trainable = False
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+
+ elif mode == 'fine_tuning':
+
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last one (1) conv. layer. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:17]:
+ layer.trainable = False
+ for layer in model.layers[17:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last two (2) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:16]:
+ layer.trainable = False
+ for layer in model.layers[16:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last three (3) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:15]:
+ layer.trainable = False
+ for layer in model.layers[15:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ if verbose == 1:
+ model.summary()
+
+ # load weights
+ if weights == 'HRA':
+
+ # Child labour
+ if violation_class == 'cl':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(CL_FEATURE_EXTRACTION_FNAME,
+ CL_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ # Displaced populations
+ elif violation_class == 'dp':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(DP_FEATURE_EXTRACTION_FNAME,
+ DP_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+
+ model.load_weights(weights_path)
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
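+
+
+# Minimal usage sketch (illustrative only):
+#
+#     from applications.hra_vgg16_places365 import HRA_VGG16_Places365
+#
+#     scene_model = HRA_VGG16_Places365(weights='HRA', violation_class='cl',
+#                                       nb_of_conv_layers_to_fine_tune=2)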
diff --git a/applications/hra_vgg19.py b/applications/hra_vgg19.py
new file mode 100644
index 0000000..6039023
--- /dev/null
+++ b/applications/hra_vgg19.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+"""2 clss Human Rights Archive (HRA) VGG19 model for Keras
+
+"""
+
+from __future__ import division, print_function
+import os
+
+import warnings
+import numpy as np
+
+from keras import backend as K
+from keras.utils.data_utils import get_file
+from keras.layers import Input
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.layers import GlobalAveragePooling2D
+from keras.applications.vgg19 import VGG19
+from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.engine.topology import get_source_inputs
+from keras.optimizers import SGD
+from applications.hra_utils import _obtain_weights_path as owp
+
+from applications.hra_utils import _obtain_train_mode
+
+pre_trained_model = 'vgg19'
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_FEATURE_EXTRACTION_PATH, CL_FEATURE_EXTRACTION_FNAME = owp('cl', pre_trained_model, None, True)
+DP_WEIGHTS_FEATURE_EXTRACTION_PATH, DP_FEATURE_EXTRACTION_FNAME = owp('dp', pre_trained_model, None, True)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+CL_WEIGHTS_PATH_ONE_CONV_LAYER, CL_PATH_ONE_CONV_LAYER_FNAME = owp('cl', pre_trained_model, 1, True)
+CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('cl', pre_trained_model, 1, False)
+
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS, CL_PATH_TWO_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 2, True)
+CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 2, False)
+
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS, CL_PATH_THREE_CONV_LAYERS_FNAME = owp('cl', pre_trained_model, 3, True)
+CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('cl', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+DP_WEIGHTS_PATH_ONE_CONV_LAYER, DP_PATH_ONE_CONV_LAYER_FNAME = owp('dp', pre_trained_model, 1, True)
+DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP, DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME = owp('dp', pre_trained_model, 1, False)
+
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS, DP_PATH_TWO_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 2, True)
+DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP, DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 2, False)
+
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS, DP_PATH_THREE_CONV_LAYERS_FNAME = owp('dp', pre_trained_model, 3, True)
+DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP, DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME = owp('dp', pre_trained_model, 3, False)
+
+# ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------ ====================== ------------------------
+
+def HRA_VGG19(include_top=True, weights='HRA',
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=None,
+ first_phase_trained_weights = None,
+ violation_class = 'cl',
+ verbose=0):
+ """Instantiates the VGG19 architecture fine-tuned (2 steps) on Human Rights Archive dataset.
+
+ Optionally loads weights pre-trained on the 2 class version of Human Rights Archive Database.
+
+ # Arguments
+ include_top: whether to include the 3 fully-connected
+ layers at the top of the network.
+ weights: one of `None` (random initialization),
+ 'HRA' (pre-training on Human Rights Archive),
+ or the path to the weights file to be loaded.
+ input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+ to use as image input for the model.
+ input_shape: optional shape tuple, only to be specified
+ if `include_top` is False (otherwise the input shape
+ has to be `(224, 224, 3)` (with `channels_last` data format)
+ or `(3, 224, 224)` (with `channels_first` data format).
+ It should have exactly 3 input channels,
+ and width and height should be no smaller than 48.
+ E.g. `(200, 200, 3)` would be one valid value.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional
+ layers to fine-tune. One of `1` (2,359,808 trainable params), `2` (4,719,616 trainable params) or `3` (7,079,424 trainable params).
+ first_phase_trained_weights: Weights of an already trained Keras model instance.
+ Only relevant when using `fine_tuning` as train_mode after `feature_extraction` weights have been saved.
+ violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ verbose: Integer. 0, or 1. Verbosity mode. 0 = silent, 1 = model summary and weights info.
+
+ # Returns
+ A Keras model instance.
+
+ # Raises
+ ValueError: in case of invalid argument for `weights`, `violation_class`, `nb_of_conv_layers_to_fine_tune` or invalid input shape
+ """
+ if not (weights in {'HRA', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `HRA` '
+ '(pre-training on Human Rights Archive two-class), '
+ 'or the path to the weights file to be loaded.')
+
+
+ if not (violation_class in {'cl', 'dp'}):
+ raise ValueError("The `violation_class` argument should be either "
+ "`cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation') "
+ "'or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')")
+
+ if nb_of_conv_layers_to_fine_tune is None and include_top is False:
+ raise ValueError('Setting the `include_top` argument as false '
+ 'is only relevant when the `nb_of_conv_layers_to_fine_tune` argument is not None (feature extraction), '
+ 'otherwise the returned model would be exactly the default '
+ 'keras-applications model.')
+
+ if weights == 'HRA' and first_phase_trained_weights is not None:
+ raise ValueError('Setting the `first_phase_trained_weights` argument as the path to the weights file '
+ 'obtained from utilising feature_extraction '
+ 'is only relevant when the `weights` argument is `None`. '
+ 'If the `weights` argument is `HRA`, it means the model has already been trained on HRA dataset '
+ 'and there is no need to provide a path to the weights file (saved from feature_extraction) to be loaded.')
+
+ if not (nb_of_conv_layers_to_fine_tune in {1, 2, 3, None}):
+ raise ValueError('The `nb_of_conv_layers_to_fine_tune` argument should be either '
+ '`None` (indicates feature extraction mode), '
+ '`1`, `2` or `3`. '
+                         'More than 3 conv. layers are not supported because the more parameters we are training, '
+ 'the more we are at risk of overfitting.')
+
+ cache_subdir = 'AbuseNet'
+
+ mode = _obtain_train_mode(nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ # Determine proper input shape
+ input_shape = _obtain_input_shape(input_shape,
+ default_size=224,
+ min_size=48,
+ data_format=K.image_data_format(),
+ require_flatten=include_top,
+ weights=weights)
+
+ if input_tensor is None:
+ img_input = Input(shape=input_shape)
+ else:
+ if not K.is_keras_tensor(input_tensor):
+ img_input = Input(tensor=input_tensor, shape=input_shape)
+ else:
+ img_input = input_tensor
+
+ # Ensure that the model takes into account any potential predecessors of `input_tensor`.
+ if input_tensor is not None:
+ inputs = get_source_inputs(input_tensor)
+ else:
+ inputs = img_input
+
+
+ # create the base pre-trained model
+ base_model = VGG19(weights='imagenet', include_top=False, input_tensor=img_input)
+ x = base_model.output
+
+ # Classification block - build a classifier model to put on top of the convolutional model
+ if include_top:
+
+ # add a global spatial pooling layer (which seems to have the best performance)
+ x = GlobalAveragePooling2D(name='GAP')(x)
+
+ # add a fully-connected layer
+ x = Dense(256, activation='relu', name='FC1')(x)
+
+ # Include Dropout only when training from random initialisation;
+ # when loading a pre-trained HRA model the layer is omitted so that
+ # the visualisations are produced properly (there is an issue if it is included)
+ if weights is None:
+ x = Dropout(0.5,name='DROPOUT')(x)
+ # and a logistic layer for the two HRA classes
+ x = Dense(2, activation='softmax', name='PREDICTIONS')(x)
+
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-VGG19')
+
+ else:
+ model = Model(inputs=inputs, outputs=x, name='HRA-2CLASS-VGG19-NO-TOP')
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ return model
+
+
+
+
+ if mode == 'feature_extraction':
+
+ print('[INFO] Feature extraction mode. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in base_model.layers:
+ layer.trainable = False
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after freezing the conv. base of the original pre-trained convnet: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+
+ elif mode == 'fine_tuning':
+
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last one (1) conv. layer. \n')
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:20]:
+ layer.trainable = False
+ for layer in model.layers[20:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last conv. layer of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last two (2) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:19]:
+ layer.trainable = False
+ for layer in model.layers[19:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last two (2) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ # Uncomment for extra verbosity
+ # print('[INFO] Fine-tuning of the last three (3) conv. layers. \n')
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights before unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ for layer in model.layers[:18]:
+ layer.trainable = False
+ for layer in model.layers[18:]:
+ layer.trainable = True
+
+ if verbose == 1:
+ print(
+ '[INFO] Number of trainable weights after unfreezing the last three (3) conv. layers of the model with the retrained classifier: '
+ '' + str(len(model.trainable_weights)))
+
+ model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+
+ if verbose == 1:
+ model.summary()
+
+ # load weights
+ if weights == 'HRA':
+
+ # Child labour
+ if violation_class =='cl':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(CL_FEATURE_EXTRACTION_FNAME,
+ CL_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(CL_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(CL_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(CL_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ CL_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ # Displaced populations
+ elif violation_class == 'dp':
+ if include_top:
+ if mode == 'feature_extraction':
+ weights_path = get_file(DP_FEATURE_EXTRACTION_FNAME,
+ DP_WEIGHTS_FEATURE_EXTRACTION_PATH,
+ cache_subdir=cache_subdir)
+
+ elif mode == 'fine_tuning':
+
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS,
+ cache_subdir=cache_subdir)
+
+ # no top
+ else:
+ if nb_of_conv_layers_to_fine_tune == 1:
+ weights_path = get_file(DP_PATH_ONE_CONV_LAYER_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_ONE_CONV_LAYER_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 2:
+ weights_path = get_file(DP_PATH_TWO_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_TWO_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+ elif nb_of_conv_layers_to_fine_tune == 3:
+ weights_path = get_file(DP_PATH_THREE_CONV_LAYERS_NO_TOP_FNAME,
+ DP_WEIGHTS_PATH_THREE_CONV_LAYERS_NO_TOP,
+ cache_subdir=cache_subdir)
+
+ model.load_weights(weights_path)
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
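+
+
+# A minimal usage sketch (kept as comments so nothing runs on import), assuming this
+# builder is exposed as `HRA_VGG19` as imported elsewhere in this changeset;
+# the argument values simply satisfy the checks performed above:
+#
+#   model = HRA_VGG19(weights='HRA',
+#                     violation_class='dp',
+#                     nb_of_conv_layers_to_fine_tune=1,
+#                     include_top=True,
+#                     verbose=1)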
diff --git a/applications/vgg16_places_365.py b/applications/vgg16_places_365.py
new file mode 100644
index 0000000..7fce8fa
--- /dev/null
+++ b/applications/vgg16_places_365.py
@@ -0,0 +1,241 @@
+# -*- coding: utf-8 -*-
+'''VGG16-places365 model for Keras
+
+# Reference:
+- [Places: A 10 million Image Database for Scene Recognition](http://places2.csail.mit.edu/PAMI_places.pdf)
+'''
+
+from __future__ import division, print_function
+import os
+
+import warnings
+import numpy as np
+
+from keras import backend as K
+from keras.layers import Input
+from keras.layers.core import Activation, Dense, Flatten
+from keras.layers.pooling import MaxPooling2D
+from keras.models import Model
+from keras.layers import Conv2D
+from keras.regularizers import l2
+from keras.layers.core import Dropout
+from keras.layers import GlobalAveragePooling2D
+from keras.layers import GlobalMaxPooling2D
+from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.engine.topology import get_source_inputs
+from keras.utils.data_utils import get_file
+from keras.utils import layer_utils
+from keras.preprocessing import image
+from keras.applications.imagenet_utils import preprocess_input
+
+WEIGHTS_PATH = 'https://github.com/GKalliatakis/Keras-VGG16-places365/releases/download/v1.0/vgg16-places365_weights_tf_dim_ordering_tf_kernels.h5'
+WEIGHTS_PATH_NO_TOP = 'https://github.com/GKalliatakis/Keras-VGG16-places365/releases/download/v1.0/vgg16-places365_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+
+def VGG16_Places365(include_top=True, weights='places',
+ input_tensor=None, input_shape=None,
+ pooling=None,
+ classes=365):
+ """Instantiates the VGG16-places365 architecture.
+
+ Optionally loads weights pre-trained
+ on Places. Note that when using TensorFlow,
+ for best performance you should set
+ `image_data_format="channels_last"` in your Keras config
+ at ~/.keras/keras.json.
+
+ The model and the weights are compatible with both
+ TensorFlow and Theano. The data format
+ convention used by the model is the one
+ specified in your Keras config file.
+
+ # Arguments
+ include_top: whether to include the 3 fully-connected
+ layers at the top of the network.
+ weights: one of `None` (random initialization),
+ 'places' (pre-training on Places),
+ or the path to the weights file to be loaded.
+ input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+ to use as image input for the model.
+ input_shape: optional shape tuple, only to be specified
+ if `include_top` is False (otherwise the input shape
+ has to be `(224, 224, 3)` (with `channels_last` data format)
+ or `(3, 224, 224)` (with `channels_first` data format).
+ It should have exactly 3 input channels,
+ and width and height should be no smaller than 48.
+ E.g. `(200, 200, 3)` would be one valid value.
+ pooling: Optional pooling mode for feature extraction
+ when `include_top` is `False`.
+ - `None` means that the output of the model will be
+ the 4D tensor output of the
+ last convolutional layer.
+ - `avg` means that global average pooling
+ will be applied to the output of the
+ last convolutional layer, and thus
+ the output of the model will be a 2D tensor.
+ - `max` means that global max pooling will
+ be applied.
+ classes: optional number of classes to classify images
+ into, only to be specified if `include_top` is True, and
+ if no `weights` argument is specified.
+ # Returns
+ A Keras model instance.
+ # Raises
+ ValueError: in case of invalid argument for `weights`, or invalid input shape
+ """
+ if not (weights in {'places', None} or os.path.exists(weights)):
+ raise ValueError('The `weights` argument should be either '
+ '`None` (random initialization), `places` '
+ '(pre-training on Places), '
+ 'or the path to the weights file to be loaded.')
+
+ if weights == 'places' and include_top and classes != 365:
+ raise ValueError('If using `weights` as places with `include_top`'
+ ' as true, `classes` should be 365')
+
+ # Determine proper input shape
+ input_shape = _obtain_input_shape(input_shape,
+ default_size=224,
+ min_size=48,
+ data_format=K.image_data_format(),
+ require_flatten=include_top)
+
+ if input_tensor is None:
+ img_input = Input(shape=input_shape)
+ else:
+ if not K.is_keras_tensor(input_tensor):
+ img_input = Input(tensor=input_tensor, shape=input_shape)
+ else:
+ img_input = input_tensor
+
+ # Block 1
+ x = Conv2D(filters=64, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block1_conv1')(img_input)
+
+ x = Conv2D(filters=64, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block1_conv2')(x)
+
+ x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block1_pool", padding='valid')(x)
+
+ # Block 2
+ x = Conv2D(filters=128, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block2_conv1')(x)
+
+ x = Conv2D(filters=128, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block2_conv2')(x)
+
+ x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block2_pool", padding='valid')(x)
+
+ # Block 3
+ x = Conv2D(filters=256, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block3_conv1')(x)
+
+ x = Conv2D(filters=256, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block3_conv2')(x)
+
+ x = Conv2D(filters=256, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block3_conv3')(x)
+
+ x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block3_pool", padding='valid')(x)
+
+ # Block 4
+ x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block4_conv1')(x)
+
+ x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block4_conv2')(x)
+
+ x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block4_conv3')(x)
+
+ x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block4_pool", padding='valid')(x)
+
+ # Block 5
+ x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block5_conv1')(x)
+
+ x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block5_conv2')(x)
+
+ x = Conv2D(filters=512, kernel_size=3, strides=(1, 1), padding='same',
+ kernel_regularizer=l2(0.0002),
+ activation='relu', name='block5_conv3')(x)
+
+ x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name="block5_pool", padding='valid')(x)
+
+ if include_top:
+ # Classification block
+ x = Flatten(name='flatten')(x)
+ x = Dense(4096, activation='relu', name='fc1')(x)
+ x = Dropout(0.5, name='drop_fc1')(x)
+
+ x = Dense(4096, activation='relu', name='fc2')(x)
+ x = Dropout(0.5, name='drop_fc2')(x)
+
+ x = Dense(classes, activation='softmax', name="predictions")(x)
+
+ else:
+ if pooling == 'avg':
+ x = GlobalAveragePooling2D()(x)
+ elif pooling == 'max':
+ x = GlobalMaxPooling2D()(x)
+
+ # Ensure that the model takes into account
+ # any potential predecessors of `input_tensor`.
+ if input_tensor is not None:
+ inputs = get_source_inputs(input_tensor)
+ else:
+ inputs = img_input
+
+ # Create model.
+ model = Model(inputs, x, name='vgg16-places365')
+
+ # load weights
+ if weights == 'places':
+ if include_top:
+ weights_path = get_file('vgg16-places365_weights_tf_dim_ordering_tf_kernels.h5',
+ WEIGHTS_PATH,
+ cache_subdir='models')
+ else:
+ weights_path = get_file('vgg16-places365_weights_tf_dim_ordering_tf_kernels_notop.h5',
+ WEIGHTS_PATH_NO_TOP,
+ cache_subdir='models')
+
+ model.load_weights(weights_path)
+
+ if K.backend() == 'theano':
+ layer_utils.convert_all_kernels_in_model(model)
+
+ if K.image_data_format() == 'channels_first':
+ if include_top:
+ maxpool = model.get_layer(name='block5_pool')
+ shape = maxpool.output_shape[1:]
+ dense = model.get_layer(name='fc1')
+ layer_utils.convert_dense_weights_data_format(dense, shape, 'channels_first')
+
+ if K.backend() == 'tensorflow':
+ warnings.warn('You are using the TensorFlow backend, yet you '
+ 'are using the Theano '
+ 'image data format convention '
+ '(`image_data_format="channels_first"`). '
+ 'For best performance, set '
+ '`image_data_format="channels_last"` in '
+ 'your Keras config '
+ 'at ~/.keras/keras.json.')
+
+ elif weights is not None:
+ model.load_weights(weights)
+
+ return model
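+
+
+# A minimal usage sketch (kept as comments so the module stays import-safe);
+# the image path below is illustrative only:
+#
+#   model = VGG16_Places365(weights='places')
+#   img = image.load_img('restaurant.jpg', target_size=(224, 224))
+#   x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
+#   preds = model.predict(x)[0]
+#   top_5_scene_indices = np.argsort(preds)[::-1][0:5]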
diff --git a/datasets/__init__.py b/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/displacenet_evaluator.py b/displacenet_evaluator.py
new file mode 100644
index 0000000..c33d46c
--- /dev/null
+++ b/displacenet_evaluator.py
@@ -0,0 +1,142 @@
+from __future__ import print_function
+import os
+
+
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix, average_precision_score
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+class DisplaceNetBaseEvaluator(object):
+ """Perfofmance metrics base class.
+ """
+
+
+ def __init__(self,
+ hra_model_backend_name,nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name,emotic_model_b_backend_name,emotic_model_c_backend_name,
+ violation_class,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ ):
+
+ self.hra_model_backend_name = hra_model_backend_name
+ self.nb_of_conv_layers_to_fine_tune = nb_of_conv_layers_to_fine_tune
+ self.emotic_model_a_backend_name = emotic_model_a_backend_name
+ self.emotic_model_b_backend_name = emotic_model_b_backend_name
+ self.emotic_model_c_backend_name = emotic_model_c_backend_name
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
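+ # Ground-truth labels for the fixed 100-image two-class test split:
+ # the first 50 images belong to the violation class (label 0) and the
+ # remaining 50 to the no-violation class (label 1), mirroring the sorted
+ # order of the test sub-directories and the categories_*.txt files.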
+ self.y_true = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+
+ def _obtain_y_pred(self,
+ prob_threshold=0.75):
+
+ y_pred = []
+ y_scores = []
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ print('Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+
+ preds = displaceNet_inference(img_path=final_img,
+ emotic_model_a_backend_name=self.emotic_model_a_backend_name,
+ emotic_model_b_backend_name=self.emotic_model_b_backend_name,
+ emotic_model_c_backend_name=self.emotic_model_c_backend_name,
+ hra_model_backend_name=self.hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers=self.nb_of_conv_layers_to_fine_tune,
+ violation_class=self.violation_class)
+
+
+ preds = preds[0]
+
+ y_pred.append(int(preds[0][0]))
+ y_scores.append(preds[0][2])
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ print ('`' + hra_class + '/' + raw_img + '` ===> `' +
+ top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(hra_class)
+
+ total_coverage_per = (coverage_count * 100.0) / self.total_nb_of_test_images
+
+ return y_pred, self.y_true, y_scores, total_coverage_per
+
+
+
+if __name__ == "__main__":
+
+ violation_class = 'cl'
+ hra_model_backend_name = 'VGG16'
+ nb_of_conv_layers_to_fine_tune = 1
+
+ emotic_model_a_backend_name = 'VGG19'
+ emotic_model_b_backend_name = 'VGG16'
+ emotic_model_c_backend_name = None
+
+ model_backend_name = 'VGG16'
+
+ # server
+ if violation_class == 'cl':
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test/ChildLabour'
+ elif violation_class =='dp':
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test/DisplacedPopulations'
+
+ # if violation_class == 'cl':
+ # main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test-mini/ChildLabour'
+ # elif violation_class =='dp':
+ # main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test-mini/DisplacedPopulations'
+
+ # ---------------------------------------------------- #
+
+
+
+
+
+ base_evaluator = DisplaceNetBaseEvaluator(hra_model_backend_name=hra_model_backend_name, nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name=emotic_model_a_backend_name,
+ emotic_model_b_backend_name=emotic_model_b_backend_name,
+ emotic_model_c_backend_name=emotic_model_c_backend_name,
+ violation_class=violation_class,
+ main_test_dir =main_test_dir,
+ )
+
+ y_pred, y_true, y_scores, total_coverage_per = base_evaluator._obtain_y_pred()
+
+ # print y_true
+ top1_acc = accuracy_score(y_true, y_pred)
+
+ AP = average_precision_score(y_true, y_scores, average='micro')
+
+
+ string = model_backend_name+'-'+violation_class+'-'+str(nb_of_conv_layers_to_fine_tune)+'layer(s)'
+
+ print('\n')
+ print( '============================= %s =============================' %string)
+ print(' Top-1 acc. => ' + str(top1_acc))
+ print(' Coverage => ' + str(total_coverage_per) + '%')
+ print(' Average Precision (AP) => ' + str(AP))
\ No newline at end of file
diff --git a/emotic_obtain_y_pred_unified.py b/emotic_obtain_y_pred_unified.py
new file mode 100644
index 0000000..24a7feb
--- /dev/null
+++ b/emotic_obtain_y_pred_unified.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+""" This is meant to run on the server in order to record the estimated target values of either a single classifier or ensemble of classifiers
+ for continuous emotion recognition in VAD space.
+"""
+
+from __future__ import print_function
+import argparse
+import pandas
+import numpy as np
+from utils.generic_utils import print_progress
+from applications.emotic_utils import prepare_input_data, _obtain_nb_classifiers,_obtain_ensembling_weights, \
+ _obtain_single_model_VAD, _obtain_two_models_ensembling_VAD, _obtain_three_models_ensembling_VAD
+
+
+def get_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_a", type = str,help = 'One of `VGG16`, `VGG19` or `ResNet50`')
+ parser.add_argument("--model_b", type=str, default= None, help='One of `VGG16`, `VGG19`, `ResNet50` or `None`')
+ parser.add_argument("--model_c", type=str, default= None, help='One of `VGG16`, `VGG19`, `ResNet50` or `None`')
+
+ args = parser.parse_args()
+ return args
+
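+# Typical invocation (a sketch; the chosen backbone names are illustrative):
+#   python emotic_obtain_y_pred_unified.py --model_a VGG19 --model_b VGG16
+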
+args = get_args()
+
+model_a_name = args.model_a
+model_b_name = args.model_b
+model_c_name = args.model_c
+
+nb_classifiers, numpy_name = _obtain_nb_classifiers(model_a_name=model_a_name,
+ model_b_name=model_b_name,
+ model_c_name=model_c_name)
+
+if nb_classifiers == 1:
+ model_a = _obtain_single_model_VAD(model_a_name)
+
+elif nb_classifiers == 2:
+ w_model_a, w_model_b = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_name,
+ model_b_name=model_b_name,
+ model_c_name=model_c_name)
+
+ model_a, model_b = _obtain_two_models_ensembling_VAD(model_a_name=model_a_name, model_b_name=model_b_name)
+
+elif nb_classifiers == 3:
+ w_model_a, w_model_b, w_model_c = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_name,
+ model_b_name=model_b_name,
+ model_c_name=model_c_name)
+
+ model_a, model_b, model_c = _obtain_three_models_ensembling_VAD(model_a_name=model_a_name,
+ model_b_name=model_b_name,
+ model_c_name=model_c_name)
+
+# counter to iterate through all csv entries
+field_number = 0
+
+final_list = []
+
+# server
+csv_file = pandas.read_csv('/home/gkallia/git/emotic-VAD-classification/dataset/test.csv')
+base_dir_of_cropped_imgs = '/home/gkallia/git/emotic-VAD-classification/dataset/raw_images/cropped_imgs/'
+base_dir_of_entire_imgs = '/home/gkallia/git/emotic-VAD-classification/dataset/raw_images/entire_imgs/'
+
+for entry in csv_file.filename:
+
+ print_progress(iteration=field_number, total=7280, prefix='Progress:', suffix='Complete')
+
+ person_img_path = base_dir_of_cropped_imgs + entry
+ entire_img_path = base_dir_of_entire_imgs + entry
+
+ x1, x2 = prepare_input_data(body_path = person_img_path,
+ image_path = entire_img_path)
+
+ if nb_classifiers == 1:
+
+ preds = model_a.predict([x1, x2])
+
+
+ elif nb_classifiers == 2:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.50 * (preds_model_a + preds_model_b)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b
+
+ elif nb_classifiers == 3:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+ preds_model_c = model_c.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None and w_model_c is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.33 * (preds_model_a + preds_model_b + preds_model_c)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b + w_model_c * preds_model_c
+
+ final_list.append(preds[0])
+
+
+ field_number += 1
+
+final_numpy_name = 'y_predicted/' + numpy_name + '_y_predicted.npy'
+
+
+np.save(final_numpy_name, final_list)
+
+
+print('\n')
+print('[INFO] NumPy array for estimated target values has been saved as `%s`' %final_numpy_name)
+
+
+
diff --git a/engine/README.md b/engine/README.md
new file mode 100644
index 0000000..57d054b
--- /dev/null
+++ b/engine/README.md
@@ -0,0 +1,23 @@
+## Engine
+
+The engine module contains the source code for the three branches of DisplaceNet:
+
+ 1. Object detection branch - detects all humans in an image
+ 2. Human-centric branch - for each detected **human**, conducts continuous emotion recognition in VAD space
+ and then estimates the overall dominance level that characterises the entire image
+ 3. Displaced people branch - labels the image as either _displaced people_ or _non-displaced people_ based on **image classification** and **dominance level**
+
+
+
+
+### Object detection branch
+Contains the source code for two popular object detectors: RetinaNet and SSD.
+
+### Human-centric branch
+Contains the source code for conducting continuous emotion recognition in VAD space for each detected human,
+from their frame of reference, and for detecting the overall dominance level that characterises the entire image.
+
+### Displaced people branch
+Contains the source code for assigning a label to the input image based on image classification and overall dominance level.
+
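+The three branches are combined by the unified single-image entry point of this repository
+(`inference/displacenet_single_image_inference_unified.py`). A minimal sketch of calling it is shown below;
+the image path and backbone choices are illustrative only:
+
+```python
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+preds = displaceNet_inference(img_path='test_image.jpg',
+                              emotic_model_a_backend_name='VGG19',
+                              emotic_model_b_backend_name=None,
+                              emotic_model_c_backend_name=None,
+                              hra_model_backend_name='VGG16',
+                              nb_of_fine_tuned_conv_layers=1,
+                              violation_class='dp')
+print(preds)
+```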
diff --git a/engine/__init__.py b/engine/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/displaced_people_branch/__init__.py b/engine/displaced_people_branch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/displaced_people_branch/categories_HRA_2classCL.txt b/engine/displaced_people_branch/categories_HRA_2classCL.txt
new file mode 100644
index 0000000..05e5540
--- /dev/null
+++ b/engine/displaced_people_branch/categories_HRA_2classCL.txt
@@ -0,0 +1,2 @@
+/c/child_labour 0
+/n/no_child_labour 1
diff --git a/engine/displaced_people_branch/categories_HRA_2classDP.txt b/engine/displaced_people_branch/categories_HRA_2classDP.txt
new file mode 100644
index 0000000..b9daf53
--- /dev/null
+++ b/engine/displaced_people_branch/categories_HRA_2classDP.txt
@@ -0,0 +1,2 @@
+/d/displaced_populations 0
+/n/no_displaced_populations 1
diff --git a/engine/displaced_people_branch/single_image_inference_hra_2class.py b/engine/displaced_people_branch/single_image_inference_hra_2class.py
new file mode 100644
index 0000000..5cdd62d
--- /dev/null
+++ b/engine/displaced_people_branch/single_image_inference_hra_2class.py
@@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+'''Single-image inference utilities for the displaced people branch,
+based on the two-class Human Rights Archive (HRA) models.
+'''
+from __future__ import print_function
+import os
+
+import numpy as np
+
+from keras.preprocessing import image
+from applications.hra_resnet50 import HRA_ResNet50
+from applications.hra_vgg16 import HRA_VGG16
+from applications.hra_vgg19 import HRA_VGG19
+from applications.hra_vgg16_places365 import HRA_VGG16_Places365
+from applications.hra_utils import plot_preds
+from applications.hra_utils import prepare_input_data
+
+
+def single_img_HRA_inference(img_path,
+ violation_class,
+ model_backend_name,
+ nb_of_conv_layers_to_fine_tune):
+
+ """Performs single image inference.
+
+ # Arguments
+ img_path: Path to image file
+ violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ model_backend_name: One of `VGG16`, `VGG19`, `ResNet50` or `VGG16_Places365`.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional layers to fine-tune.
+ # Returns
+ The raw predictions, a text string to overlay on the image (top-1 label and its probability)
+ and the top-1 predicted label.
+
+ """
+ (head, tail) = os.path.split(img_path)
+ filename_only = os.path.splitext(tail)[0]
+
+ if model_backend_name == 'VGG16':
+ model = HRA_VGG16(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ print("[INFO] Loading and preprocessing image...")
+
+ x = prepare_input_data(img_path = img_path, objects_or_places_flag = 'objects')
+
+ elif model_backend_name == 'VGG19':
+ model = HRA_VGG19(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ print("[INFO] Loading and preprocessing image...")
+
+ x = prepare_input_data(img_path=img_path, objects_or_places_flag='objects')
+
+ elif model_backend_name == 'ResNet50':
+ model = HRA_ResNet50(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ print("[INFO] Loading and preprocessing image...")
+
+ x = prepare_input_data(img_path=img_path, objects_or_places_flag='objects')
+
+ elif model_backend_name == 'VGG16_Places365':
+ model = HRA_VGG16_Places365(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ print("[INFO] Loading and preprocessing image...")
+
+ x = prepare_input_data(img_path=img_path, objects_or_places_flag='places')
+
+
+
+ # There seem to be relatively high confidence scores with the following method
+ # preds = model.predict(x)[0]
+ #
+ # print ('Raw predictions: ', preds)
+ #
+ # top_preds = np.argsort(preds)[::-1][0:2]
+ #
+ # print('Sorted predictions: ', top_preds)
+ #
+ # if violation_class == 'cl':
+ # file_name = 'categories_HRA_2classCL.txt'
+ # elif violation_class == 'dp':
+ # file_name = 'categories_HRA_2classDP.txt'
+ #
+ # classes = list()
+ # with open(file_name) as class_file:
+ # for line in class_file:
+ # classes.append(line.strip().split(' ')[0][3:])
+ # classes = tuple(classes)
+ #
+ #
+ # print ('\n')
+ # print('--PREDICTED HRA 2 CLASSES:')
+ # # output the prediction
+ # for i in range(0, 2):
+ # print(classes[top_preds[i]], '->', preds[top_preds[i]])
+
+
+
+
+
+ img = image.load_img(img_path, target_size=(224, 224))
+
+ from applications.hra_utils import predict as pd
+ raw_preds, decoded_preds = pd(violation_class=violation_class,
+ model=model,
+ img=img,
+ target_size=(224, 224))
+
+ # print('Raw preds: ', raw_preds)
+ # print ('Decoded preds: ', decoded_preds)
+
+ # print (type(raw_preds))
+ # print('Raw preds: ', raw_preds[0])
+ # print(type(raw_preds[0]))
+
+ top_1_predicted_probability = decoded_preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = decoded_preds[0][1]
+ # print(top_1_predicted_label, '->' , top_1_predicted_probability)
+
+ overlayed_text = str(top_1_predicted_label)+ ' (' + str(round(top_1_predicted_probability, 2)) + ')'
+
+ return raw_preds, overlayed_text, top_1_predicted_label
+
+
+
+def single_img_HRA_inference_return_only(img_path,
+ violation_class,
+ model_backend_name,
+ nb_of_conv_layers_to_fine_tune):
+
+ """Performs single image inference.
+
+ # Arguments
+ img_path: Path to image file
+ violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ model_backend_name: One of `VGG16`, `VGG19`, `ResNet50` or `VGG16_Places365`.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional layers to fine-tune.
+ # Returns
+ The raw predictions of the HRA two-class model.
+
+ """
+
+ if model_backend_name == 'VGG16':
+ model = HRA_VGG16(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ elif model_backend_name == 'VGG19':
+ model = HRA_VGG19(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ elif model_backend_name == 'ResNet50':
+ model = HRA_ResNet50(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ elif model_backend_name == 'VGG16_Places365':
+ model = HRA_VGG16_Places365(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ ## Uncomment for extra verbosity
+ # print('[INFO] HRA-2class model has been loaded')
+
+ img = image.load_img(img_path, target_size=(224, 224))
+
+ from applications.hra_utils import predict as pd
+ raw_preds, decoded_preds = pd(violation_class=violation_class,
+ model=model,
+ img=img,
+ target_size=(224, 224))
+
+ return raw_preds
+
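+
+# A minimal usage sketch (kept as comments so nothing runs on import);
+# the image path is illustrative only:
+#
+#   raw_preds, overlayed_text, top_1_label = single_img_HRA_inference(
+#       img_path='test_image.jpg',
+#       violation_class='dp',
+#       model_backend_name='VGG16',
+#       nb_of_conv_layers_to_fine_tune=1)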
diff --git a/engine/human_centric_branch/__init__.py b/engine/human_centric_branch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/human_centric_branch/emotic_vad_model.py b/engine/human_centric_branch/emotic_vad_model.py
new file mode 100644
index 0000000..afa6e03
--- /dev/null
+++ b/engine/human_centric_branch/emotic_vad_model.py
@@ -0,0 +1,295 @@
+# -*- coding: utf-8 -*-
+""" EMOTIC_VAD is the base class for instantiating various end-to-end models for continuous emotion recognition in VAD space
+ using the EMOTIC dataset.
+
+# Reference
+- [Emotion Recognition in Context](http://sunai.uoc.edu/emotic/pdf/EMOTIC_cvpr2017.pdf)
+- https://stackoverflow.com/questions/43452441/keras-all-layer-names-should-be-unique
+
+"""
+
+
+from __future__ import print_function
+import numpy as np
+import time
+
+import h5py
+
+from utils.generic_utils import hms_string
+from functools import partial
+
+from keras.engine.topology import get_source_inputs
+from keras.models import Model
+from keras.optimizers import SGD, RMSprop
+from keras.applications.imagenet_utils import _obtain_input_shape
+from keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization, Activation
+from keras.layers.merge import concatenate
+from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
+from keras.layers import Input
+from keras import regularizers
+from keras.applications.vgg16 import VGG16
+from keras.applications.vgg19 import VGG19
+from keras.applications.resnet50 import ResNet50
+from applications.vgg16_places_365 import VGG16_Places365
+
+from keras.losses import binary_crossentropy
+
+from preprocessing.emotic.custom_generator import custom_generator, custom_generator_single_output
+from utils.generic_utils import rmse, euclidean_distance_loss
+
+
+class EMOTIC_VAD():
+ """Loads the parameters needed for the training process on class instantiation
+ & sets out the training process of various models (defined as different functions of the class) using the main `train` function.
+
+ # Arguments
+ hdf5_file: The HDF5 file containing the preprocessed images and their respective annotations.
+ body_backbone_CNN: Truncated version of a CNN which takes as input the region of the image comprising
+ the person whose feelings are to be estimated. One of `VGG16`, `VGG19`, `ResNet50` or `VGG16_Places365`.
+ image_backbone_CNN: Truncated version of a CNN which takes as input the entire image.
+ One of `VGG16`, `VGG19`, `ResNet50` or `VGG16_Places365`.
+ nb_of_epochs: Integer, total number of iterations on the data.
+ weights_to_file: File name or full path for saving the weights of the current training process.
+ modelCheckpoint_quantity: Quantity to monitor when saving the model after every epoch is enabled.
+ earlyStopping_quantity: Quantity to monitor when stopping training when a monitored quantity has stopped improving is enabled.
+ CSVLogger_filename: filename of the csv file, where the CSVLogger callback will stream epoch results to.
+
+ # Raises
+ ValueError: in case of invalid argument for `body_backbone_CNN`
+ or invalid argument for `image_backbone_CNN`.
+ """
+
+
+ def __init__(self,
+ hdf5_file,
+ body_backbone_CNN,
+ image_backbone_CNN,
+ nb_of_epochs,
+ weights_to_file,
+ modelCheckpoint_quantity,
+ earlyStopping_quantity,
+ CSVLogger_filename):
+
+
+ if not (body_backbone_CNN in {'VGG16', 'VGG19', 'ResNet50', 'VGG16_Places365'}):
+ raise ValueError('The `body_backbone_CNN` argument should be either '
+ '`VGG16`, `VGG19`, `ResNet50` or `VGG16_Places365`. ')
+
+ if not (image_backbone_CNN in {'VGG16', 'VGG19', 'ResNet50', 'VGG16_Places365'}):
+ raise ValueError('The `image_backbone_CNN` argument should be either '
+ '`VGG16`, `VGG19`, `ResNet50` or `VGG16_Places365`. ')
+
+ self.body_backbone_CNN = body_backbone_CNN
+ self.image_backbone_CNN = image_backbone_CNN
+
+ # -------------------------------------------------------------------------------- #
+ # Construct EMOTIC model
+ # -------------------------------------------------------------------------------- #
+
+ # Both inputs are initially named 'INPUT'; the layer names of each truncated model
+ # are prefixed with 'body-'/'image-' further down so that all names are unique
+ # before the merged model is built (see the Stack Overflow reference above).
+ body_inputs = Input(shape=(224, 224, 3), name='INPUT')
+ image_inputs = Input(shape=(224, 224, 3), name='INPUT')
+
+ # Body module
+ if 'VGG16' == body_backbone_CNN:
+ self.body_truncated_model = VGG16(include_top=False, weights='imagenet', input_tensor=body_inputs, pooling='avg')
+
+ elif 'VGG19' == body_backbone_CNN:
+ self.body_truncated_model = VGG19(include_top=False, weights='imagenet', input_tensor=body_inputs, pooling='avg')
+
+ elif 'ResNet50' == body_backbone_CNN:
+ tmp_model = ResNet50(include_top=False, weights='imagenet', input_tensor=body_inputs, pooling='avg')
+ self.body_truncated_model = Model(inputs=tmp_model.input, outputs=tmp_model.get_layer('activation_48').output)
+
+ elif 'VGG16_Places365' == body_backbone_CNN:
+ self.body_truncated_model = VGG16_Places365(include_top=False, weights='places', input_tensor=body_inputs, pooling='avg')
+
+ for layer in self.body_truncated_model.layers:
+ layer.name = str("body-") + layer.name
+
+
+ print('[INFO] The plain, body `' + body_backbone_CNN + '` pre-trained convnet was successfully initialised.')
+
+ # Image module
+ if 'VGG16' == image_backbone_CNN:
+ self.image_truncated_model = VGG16(include_top=False, weights='imagenet', input_tensor=image_inputs, pooling='avg')
+
+ elif 'VGG19' == image_backbone_CNN:
+ self.image_truncated_model = VGG19(include_top=False, weights='imagenet', input_tensor=image_inputs, pooling='avg')
+
+ elif 'ResNet50' == image_backbone_CNN:
+ tmp_model = ResNet50(include_top=False, weights='imagenet',input_tensor=image_inputs, pooling='avg')
+ self.image_truncated_model = Model(inputs=tmp_model.input, outputs=tmp_model.get_layer('activation_48').output)
+
+ elif 'VGG16_Places365' == image_backbone_CNN:
+ self.image_truncated_model = VGG16_Places365(include_top=False, weights='places', input_tensor=image_inputs, pooling='avg')
+
+ for layer in self.image_truncated_model.layers:
+ layer.name = str("image-") + layer.name
+
+ print('[INFO] The plain, image `' + image_backbone_CNN + '` pre-trained convnet was successfully initialised.')
+
+ # retrieve the outputs
+ body_plain_model_output = self.body_truncated_model.output
+ image_plain_model_output = self.image_truncated_model.output
+
+
+ # In case ResNet50 is selected, we need to use a global average pooling layer to follow the process used for the other CNNs.
+ if 'ResNet50' == body_backbone_CNN:
+ body_plain_model_output = GlobalAveragePooling2D(name='GAP')(body_plain_model_output)
+
+ if 'ResNet50' == image_backbone_CNN:
+ image_plain_model_output = GlobalAveragePooling2D(name='GAP')(image_plain_model_output)
+
+ merged = concatenate([body_plain_model_output, image_plain_model_output])
+
+ x = Dense(256, activation='relu', name='FC1', kernel_regularizer=regularizers.l2(0.01), kernel_initializer='random_normal')(merged)
+
+ x = Dropout(0.5, name='DROPOUT')(x)
+
+ vad_cont_prediction = Dense(units=3, kernel_initializer='random_normal', name='VAD')(x)
+
+ # At model instantiation, you specify the two inputs and the output.
+ self.model = Model(inputs=[body_inputs, image_inputs], outputs=vad_cont_prediction, name='EMOTIC-VAD-regression')
+
+ print('[INFO] Randomly initialised classifier was successfully added on top of the merged modules.')
+
+ print('[INFO] Number of trainable weights before freezing the conv. bases of the respective original models: '
+ '' + str(len(self.model.trainable_weights)))
+
+ # first: train only the top layers (which were randomly initialized)
+ # i.e. freeze all convolutional layers of the preliminary base model
+ for layer in self.body_truncated_model.layers:
+ layer.trainable = False
+
+ for layer in self.image_truncated_model.layers:
+ layer.trainable = False
+
+ print('[INFO] Number of trainable weights after freezing the conv. bases of the respective original models: '
+ '' + str(len(self.model.trainable_weights)))
+
+ # # reference https://github.com/keras-team/keras/issues/4735#issuecomment-267472549
+ # self.class_weight = { 'VALENCE': {0: 36.00, 1: 36.00, 2: 12.00, 3: 5.14, 4: 2.25, 5: 1.00, 6: 1.89, 7: 2.57, 8: 12.00, 9: 36.00},
+ # 'AROUSAL': {0: 23.00, 1: 11.50, 2: 4.60, 3: 1.00, 4: 2.09, 5: 1.64, 6: 1.14, 7: 2.09, 8: 3.83, 9: 4.60},
+ # 'DOMINANCE': {0: 34.00, 1: 17.00, 2: 11.33, 3: 6.80, 4: 5.66, 5: 1.70, 6: 1.00, 7: 2.42, 8: 3.40, 9: 6.80}
+ # }
+
+
+ self.model.compile(optimizer=SGD(lr=1e-5, momentum=0.9),
+ # loss='mse',
+ loss = euclidean_distance_loss,
+ metrics=['mae','mse', rmse])
+
+ # print ('[INFO] Metrics names: ',self.model.metrics_names )
+
+ print('[INFO] End-to-end `EMOTIC-VAD-regression` model has been successfully compiled.')
+
+ # -------------------------------------------------------------------------------- #
+ # Configurations
+ # -------------------------------------------------------------------------------- #
+
+
+ nb_train_samples = 23706
+ nb_val_samples = 3332
+ nb_test_samples = 7280
+
+ train_generator_batch_size = 54
+ val_generator_batch_size = 49
+ test_generator_batch_size = 52
+
+ self.steps_per_epoch = nb_train_samples // train_generator_batch_size
+ self.validation_steps = nb_val_samples // val_generator_batch_size
+ self.test_steps = nb_test_samples // test_generator_batch_size
+
+
+ # -------------------------------------------------------------------------------- #
+ # Read the HDF5 file
+ # -------------------------------------------------------------------------------- #
+ # open the hdf5 file
+ hdf5_file = h5py.File(hdf5_file, "r")
+
+ self.nb_train_data = hdf5_file["x_image_train"].shape[0]
+
+ self.nb_val_data = hdf5_file["x_image_val"].shape[0]
+
+ self.nb_test_data = hdf5_file["x_image_test"].shape[0]
+
+
+
+ # -------------------------------------------------------------------------------- #
+ # Instantiate the custom generators
+ # -------------------------------------------------------------------------------- #
+
+ print('[INFO] Setting up custom generators...')
+
+ self.train_generator = custom_generator_single_output(hdf5_file=hdf5_file,
+ nb_data=self.nb_train_data,
+ batch_size=train_generator_batch_size,
+ mode='train')
+
+ self.val_generator = custom_generator_single_output(hdf5_file=hdf5_file,
+ nb_data=self.nb_val_data,
+ batch_size=val_generator_batch_size,
+ mode='val')
+
+ self.test_generator = custom_generator_single_output(hdf5_file=hdf5_file,
+ nb_data=self.nb_test_data,
+ batch_size=test_generator_batch_size,
+ mode='test')
+
+
+
+
+ # -------------------------------------------------------------------------------- #
+ # Usage of callbacks
+ # -------------------------------------------------------------------------------- #
+
+ self.weights_to_file = weights_to_file
+ self.nb_of_epochs = nb_of_epochs
+
+ # CSVLogger
+ model_log = 'trained_models/logs/' + CSVLogger_filename
+ csv_logger = CSVLogger(model_log, append=True, separator=',')
+
+
+ # ModelCheckpoint
+ checkpointer = ModelCheckpoint(filepath=weights_to_file,
+ monitor=modelCheckpoint_quantity,
+ verbose=1,
+ save_best_only=True,
+ mode='auto',
+ period=1,
+ save_weights_only=True)
+
+ early_stop = EarlyStopping(monitor=earlyStopping_quantity, patience=5, mode='auto')
+
+ self.callbacks_list = [checkpointer, early_stop, csv_logger]
+
+
+
+
+ def train(self):
+ """Trains the EMOTIC model for a given number of epochs (iterations on a dataset).
+
+ """
+
+ self.model.summary()
+
+ print('[INFO] Start training the end-to-end EMOTIC model...')
+
+ start_time = time.time()
+
+ history = self.model.fit_generator(self.train_generator,
+ epochs=self.nb_of_epochs,
+ steps_per_epoch=self.steps_per_epoch,
+ validation_data=self.val_generator,
+ validation_steps=self.validation_steps,
+ callbacks=self.callbacks_list,
+ # class_weight= self.class_weight
+ )
+
+ end_time = time.time()
+ print("[INFO] It took {} to train the end-to-end EMOTIC model".format(hms_string(end_time - start_time)))
+
+ print('[INFO] Saved trained model as: %s ' % self.weights_to_file)
+
+
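+# A minimal usage sketch (kept as comments so nothing runs on import); the HDF5 path,
+# monitored quantities and file names below are illustrative only:
+#
+#   emotic_vad = EMOTIC_VAD(hdf5_file='EMOTIC-VAD.hdf5',
+#                           body_backbone_CNN='VGG16',
+#                           image_backbone_CNN='VGG16_Places365',
+#                           nb_of_epochs=100,
+#                           weights_to_file='trained_models/emotic_vad_vgg16.h5',
+#                           modelCheckpoint_quantity='val_loss',
+#                           earlyStopping_quantity='val_loss',
+#                           CSVLogger_filename='emotic_vad_vgg16.csv')
+#   emotic_vad.train()
+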
diff --git a/engine/human_centric_branch/global_emotional_traits_branch.py b/engine/human_centric_branch/global_emotional_traits_branch.py
new file mode 100644
index 0000000..4c6e29f
--- /dev/null
+++ b/engine/human_centric_branch/global_emotional_traits_branch.py
@@ -0,0 +1,634 @@
+# -*- coding: utf-8 -*-
+'''
+Use three emotional dimensions - valence, arousal and dominance - to describe human perceptions of physical environments.
+
+Interpretations of pleasure: Positive versus negative affective states (e.g. excitement, relaxation, love, and
+tranquility versus cruelty, humiliation, disinterest, and boredom)
+
+Interpretations of arousal: Level of mental alertness and physical activity. (e.g. sleep, inactivity, boredom, and
+relaxation at the lower end versus wakefulness, bodily tension, strenuous
+exercise, and concentration at the higher end).
+
+Interpretations of dominance: Ranges from feelings of a total lack of control or influence on events and surroundings to
+the opposite extreme of feeling influential and in control.
+
+'''
+from __future__ import print_function
+import os
+import warnings
+
+from engine.object_detection_branch.retina_net.single_img_inference import RetinaNet_single_img_detection
+from engine.object_detection_branch.ssd_detector import single_shot_detector
+
+from applications.emotic_utils import _obtain_single_model_VAD,prepare_input_data, _obtain_nb_classifiers, _obtain_ensembling_weights,\
+ _obtain_two_models_ensembling_VAD,_obtain_three_models_ensembling_VAD
+
+from scipy.misc import imread
+from matplotlib import pyplot as plt
+from utils.generic_utils import crop, round_number
+
+
+
+
+def single_img_VAD_inference(img_path,
+ object_detector_backend,
+ model_a_backend_name,
+ model_b_backend_name = None,
+ model_c_backend_name = None):
+ """Performs single image inference.
+ It also saves the original image (`img_path`) with the overlaid recognised humans bounding boxes and their VAD values.
+
+ # Arguments
+ img_path: Path to image file
+ object_detector_backend: Backend with which the humans will be detected. One of `SSD` or `RetinaNet`.
+ model_a_backend_name: Body backbone CNN of the first EMOTIC model. One of `VGG16`, `VGG19` or `ResNet50`.
+ model_b_backend_name, model_c_backend_name: Optional body backbone CNNs of additional EMOTIC models
+ used for ensembling. One of `VGG16`, `VGG19`, `ResNet50` or `None`.
+ Note that every EMOTIC model has already combined its body backbone features with `VGG16_Places365` features at training stage,
+ but for simplicity reasons only the body backbone CNN name is adjustable.
+
+ # Returns
+ Three values corresponding to the global `valence`, `arousal` and `dominance` of the image.
+
+ # Raises
+ ImportError: if PIL is not available.
+ ValueError: if interpolation method is not supported.
+ """
+
+ if not (object_detector_backend in {'SSD', 'RetinaNet'}):
+ raise ValueError('The `object_detector_backend_name` argument should be either '
+ '`SSD` for Single-Shot MultiBox Detector or `RetinaNet` for RetinaNet dense detector. ')
+
+ (head, tail) = os.path.split(img_path)
+ filename_only = os.path.splitext(tail)[0]
+
+ nb_classifiers, classifiers_names = _obtain_nb_classifiers(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ save_as = 'results/'+filename_only + '_' + classifiers_names + '.png'
+
+ if nb_classifiers == 1:
+ model_a = _obtain_single_model_VAD(model_a_backend_name)
+
+ elif nb_classifiers == 2:
+ w_model_a, w_model_b = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ model_a, model_b = _obtain_two_models_ensembling_VAD(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name)
+
+ elif nb_classifiers == 3:
+ w_model_a, w_model_b, w_model_c = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ model_a, model_b, model_c = _obtain_three_models_ensembling_VAD(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+
+ numpy_img_path = imread(img_path)
+
+ # ~Object detection branch~
+ if object_detector_backend == 'SSD':
+ coordinates, persons = single_shot_detector(img_path=img_path, imshow=False)
+
+ elif object_detector_backend == 'RetinaNet':
+ coordinates, persons = RetinaNet_single_img_detection(img_path=img_path, imshow=False)
+
+ # configure colours for bounding box and text
+ bounding_box_colour_rgbvar = (53, 42, 146)
+ bounding_box_colour_rgbvar2 = [x / 255.0 for x in bounding_box_colour_rgbvar]
+
+ text_colour_rgbvar = (214, 86, 100)
+ text_colour_rgbvar2 = [x / 255.0 for x in text_colour_rgbvar]
+
+ if persons != 0:
+ print('--IMAGE INFERENCE FOR |%d| PERSON(S) FOUND:' % persons)
+
+ plt.figure(figsize=(10, 12))
+ plt.imshow(numpy_img_path)
+
+ current_axis = plt.gca()
+
+ counter = 1
+ valence_sum = 0
+ arousal_sum = 0
+ dominance_sum = 0
+
+ for box in coordinates:
+
+ # checks if the number of persons have been reached in order to stop the for loop.
+ # if counter > persons:
+ # break
+
+ if box[0] != 0:
+ print('[INFO] Person #%d' % counter)
+
+ crop(image_path=img_path, coords=box, saved_location='body_img.jpg')
+
+ x1, x2 = prepare_input_data(body_path = 'body_img.jpg',
+ image_path = img_path)
+
+ if nb_classifiers == 1:
+
+ preds = model_a.predict([x1, x2])
+
+
+ elif nb_classifiers == 2:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.50 * (preds_model_a + preds_model_b)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b
+
+ elif nb_classifiers == 3:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+ preds_model_c = model_c.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None and w_model_c is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.33 * (preds_model_a + preds_model_b + preds_model_c)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b + w_model_c * preds_model_c
+
+ # Uncomment to round predicted values
+ # valence = round_number(preds[0][0])
+ # arousal = round_number(preds[0][1])
+ # dominance = round_number(preds[0][2])
+
+ valence = preds[0][0]
+ arousal = preds[0][1]
+ dominance = preds[0][2]
+
+ print(' Valence (V) -- how pleasant the emotions are: ', valence)
+ print(' Arousal (A) -- unrest level of the person(s): ', arousal)
+ print('Dominance (D) -- control level of the situation: ', dominance)
+
+ valence_sum += valence
+ arousal_sum += arousal
+ dominance_sum += dominance
+
+ # current_axis.add_patch(
+ # plt.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1],
+ # color=text_colour_rgbvar2,
+ # fill=False,
+ # linewidth=3.5))
+
+
+
+
+ counter += 1
+
+ global_valence = valence_sum/persons
+ global_arousal = arousal_sum/persons
+ global_dominance = dominance_sum/persons
+
+ print ('\n')
+ print('--GLOBAL EMOTIONAL TRAITS:')
+
+ print(" Valence (V) -- how pleasant the emotions are: %.2f" % global_valence)
+ print(" Arousal (A) -- unrest level of the person(s): %.2f" % global_arousal)
+ print("Dominance (D) -- control level of the situation: %.2f" % global_dominance)
+ # print(' Valence (V) -- how pleasant the emotions are: ', global_valence)
+ # print(' Arousal (A) -- unrest level of the person(s): ', global_arousal)
+ # print('Dominance (D) -- control level of the situation: ', global_dominance)
+ #
+ # overlayed_text = 'Global emotional traits:' + '\n' '(V): ' + str(round(global_valence,2)) + '\n' '(A): ' + str(round(global_arousal,2)) + '\n' '(D): ' + \
+ # str(round(global_dominance,2))
+
+ overlayed_text = 'DOMINANCE: ' + \
+ str(round(global_dominance,2))
+
+ current_axis.text(5, -10, overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+
+
+
+ plt.axis('off')
+ plt.savefig(save_as)
+ plt.show()
+ os.remove("body_img.jpg")
+
+ else:
+ warnings.warn('No global emotional traits were identified: '
+ 'there was no person detected in the image.')
+
+ global_valence = 0
+ global_arousal = 0
+ global_dominance = 0
+
+
+
+ return global_valence, global_arousal, global_dominance
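+
+# A minimal usage sketch for the function above (kept as comments); the image path
+# and backend choices are illustrative only:
+#
+#   v, a, d = single_img_VAD_inference(img_path='test_image.jpg',
+#                                      object_detector_backend='RetinaNet',
+#                                      model_a_backend_name='VGG19')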
+
+
+
+
+
+def single_img_VAD_inference_return_only(img_path,
+ object_detector_backend,
+ model_a_backend_name,
+ model_b_backend_name=None,
+ model_c_backend_name=None):
+ """Performs single image inference.
+
+ # Arguments
+ img_path: Path to image file
+ object_detector_backend: Backend with which the humans will be detected. One of `SSD` or `RetinaNet`.
+ model_a_backend_name: Body backbone CNN of the first EMOTIC model. One of `VGG16`, `VGG19` or `ResNet50`.
+ model_b_backend_name, model_c_backend_name: Optional body backbone CNNs of additional EMOTIC models
+ used for ensembling. One of `VGG16`, `VGG19`, `ResNet50` or `None`.
+ Note that every EMOTIC model has already combined its body backbone features with `VGG16_Places365` features at training stage,
+ but for simplicity reasons only the body backbone CNN name is adjustable.
+
+ # Returns
+ Two values corresponding to the global `valence` and `dominance` of the image.
+
+ # Raises
+ ImportError: if PIL is not available.
+ ValueError: if interpolation method is not supported.
+ """
+
+ if not (object_detector_backend in {'SSD', 'RetinaNet'}):
+ raise ValueError('The `object_detector_backend_name` argument should be either '
+ '`SSD` for Single-Shot MultiBox Detector or `RetinaNet` for RetinaNet dense detector. ')
+
+
+ nb_classifiers, classifiers_names = _obtain_nb_classifiers(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+
+ if nb_classifiers == 1:
+ model_a = _obtain_single_model_VAD(model_a_backend_name)
+
+ elif nb_classifiers == 2:
+ w_model_a, w_model_b = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ model_a, model_b = _obtain_two_models_ensembling_VAD(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name)
+
+ elif nb_classifiers == 3:
+ w_model_a, w_model_b, w_model_c = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ model_a, model_b, model_c = _obtain_three_models_ensembling_VAD(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+ # Uncomment for extra verbosity
+ # print('[INFO] EMOTIC VAD models have been loaded')
+
+ # numpy_img_path = imread(img_path)
+
+ # ~Object detection branch~
+ if object_detector_backend == 'SSD':
+ coordinates, persons = single_shot_detector(img_path=img_path, imshow=False)
+
+ elif object_detector_backend == 'RetinaNet':
+ coordinates, persons = RetinaNet_single_img_detection(img_path=img_path, imshow=False)
+
+ # Uncomment for extra verbosity
+ # print('[INFO] Objects in image have been detected')
+
+
+ if persons != 0:
+ # Uncomment for extra verbosity
+ # print('[INFO] Carrying out continuous emotion recognition in VAD space for %d person(s) found: ' % persons)
+
+ counter = 1
+ dominance_sum = 0
+ valence_sum = 0
+
+ for box in coordinates:
+
+ # checks if the number of persons have been reached in order to stop the for loop.
+ # if counter > persons:
+ # break
+
+ if box[0] != 0:
+ # Uncomment for extra verbosity
+ # print('[INFO] Person #%d' % counter)
+
+ crop(image_path=img_path, coords=box, saved_location='body_img.jpg')
+
+ x1, x2 = prepare_input_data(body_path = 'body_img.jpg',
+ image_path = img_path)
+
+ if nb_classifiers == 1:
+
+ preds = model_a.predict([x1, x2])
+
+
+ elif nb_classifiers == 2:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.50 * (preds_model_a + preds_model_b)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b
+
+ elif nb_classifiers == 3:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+ preds_model_c = model_c.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None and w_model_c is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.33 * (preds_model_a + preds_model_b + preds_model_c)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b + w_model_c * preds_model_c
+
+ # Uncomment to round predicted values
+ # valence = round_number(preds[0][0])
+ # arousal = round_number(preds[0][1])
+ # dominance = round_number(preds[0][2])
+
+ valence = preds[0][0]
+ # arousal = preds[0][1]
+ dominance = preds[0][2]
+
+ # Uncomment for extra verbosity
+ # print(' Valence (V): ', valence)
+ # print(' Arousal (A): ', arousal)
+ # print('Dominance (D): ', dominance)
+
+ valence_sum += valence
+ # arousal_sum += arousal
+ dominance_sum += dominance
+
+
+
+ counter += 1
+
+ global_valence = valence_sum/persons
+ # global_arousal = arousal_sum/persons
+ global_dominance = dominance_sum/persons
+
+ # Uncomment for extra verbosity
+ # print ('\n')
+ # print('[INFO] Global emotional traits::')
+ # print(' Valence (V) -- how pleasant the emotions are: ', global_valence)
+ # print(' Arousal (A) -- unrest level of the person(s): ', global_arousal)
+ # print('Dominance (D) -- control level of the situation: ', global_dominance)
+ # print('\n')
+
+ os.remove("body_img.jpg")
+
+ else:
+        print('[WARNING] No global emotional traits were identified -- no people were found in input image `%s`' % img_path)
+
+ global_valence = 0
+ # global_arousal = 0
+ global_dominance = 0
+
+
+
+ return global_valence, global_dominance
+
+
+def single_img_VAD_inference_with_bounding_boxes(img_path,
+ object_detector_backend,
+ model_a_backend_name,
+ model_b_backend_name=None,
+ model_c_backend_name=None):
+    """Performs single image inference.
+        It also saves a copy of the original image (`img_path`) with the bounding boxes of the recognised humans and their VAD values overlaid.
+
+    # Arguments
+        img_path: Path to image file.
+        object_detector_backend: Backend with which the objects will be detected. One of `SSD` or `RetinaNet`.
+        model_backend_name: One of `VGG16`, `VGG19` or `ResNet50`.
+            Note that the EMOTIC model already combines `model_backend_name` features with `VGG16_Places365` features at training time,
+            but for simplicity only the body backbone CNN name is adjustable.
+
+    # Returns
+        Three float values corresponding to `valence`, `arousal` and `dominance`, averaged over all detected persons
+        (zeros are returned when no person is detected).
+
+    # Raises
+        ValueError: if `object_detector_backend` is not one of `SSD` or `RetinaNet`.
+ """
+
+    if object_detector_backend not in {'SSD', 'RetinaNet'}:
+        raise ValueError('The `object_detector_backend` argument should be either '
+                         '`SSD` for the Single-Shot MultiBox Detector or `RetinaNet` for the RetinaNet dense detector.')
+
+ (head, tail) = os.path.split(img_path)
+ filename_only = os.path.splitext(tail)[0]
+
+ nb_classifiers, classifiers_names = _obtain_nb_classifiers(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ save_as = 'results/'+filename_only + '_' + classifiers_names + '.png'
+
+ if nb_classifiers == 1:
+ model_a = _obtain_single_model_VAD(model_a_backend_name)
+
+ elif nb_classifiers == 2:
+ w_model_a, w_model_b = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ model_a, model_b = _obtain_two_models_ensembling_VAD(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name)
+
+ elif nb_classifiers == 3:
+ w_model_a, w_model_b, w_model_c = _obtain_ensembling_weights(nb_classifiers=nb_classifiers,
+ model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+ model_a, model_b, model_c = _obtain_three_models_ensembling_VAD(model_a_name=model_a_backend_name,
+ model_b_name=model_b_backend_name,
+ model_c_name=model_c_backend_name)
+
+
+ numpy_img_path = imread(img_path)
+
+ # ~Object detection branch~
+ if object_detector_backend == 'SSD':
+ coordinates, persons = single_shot_detector(img_path=img_path, imshow=False)
+
+ elif object_detector_backend == 'RetinaNet':
+ coordinates, persons = RetinaNet_single_img_detection(img_path=img_path, imshow=False)
+
+ # configure colours for bounding box and text
+ bounding_box_colour_rgbvar = (53, 42, 146)
+ bounding_box_colour_rgbvar2 = [x / 255.0 for x in bounding_box_colour_rgbvar]
+
+ text_colour_rgbvar = (214, 86, 100)
+ text_colour_rgbvar2 = [x / 255.0 for x in text_colour_rgbvar]
+
+ if persons != 0:
+ print('--IMAGE INFERENCE FOR |%d| PERSON(S) FOUND:' % persons)
+
+ plt.figure(figsize=(10, 12))
+ plt.imshow(numpy_img_path)
+
+ current_axis = plt.gca()
+
+ counter = 1
+ valence_sum = 0
+ arousal_sum = 0
+ dominance_sum = 0
+
+ for box in coordinates:
+
+            # Optionally stop the loop once the number of detected persons has been reached:
+ # if counter > persons:
+ # break
+
+ if box[0] != 0:
+ print('[INFO] Person #%d' % counter)
+
+ crop(image_path=img_path, coords=box, saved_location='body_img.jpg')
+
+ x1, x2 = prepare_input_data(body_path = 'body_img.jpg',
+ image_path = img_path)
+
+ if nb_classifiers == 1:
+
+ preds = model_a.predict([x1, x2])
+
+
+ elif nb_classifiers == 2:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.50 * (preds_model_a + preds_model_b)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b
+
+ elif nb_classifiers == 3:
+ # obtain predictions
+ preds_model_a = model_a.predict([x1, x2])
+ preds_model_b = model_b.predict([x1, x2])
+ preds_model_c = model_c.predict([x1, x2])
+
+ if w_model_a is None and w_model_b is None and w_model_c is None:
+ # This new prediction array should be more accurate than any of the initial ones
+ preds = 0.33 * (preds_model_a + preds_model_b + preds_model_c)
+
+ else:
+ preds = w_model_a * preds_model_a + w_model_b * preds_model_b + w_model_c * preds_model_c
+
+ # Uncomment to round predicted values
+ # valence = round_number(preds[0][0])
+ # arousal = round_number(preds[0][1])
+ # dominance = round_number(preds[0][2])
+
+ valence = preds[0][0]
+ arousal = preds[0][1]
+ dominance = preds[0][2]
+
+ print(' Valence (V) -- how pleasant the emotions are: ', valence)
+ print(' Arousal (A) -- unrest level of the person(s): ', arousal)
+ print('Dominance (D) -- control level of the situation: ', dominance)
+
+ valence_sum += valence
+ arousal_sum += arousal
+ dominance_sum += dominance
+
+ current_axis.add_patch(
+ plt.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1],
+ color=text_colour_rgbvar2,
+ fill=False,
+ linewidth=3.5))
+
+ people_VAD_overlayed_text = '(V): ' + str(round(valence, 2)) + '\n' '(A): ' \
+ + str(round(arousal, 2)) + '\n' '(D): ' \
+ + str(round(dominance, 2))
+
+ current_axis.text(box[0]+5, box[1]-10, people_VAD_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+
+
+
+ counter += 1
+
+ global_valence = valence_sum/persons
+ global_arousal = arousal_sum/persons
+ global_dominance = dominance_sum/persons
+
+ print ('\n')
+ print('--GLOBAL EMOTIONAL TRAITS:')
+
+ print(" Valence (V) -- how pleasant the emotions are: %.2f" % global_valence)
+ print(" Arousal (A) -- unrest level of the person(s): %.2f" % global_arousal)
+ print("Dominance (D) -- control level of the situation: %.2f" % global_dominance)
+ # print(' Valence (V) -- how pleasant the emotions are: ', global_valence)
+ # print(' Arousal (A) -- unrest level of the person(s): ', global_arousal)
+ # print('Dominance (D) -- control level of the situation: ', global_dominance)
+
+ overlayed_text = '(V): ' + str(round(global_valence,2)) + '\n' '(A): ' + str(round(global_arousal,2)) + '\n' '(D): ' + \
+ str(round(global_dominance,2))
+
+
+ # current_axis.text(0, 0, overlayed_text, size='x-large', color='white',
+ # bbox={'facecolor': bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ plt.axis('off')
+ plt.savefig(save_as)
+ plt.show()
+ os.remove("body_img.jpg")
+
+ else:
+ warnings.warn('No global emotional traits were identified: '
+ 'there was no person detected in the image.')
+
+ global_valence = 0
+ global_arousal = 0
+ global_dominance = 0
+
+
+
+ return global_valence, global_arousal, global_dominance
+
+
+
+
+
+if __name__ == "__main__":
+
+ img_path = '/home/sandbox/Desktop/Two-class-HRV/ChildLabour/test/no_child_labour/no_child_labour_0015.jpg'
+ model_a_backend_name = 'VGG19'
+ model_b_backend_name = 'VGG16'
+ model_c_backend_name = 'ResNet50'
+
+    # Note: `single_img_VAD_inference` returns only the global valence and dominance values
+    valence, dominance = single_img_VAD_inference(img_path=img_path,
+                                                  object_detector_backend='RetinaNet',
+                                                  model_a_backend_name=model_a_backend_name,
+                                                  model_b_backend_name=model_b_backend_name,
+                                                  model_c_backend_name=model_c_backend_name,
+                                                  )
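+
+    # A minimal sketch (not part of the original script) of the bounding-box variant,
+    # which additionally saves an annotated copy of the image under `results/`:
+    # valence, arousal, dominance = single_img_VAD_inference_with_bounding_boxes(
+    #     img_path=img_path,
+    #     object_detector_backend='RetinaNet',
+    #     model_a_backend_name=model_a_backend_name,
+    #     model_b_backend_name=model_b_backend_name,
+    #     model_c_backend_name=model_c_backend_name)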
\ No newline at end of file
diff --git a/engine/object_detection_branch/__init__.py b/engine/object_detection_branch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/.gitignore b/engine/object_detection_branch/retina_net/.gitignore
new file mode 100644
index 0000000..9f32019
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.pyc
+/.pytest_cache
+/.cache
diff --git a/engine/object_detection_branch/retina_net/.gitmodules b/engine/object_detection_branch/retina_net/.gitmodules
new file mode 100644
index 0000000..9d88837
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "tests/test-data"]
+ path = tests/test-data
+ url = https://github.com/fizyr/keras-retinanet-test-data.git
diff --git a/engine/object_detection_branch/retina_net/.travis.yml b/engine/object_detection_branch/retina_net/.travis.yml
new file mode 100644
index 0000000..6b5e666
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/.travis.yml
@@ -0,0 +1,20 @@
+language: python
+sudo: required
+python:
+ - '3.6'
+ - '2.7'
+install:
+ - pip install 'numpy>=1.14'
+ - pip install 'keras==2.1.3'
+ - pip install 'opencv-python>=3.3.0'
+ - pip install 'pillow'
+ - pip install 'tensorflow'
+ - pip install 'git+https://github.com/broadinstitute/keras-resnet'
+ - pip install 'pytest-flake8'
+ - pip install 'cython'
+ - pip install 'matplotlib'
+ - pip install 'h5py'
+ - pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
+cache: pip
+script:
+ - py.test --flake8
diff --git a/engine/object_detection_branch/retina_net/CONTRIBUTORS.md b/engine/object_detection_branch/retina_net/CONTRIBUTORS.md
new file mode 100644
index 0000000..8b1863f
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/CONTRIBUTORS.md
@@ -0,0 +1,38 @@
+# Contributors
+
+This is a list of people who contributed patches to keras-retinanet.
+
+If you feel you should be listed here or if you have any other questions/comments on your listing here,
+please create an issue or pull request at https://github.com/fizyr/keras-retinanet/
+
+* Hans Gaiser
+* Maarten de Vries
+* Ashley Williamson
+* Yann Henon
+* Valeriu Lacatusu
+* András Vidosits
+* Cristian Gratie
+* jjiunlin
+* Sorin Panduru
+* Rodrigo Meira de Andrade
+* Enrico Liscio
+* Mihai Morariu
+* pedroconceicao
+* jjiun
+* Wudi Fang
+* Mike Clark
+* hannesedvartsen
+* Max Van Sande
+* Pierre Dérian
+* ori
+* mxvs
+* mwilder
+* Muhammed Kocabas
+* Max Van Sande
+* Koen Vijverberg
+* iver56
+* hnsywangxin
+* Guillaume Erhard
+* Eduardo Ramos
+* DiegoAgher
+* Alexander Pacha
diff --git a/engine/object_detection_branch/retina_net/LICENSE b/engine/object_detection_branch/retina_net/LICENSE
new file mode 100644
index 0000000..8dada3e
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/engine/object_detection_branch/retina_net/README.md b/engine/object_detection_branch/retina_net/README.md
new file mode 100644
index 0000000..e941e8c
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/README.md
@@ -0,0 +1,247 @@
+# Keras RetinaNet [](https://travis-ci.org/fizyr/keras-retinanet) [](https://zenodo.org/badge/latestdoi/100249425)
+
+Keras implementation of RetinaNet object detection as described in [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)
+by Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He and Piotr Dollár.
+
+## Installation
+
+1) Clone this repository.
+2) In the repository, execute `pip install . --user`.
+   Note that due to inconsistencies with how `tensorflow` should be installed,
+   this package does not define a dependency on `tensorflow`, as doing so would try to install it automatically (which, at least on Arch Linux, results in an incorrect installation).
+   Please make sure `tensorflow` is installed as per your system's requirements.
+ Also, make sure Keras 2.1.3 or higher is installed.
+3) Optionally, install `pycocotools` if you want to train / test on the MS COCO dataset by running `pip install --user git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI`.
+
+## Testing
+An example of testing the network can be seen in [this Notebook](https://github.com/delftrobotics/keras-retinanet/blob/master/examples/ResNet50RetinaNet.ipynb).
+In general, inference of the network works as follows:
+```python
+boxes, scores, labels = model.predict_on_batch(inputs)
+```
+
+Here `boxes` is shaped `(None, None, 4)` (for `(x1, y1, x2, y2)`), `scores` is shaped `(None, None)` (classification score) and `labels` is shaped `(None, None)` (label corresponding to the score). In all three outputs, the first dimension represents the batch and the second dimension indexes the list of detections.
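+
+For example, a minimal sketch that keeps only the confident detections (assuming `model` is an inference model loaded as shown below and `image` has already been preprocessed and resized):
+```python
+import numpy as np
+
+# run inference on a batch containing a single preprocessed image
+boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))
+
+# detections are returned sorted by score, so we can stop at the first one
+# that falls below the confidence threshold
+for box, score, label in zip(boxes[0], scores[0], labels[0]):
+    if score < 0.5:
+        break
+    print(box, score, label)
+```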
+
+Loading models can be done in the following manner:
+```python
+from keras_retinanet.models import load_model
+model = load_model('/path/to/model.h5', backbone_name='resnet50')
+```
+
+Execution time on NVIDIA Pascal Titan X is roughly 75msec for an image of shape `1000x800x3`.
+
+### Converting a training model to inference model
+The training procedure of `keras-retinanet` works with *training models*. These are stripped-down versions compared to the *inference model* and only contain the layers necessary for training (regression and classification outputs). If you wish to do inference with a model (perform object detection on an image), you need to convert the trained model to an inference model. This is done as follows:
+
+```shell
+# Running directly from the repository:
+keras_retinanet/bin/convert_model.py /path/to/training/model.h5 /path/to/save/inference/model.h5
+
+# Using the installed script:
+retinanet-convert-model /path/to/training/model.h5 /path/to/save/inference/model.h5
+```
+
+Most scripts (like `retinanet-evaluate`) also support converting on the fly, using the `--convert-model` argument.
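+
+The same conversion can also be done when loading a model from Python. A minimal sketch (note: depending on the `keras-retinanet` version, the keyword argument is `convert_model` or `convert`):
+```python
+from keras_retinanet.models import load_model
+
+# load a training model and convert it to an inference model on the fly
+model = load_model('/path/to/training/model.h5', backbone_name='resnet50', convert_model=True)
+
+# optionally save the converted model for later use
+model.save('/path/to/save/inference/model.h5')
+```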
+
+
+## Training
+`keras-retinanet` can be trained using [this](https://github.com/fizyr/keras-retinanet/blob/master/keras_retinanet/bin/train.py) script.
+Note that the train script uses relative imports since it is inside the `keras_retinanet` package.
+If you want to adjust the script for your own use outside of this repository,
+you will need to switch it to use absolute imports.
+
+If you installed `keras-retinanet` correctly, the train script will be installed as `retinanet-train`.
+However, if you make local modifications to the `keras-retinanet` repository, you should run the script directly from the repository.
+That will ensure that your local changes will be used by the train script.
+
+The default backbone is `resnet50`. You can change this using the `--backbone=xxx` argument in the running script.
+`xxx` can be one of the backbones in resnet models (`resnet50`, `resnet101`, `resnet152`), mobilenet models (`mobilenet128_1.0`, `mobilenet128_0.75`, `mobilenet160_1.0`, etc), densenet models or vgg models. The different options are defined by each model in their corresponding python scripts (`resnet.py`, `mobilenet.py`, etc).
+
+Trained models can't be used directly for inference. To convert a trained model to an inference model, check [here](https://github.com/fizyr/keras-retinanet#converting-a-training-model-to-inference-model).
+
+### Usage
+For training on [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/), run:
+```shell
+# Running directly from the repository:
+keras_retinanet/bin/train.py pascal /path/to/VOCdevkit/VOC2007
+
+# Using the installed script:
+retinanet-train pascal /path/to/VOCdevkit/VOC2007
+```
+
+For training on [MS COCO](http://cocodataset.org/#home), run:
+```shell
+# Running directly from the repository:
+keras_retinanet/bin/train.py coco /path/to/MS/COCO
+
+# Using the installed script:
+retinanet-train coco /path/to/MS/COCO
+```
+
+The pretrained MS COCO model can be downloaded [here](https://github.com/fizyr/keras-retinanet/releases). Results using the `cocoapi` are shown below (note: according to the paper, this configuration should achieve a mAP of 0.357).
+
+```
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.350
+ Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.537
+ Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.374
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191
+ Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.383
+ Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.472
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.306
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.491
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.533
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.345
+ Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.577
+ Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.681
+```
+
+For training on Open Images Dataset [OID](https://storage.googleapis.com/openimages/web/index.html)
+or taking part in the [OID challenges](https://storage.googleapis.com/openimages/web/challenge.html), run:
+```shell
+# Running directly from the repository:
+keras_retinanet/bin/train.py oid /path/to/OID
+
+# Using the installed script:
+retinanet-train oid /path/to/OID
+
+# You can also specify a list of labels if you want to train on a subset
+# by adding the argument 'labels_filter':
+keras_retinanet/bin/train.py oid /path/to/OID --labels_filter=Helmet,Tree
+```
+
+
+For training on [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php), run:
+```shell
+# Running directly from the repository:
+keras_retinanet/bin/train.py kitti /path/to/KITTI
+
+# Using the installed script:
+retinanet-train kitti /path/to/KITTI
+```
+
+If you want to prepare the dataset, you can use the following script:
+https://github.com/NVIDIA/DIGITS/blob/master/examples/object-detection/prepare_kitti_data.py
+
+
+For training on a [custom dataset], a CSV file can be used as a way to pass the data.
+See below for more details on the format of these CSV files.
+To train using your CSV, run:
+```shell
+# Running directly from the repository:
+keras_retinanet/bin/train.py csv /path/to/csv/file/containing/annotations /path/to/csv/file/containing/classes
+
+# Using the installed script:
+retinanet-train csv /path/to/csv/file/containing/annotations /path/to/csv/file/containing/classes
+```
+
+In general, the steps to train on your own datasets are:
+1) Create a model by calling for instance `keras_retinanet.models.resnet50_retinanet` and compile it.
+ Empirically, the following compile arguments have been found to work well:
+```python
+model.compile(
+ loss={
+ 'regression' : keras_retinanet.losses.smooth_l1(),
+ 'classification': keras_retinanet.losses.focal()
+ },
+ optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001)
+)
+```
+2) Create generators for training and testing data (an example is shown in [`keras_retinanet.preprocessing.PascalVocGenerator`](https://github.com/fizyr/keras-retinanet/blob/master/keras_retinanet/preprocessing/pascal_voc.py)).
+3) Use `model.fit_generator` to start training. A combined sketch of these three steps is shown below.
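+
+The following is only a minimal sketch combining the three steps above; exact import paths and constructor signatures may differ between `keras-retinanet` versions:
+```python
+import keras
+import keras_retinanet.losses
+import keras_retinanet.models
+from keras_retinanet.preprocessing.pascal_voc import PascalVocGenerator
+
+# 1) create and compile a RetinaNet model (Pascal VOC has 20 classes)
+model = keras_retinanet.models.resnet50_retinanet(num_classes=20)
+model.compile(
+    loss={
+        'regression'    : keras_retinanet.losses.smooth_l1(),
+        'classification': keras_retinanet.losses.focal()
+    },
+    optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001)
+)
+
+# 2) create a generator for the training data
+train_generator = PascalVocGenerator('/path/to/VOCdevkit/VOC2007', 'trainval')
+
+# 3) start training
+model.fit_generator(train_generator, steps_per_epoch=1000, epochs=50)
+```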
+
+## CSV datasets
+The `CSVGenerator` provides an easy way to define your own datasets.
+It uses two CSV files: one file containing annotations and one file containing a class name to ID mapping.
+
+### Annotations format
+The CSV file with annotations should contain one annotation per line.
+Images with multiple bounding boxes should use one row per bounding box.
+Note that indexing for pixel values starts at 0.
+The expected format of each line is:
+```
+path/to/image.jpg,x1,y1,x2,y2,class_name
+```
+
+Some images may not contain any labeled objects.
+To add these images to the dataset as negative examples,
+add an annotation where `x1`, `y1`, `x2`, `y2` and `class_name` are all empty:
+```
+path/to/image.jpg,,,,,
+```
+
+A full example:
+```
+/data/imgs/img_001.jpg,837,346,981,456,cow
+/data/imgs/img_002.jpg,215,312,279,391,cat
+/data/imgs/img_002.jpg,22,5,89,84,bird
+/data/imgs/img_003.jpg,,,,,
+```
+
+This defines a dataset with 3 images.
+`img_001.jpg` contains a cow.
+`img_002.jpg` contains a cat and a bird.
+`img_003.jpg` contains no interesting objects/animals.
+
+
+### Class mapping format
+The class name to ID mapping file should contain one mapping per line.
+Each line should use the following format:
+```
+class_name,id
+```
+
+Indexing for classes starts at 0.
+Do not include a background class as it is implicit.
+
+For example:
+```
+cow,0
+cat,1
+bird,2
+```
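+
+Given these two files, a `CSVGenerator` can then be constructed along the following lines (a minimal sketch; the file paths are placeholders and the optional `transform_generator` argument is omitted):
+```python
+from keras_retinanet.preprocessing.csv_generator import CSVGenerator
+
+# annotations.csv follows the annotations format above,
+# classes.csv follows the class mapping format above
+train_generator = CSVGenerator('/path/to/annotations.csv', '/path/to/classes.csv')
+```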
+
+## Debugging
+Creating your own dataset does not always work out of the box. There is a [`debug.py`](https://github.com/fizyr/keras-retinanet/blob/master/keras_retinanet/bin/debug.py) tool to help find the most common mistakes.
+
+Particularly helpful is the `--annotations` flag, which displays your annotations on the images from your dataset. Annotations are colored green when there are anchors available and red when there are no anchors available. If an annotation doesn't have anchors available, it won't contribute to training. It is normal for a small number of annotations to show up in red, but if most or all annotations are red there is cause for concern. The most common issues are annotations that are too small or too oddly shaped (stretched out).
+
+## Results
+
+### MS COCO
+
+## Status
+Example output images using `keras-retinanet` are shown below.
+
+
+
+
+
+
+
+### Projects using keras-retinanet
+* [Anno-Mage](https://virajmavani.github.io/saiat/). A tool that helps you annotate images, using input from the keras-retinanet COCO model as suggestions.
+* [Telenav.AI](https://github.com/Telenav/Telenav.AI/tree/master/retinanet). For the detection of traffic signs using keras-retinanet.
+* [Towards Deep Placental Histology Phenotyping](https://github.com/Nellaker-group/TowardsDeepPhenotyping). This research project uses keras-retinanet for analysing the placenta at a cellular level.
+* [4k video example](https://www.youtube.com/watch?v=KYueHEMGRos). This demo shows the use of keras-retinanet on a 4k input video.
+* [boring-detector](https://github.com/lexfridman/boring-detector). I suppose not all projects need to solve life's biggest questions. This project detects "The Boring Company" hats in videos.
+* [comet.ml](https://towardsdatascience.com/how-i-monitor-and-track-my-machine-learning-experiments-from-anywhere-described-in-13-tweets-ec3d0870af99). Using keras-retinanet in combination with [comet.ml](https://comet.ml) to interactively inspect and compare experiments.
+
+If you have a project based on `keras-retinanet` and would like to have it published here, shoot me a message on Slack.
+
+### Notes
+* This repository requires Keras 2.1.3 or higher.
+* This repository is [tested](https://github.com/fizyr/keras-retinanet/blob/master/.travis.yml) using OpenCV 3.4.
+* This repository is [tested](https://github.com/fizyr/keras-retinanet/blob/master/.travis.yml) using Python 2.7 and 3.6.
+
+Contributions to this project are welcome.
+
+### Discussions
+Feel free to join the `#keras-retinanet` [Keras Slack](https://keras-slack-autojoin.herokuapp.com/) channel for discussions and questions.
+
+## FAQ
+* **I get the warning `UserWarning: No training configuration found in save file: the model was not compiled. Compile it manually.`, should I be worried?** This warning can safely be ignored during inference.
+* **I get the error `ValueError: not enough values to unpack (expected 3, got 2)` during inference, what to do?**. This is because you are using a train model to do inference. See https://github.com/fizyr/keras-retinanet#converting-a-training-model-to-inference-model for more information.
+* **How do I do transfer learning?** The easiest solution is to use the `--weights` argument when training. Keras will load models even if the number of classes doesn't match (it will simply skip loading of weights when there is a mismatch). Run for example `retinanet-train --weights snapshots/some_coco_model.h5 pascal /path/to/pascal` to transfer weights from a COCO model to a PascalVOC training session. If your dataset is small, you can also use the `--freeze-backbone` argument to freeze the backbone layers.
+* **How do I change the number / shape of the anchors?** There is no straightforward way (yet) to do this. Look at https://github.com/fizyr/keras-retinanet/issues/421 for a discussion on what is currently the method to do this.
+* **I get a loss of `0`, what is going on?** This mostly happens when none of the anchors "fit" on your objects, because they are most likely too small or too elongated. You can verify this using the [debug](https://github.com/fizyr/keras-retinanet#debugging) tool.
+* **I have an older model, can I use it after an update of keras-retinanet?** This depends on what has changed. If it is a change that doesn't affect the weights, then you can "update" models by creating a new retinanet model, loading your old weights using `model.load_weights(weights_path, by_name=True)` and saving this model (a sketch is shown below). If the change has been too significant, you should retrain your model (you can try to load the weights from your old model when starting training; this might be a better starting point than ImageNet).
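+
+A minimal sketch of that update procedure (the model-building call and class count below are placeholders; adjust them to your backbone and dataset):
+```python
+import keras_retinanet.models
+
+# rebuild a fresh RetinaNet and transfer the old weights by layer name
+new_model = keras_retinanet.models.resnet50_retinanet(num_classes=80)
+new_model.load_weights('/path/to/old_model.h5', by_name=True)
+new_model.save('/path/to/updated_model.h5')
+```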
diff --git a/engine/object_detection_branch/retina_net/__init__.py b/engine/object_detection_branch/retina_net/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/examples/ResNet50RetinaNet.py b/engine/object_detection_branch/retina_net/examples/ResNet50RetinaNet.py
new file mode 100644
index 0000000..932ccdf
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/examples/ResNet50RetinaNet.py
@@ -0,0 +1,114 @@
+from __future__ import print_function
+
+# import miscellaneous modules
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import time
+
+import cv2
+# import keras
+# set tf backend to allow memory to grow, instead of claiming everything
+# import keras_retinanet
+from engine.object_detection_branch.retina_net.keras_retinanet import models
+
+from engine.object_detection_branch.retina_net.keras_retinanet.utils.visualization import draw_box, draw_caption
+from engine.object_detection_branch.retina_net.keras_retinanet.utils.colors import label_color
+from engine.object_detection_branch.retina_net.keras_retinanet.utils.image import read_image_bgr, preprocess_image, resize_image
+
+# adjust this to point to your downloaded/trained model
+# models can be downloaded here: https://github.com/fizyr/keras-retinanet/releases
+model_path = os.path.join('..', 'snapshots', 'resnet50_coco_best_v2.1.0.h5')
+
+# load retinanet model
+model = models.load_model(model_path, backbone_name='resnet50')
+
+# if the model is not converted to an inference model, use the line below
+# see: https://github.com/fizyr/keras-retinanet#converting-a-training-model-to-inference-model
+#model = models.load_model(model_path, backbone_name='resnet50', convert_model=True)
+
+#print(model.summary())
+
+# load label to names mapping for visualization purposes
+labels_to_names = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
+ 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
+ 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+ 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
+ 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+ 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
+ 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
+ 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
+ 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
+ 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv',
+ 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
+ 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase',
+ 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}
+
+
+# Run detection on example
+
+# load image
+image = read_image_bgr('human_right_viol_2.jpg')
+
+# copy to draw on
+draw = image.copy()
+draw = cv2.cvtColor(draw, cv2.COLOR_BGR2RGB)
+
+# preprocess image for network
+image = preprocess_image(image)
+image, scale = resize_image(image)
+
+# process image
+start = time.time()
+boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))
+print("processing time: ", time.time() - start)
+
+
+
+
+
+
+# correct for image scale
+boxes /= scale
+
+counter = 0
+
+
+persons_counter = 0
+final_array = np.empty([len(boxes[0]), 4])
+
+# visualize detections
+for box, score, label in zip(boxes[0], scores[0], labels[0]):
+ # scores are sorted so we can break
+ if score < 0.5:
+ break
+
+ decoded_label = "{}".format(labels_to_names[label])
+
+ if decoded_label == 'person':
+ persons_counter = persons_counter + 1
+
+ color = label_color(label)
+
+ b = box.astype(int)
+ draw_box(draw, b, color=color)
+
+ final_array[counter][0] = b[0]
+ final_array[counter][1] = b[1]
+ final_array[counter][2] = b[2]
+ final_array[counter][3] = b[3]
+
+
+ caption = "{} {:.3f}".format(labels_to_names[label], score)
+ draw_caption(draw, b, caption)
+
+ counter += 1
+
+print ('Persons found: ', persons_counter)
+
+plt.figure(figsize=(15, 15))
+plt.axis('off')
+plt.imshow(draw)
+plt.show()
\ No newline at end of file
diff --git a/engine/object_detection_branch/retina_net/examples/__init__.py b/engine/object_detection_branch/retina_net/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/backend/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/backend/__init__.py
new file mode 100644
index 0000000..4bace69
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/backend/__init__.py
@@ -0,0 +1,2 @@
+from .dynamic import * # noqa: F401,F403
+from .common import * # noqa: F401,F403
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/backend/cntk_backend.py b/engine/object_detection_branch/retina_net/keras_retinanet/backend/cntk_backend.py
new file mode 100644
index 0000000..70aae54
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/backend/cntk_backend.py
@@ -0,0 +1,15 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/backend/common.py b/engine/object_detection_branch/retina_net/keras_retinanet/backend/common.py
new file mode 100644
index 0000000..8f8dcc6
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/backend/common.py
@@ -0,0 +1,85 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras.backend
+from .dynamic import meshgrid
+
+
+def bbox_transform_inv(boxes, deltas, mean=None, std=None):
+ """ Applies deltas (usually regression results) to boxes (usually anchors).
+
+ Before applying the deltas to the boxes, the normalization that was previously applied (in the generator) has to be removed.
+ The mean and std are the mean and std as applied in the generator. They are unnormalized in this function and then applied to the boxes.
+
+ Args
+ boxes : np.array of shape (B, N, 4), where B is the batch size, N the number of boxes and 4 values for (x1, y1, x2, y2).
+ deltas: np.array of same shape as boxes. These deltas (d_x1, d_y1, d_x2, d_y2) are a factor of the width/height.
+ mean : The mean value used when computing deltas (defaults to [0, 0, 0, 0]).
+ std : The standard deviation used when computing deltas (defaults to [0.2, 0.2, 0.2, 0.2]).
+
+ Returns
+ A np.array of the same shape as boxes, but with deltas applied to each box.
+ The mean and std are used during training to normalize the regression values (networks love normalization).
+ """
+ if mean is None:
+ mean = [0, 0, 0, 0]
+ if std is None:
+ std = [0.2, 0.2, 0.2, 0.2]
+
+ width = boxes[:, :, 2] - boxes[:, :, 0]
+ height = boxes[:, :, 3] - boxes[:, :, 1]
+
+ x1 = boxes[:, :, 0] + (deltas[:, :, 0] * std[0] + mean[0]) * width
+ y1 = boxes[:, :, 1] + (deltas[:, :, 1] * std[1] + mean[1]) * height
+ x2 = boxes[:, :, 2] + (deltas[:, :, 2] * std[2] + mean[2]) * width
+ y2 = boxes[:, :, 3] + (deltas[:, :, 3] * std[3] + mean[3]) * height
+
+ pred_boxes = keras.backend.stack([x1, y1, x2, y2], axis=2)
+
+ return pred_boxes
+
+
+def shift(shape, stride, anchors):
+ """ Produce shifted anchors based on shape of the map and stride size.
+
+ Args
+ shape : Shape to shift the anchors over.
+ stride : Stride to shift the anchors with over the shape.
+ anchors: The anchors to apply at each location.
+ """
+ shift_x = (keras.backend.arange(0, shape[1], dtype=keras.backend.floatx()) + keras.backend.constant(0.5, dtype=keras.backend.floatx())) * stride
+ shift_y = (keras.backend.arange(0, shape[0], dtype=keras.backend.floatx()) + keras.backend.constant(0.5, dtype=keras.backend.floatx())) * stride
+
+ shift_x, shift_y = meshgrid(shift_x, shift_y)
+ shift_x = keras.backend.reshape(shift_x, [-1])
+ shift_y = keras.backend.reshape(shift_y, [-1])
+
+ shifts = keras.backend.stack([
+ shift_x,
+ shift_y,
+ shift_x,
+ shift_y
+ ], axis=0)
+
+ shifts = keras.backend.transpose(shifts)
+ number_of_anchors = keras.backend.shape(anchors)[0]
+
+ k = keras.backend.shape(shifts)[0] # number of base points = feat_h * feat_w
+
+ shifted_anchors = keras.backend.reshape(anchors, [1, number_of_anchors, 4]) + keras.backend.cast(keras.backend.reshape(shifts, [k, 1, 4]), keras.backend.floatx())
+ shifted_anchors = keras.backend.reshape(shifted_anchors, [k * number_of_anchors, 4])
+
+ return shifted_anchors
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/backend/dynamic.py b/engine/object_detection_branch/retina_net/keras_retinanet/backend/dynamic.py
new file mode 100644
index 0000000..361b685
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/backend/dynamic.py
@@ -0,0 +1,25 @@
+import os
+
+_BACKEND = "tensorflow"
+
+if "KERAS_BACKEND" in os.environ:
+ _backend = os.environ["KERAS_BACKEND"]
+
+ backends = {
+ "cntk",
+ "tensorflow",
+ "theano"
+ }
+
+ assert _backend in backends
+
+ _BACKEND = _backend
+
+if _BACKEND == "cntk":
+ from .cntk_backend import * # noqa: F401,F403
+elif _BACKEND == "theano":
+ from .theano_backend import * # noqa: F401,F403
+elif _BACKEND == "tensorflow":
+ from .tensorflow_backend import * # noqa: F401,F403
+else:
+ raise ValueError("Unknown backend: " + str(_BACKEND))
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/backend/tensorflow_backend.py b/engine/object_detection_branch/retina_net/keras_retinanet/backend/tensorflow_backend.py
new file mode 100644
index 0000000..9625c9a
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/backend/tensorflow_backend.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import tensorflow
+
+
+def map_fn(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/map_fn .
+ """
+ return tensorflow.map_fn(*args, **kwargs)
+
+
+def pad(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/pad .
+ """
+ return tensorflow.pad(*args, **kwargs)
+
+
+def top_k(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/nn/top_k .
+ """
+ return tensorflow.nn.top_k(*args, **kwargs)
+
+
+def clip_by_value(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/clip_by_value .
+ """
+ return tensorflow.clip_by_value(*args, **kwargs)
+
+
+def resize_images(images, size, method='bilinear', align_corners=False):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/image/resize_images .
+
+ Args
+ method: The method used for interpolation. One of ('bilinear', 'nearest', 'bicubic', 'area').
+ """
+ methods = {
+ 'bilinear': tensorflow.image.ResizeMethod.BILINEAR,
+ 'nearest' : tensorflow.image.ResizeMethod.NEAREST_NEIGHBOR,
+ 'bicubic' : tensorflow.image.ResizeMethod.BICUBIC,
+ 'area' : tensorflow.image.ResizeMethod.AREA,
+ }
+ return tensorflow.image.resize_images(images, size, methods[method], align_corners)
+
+
+def non_max_suppression(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/image/non_max_suppression .
+ """
+ return tensorflow.image.non_max_suppression(*args, **kwargs)
+
+
+def range(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/range .
+ """
+ return tensorflow.range(*args, **kwargs)
+
+
+def scatter_nd(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/scatter_nd .
+ """
+ return tensorflow.scatter_nd(*args, **kwargs)
+
+
+def gather_nd(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/gather_nd .
+ """
+ return tensorflow.gather_nd(*args, **kwargs)
+
+
+def meshgrid(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/meshgrid .
+ """
+ return tensorflow.meshgrid(*args, **kwargs)
+
+
+def where(*args, **kwargs):
+ """ See https://www.tensorflow.org/versions/master/api_docs/python/tf/where .
+ """
+ return tensorflow.where(*args, **kwargs)
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/backend/theano_backend.py b/engine/object_detection_branch/retina_net/keras_retinanet/backend/theano_backend.py
new file mode 100644
index 0000000..70aae54
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/backend/theano_backend.py
@@ -0,0 +1,15 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/bin/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/bin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/bin/convert_model.py b/engine/object_detection_branch/retina_net/keras_retinanet/bin/convert_model.py
new file mode 100755
index 0000000..c2d04df
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/bin/convert_model.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import argparse
+import os
+import sys
+
+# Allow relative imports when being executed as script.
+if __name__ == "__main__" and __package__ is None:
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
+ __package__ = "keras_retinanet.bin"
+
+# Change these to absolute imports if you copy this script outside the keras_retinanet package.
+from .. import models
+
+
+def parse_args(args):
+ parser = argparse.ArgumentParser(description='Script for converting a training model to an inference model.')
+
+ parser.add_argument('model_in', help='The model to convert.')
+ parser.add_argument('model_out', help='Path to save the converted model to.')
+ parser.add_argument('--backbone', help='The backbone of the model to convert.', default='resnet50')
+ parser.add_argument('--no-nms', help='Disables non maximum suppression.', dest='nms', action='store_false')
+
+ return parser.parse_args(args)
+
+
+def main(args=None):
+ # parse arguments
+ if args is None:
+ args = sys.argv[1:]
+ args = parse_args(args)
+
+ # load and convert model
+ model = models.load_model(args.model_in, convert=True, backbone_name=args.backbone, nms=args.nms)
+
+ # save model
+ model.save(args.model_out)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/bin/debug.py b/engine/object_detection_branch/retina_net/keras_retinanet/bin/debug.py
new file mode 100755
index 0000000..cccffa4
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/bin/debug.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import argparse
+import numpy as np
+import os
+import sys
+
+import cv2
+
+# Allow relative imports when being executed as script.
+if __name__ == "__main__" and __package__ is None:
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
+ import keras_retinanet.bin # noqa: F401
+ __package__ = "keras_retinanet.bin"
+
+# Change these to absolute imports if you copy this script outside the keras_retinanet package.
+from ..preprocessing.pascal_voc import PascalVocGenerator
+from ..preprocessing.csv_generator import CSVGenerator
+from ..preprocessing.kitti import KittiGenerator
+from ..preprocessing.open_images import OpenImagesGenerator
+from ..utils.transform import random_transform_generator
+from ..utils.visualization import draw_annotations, draw_boxes
+
+
+def create_generator(args):
+ """ Create the data generators.
+
+ Args:
+        args: argparse arguments object.
+ """
+ # create random transform generator for augmenting training data
+ transform_generator = random_transform_generator(
+ min_rotation=-0.1,
+ max_rotation=0.1,
+ min_translation=(-0.1, -0.1),
+ max_translation=(0.1, 0.1),
+ min_shear=-0.1,
+ max_shear=0.1,
+ min_scaling=(0.9, 0.9),
+ max_scaling=(1.1, 1.1),
+ flip_x_chance=0.5,
+ flip_y_chance=0.5,
+ )
+
+ if args.dataset_type == 'coco':
+ # import here to prevent unnecessary dependency on cocoapi
+ from ..preprocessing.coco import CocoGenerator
+
+ generator = CocoGenerator(
+ args.coco_path,
+ args.coco_set,
+ transform_generator=transform_generator
+ )
+ elif args.dataset_type == 'pascal':
+ generator = PascalVocGenerator(
+ args.pascal_path,
+ args.pascal_set,
+ transform_generator=transform_generator
+ )
+ elif args.dataset_type == 'csv':
+ generator = CSVGenerator(
+ args.annotations,
+ args.classes,
+ transform_generator=transform_generator
+ )
+ elif args.dataset_type == 'oid':
+ generator = OpenImagesGenerator(
+ args.main_dir,
+ subset=args.subset,
+ version=args.version,
+ labels_filter=args.labels_filter,
+ fixed_labels=args.fixed_labels,
+ annotation_cache_dir=args.annotation_cache_dir,
+ transform_generator=transform_generator
+ )
+ elif args.dataset_type == 'kitti':
+ generator = KittiGenerator(
+ args.kitti_path,
+ subset=args.subset,
+ transform_generator=transform_generator
+ )
+ else:
+ raise ValueError('Invalid data type received: {}'.format(args.dataset_type))
+
+ return generator
+
+
+def parse_args(args):
+ """ Parse the arguments.
+ """
+ parser = argparse.ArgumentParser(description='Debug script for a RetinaNet network.')
+ subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type')
+ subparsers.required = True
+
+ coco_parser = subparsers.add_parser('coco')
+ coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).')
+ coco_parser.add_argument('--coco-set', help='Name of the set to show (defaults to val2017).', default='val2017')
+
+ pascal_parser = subparsers.add_parser('pascal')
+ pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).')
+ pascal_parser.add_argument('--pascal-set', help='Name of the set to show (defaults to test).', default='test')
+
+ kitti_parser = subparsers.add_parser('kitti')
+ kitti_parser.add_argument('kitti_path', help='Path to dataset directory (ie. /tmp/kitti).')
+ kitti_parser.add_argument('subset', help='Argument for loading a subset from train/val.')
+
+ def csv_list(string):
+ return string.split(',')
+
+ oid_parser = subparsers.add_parser('oid')
+ oid_parser.add_argument('main_dir', help='Path to dataset directory.')
+ oid_parser.add_argument('subset', help='Argument for loading a subset from train/validation/test.')
+ oid_parser.add_argument('--version', help='The current dataset version is v4.', default='v4')
+ oid_parser.add_argument('--labels-filter', help='A list of labels to filter.', type=csv_list, default=None)
+ oid_parser.add_argument('--annotation-cache-dir', help='Path to store annotation cache.', default='.')
+ oid_parser.add_argument('--fixed-labels', help='Use the exact specified labels.', default=False)
+
+ csv_parser = subparsers.add_parser('csv')
+ csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for evaluation.')
+ csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.')
+
+ parser.add_argument('-l', '--loop', help='Loop forever, even if the dataset is exhausted.', action='store_true')
+ parser.add_argument('--no-resize', help='Disable image resizing.', dest='resize', action='store_false')
+ parser.add_argument('--anchors', help='Show positive anchors on the image.', action='store_true')
+ parser.add_argument('--annotations', help='Show annotations on the image. Green annotations have anchors, red annotations don\'t and therefore don\'t contribute to training.', action='store_true')
+ parser.add_argument('--random-transform', help='Randomly transform image and annotations.', action='store_true')
+
+ return parser.parse_args(args)
+
+
+def run(generator, args):
+ """ Main loop.
+
+ Args
+ generator: The generator to debug.
+ args: argparse arguments object.
+ """
+ # display images, one at a time
+ for i in range(generator.size()):
+ # load the data
+ image = generator.load_image(i)
+ annotations = generator.load_annotations(i)
+
+ # apply random transformations
+ if args.random_transform:
+ image, annotations = generator.random_transform_group_entry(image, annotations)
+
+ # resize the image and annotations
+ if args.resize:
+ image, image_scale = generator.resize_image(image)
+ annotations[:, :4] *= image_scale
+
+ # draw anchors on the image
+ if args.anchors:
+ labels, _, anchors = generator.compute_anchor_targets(image.shape, annotations, generator.num_classes())
+ draw_boxes(image, anchors[np.max(labels, axis=1) == 1], (255, 255, 0), thickness=1)
+
+ # draw annotations on the image
+ if args.annotations:
+ # draw annotations in red
+ draw_annotations(image, annotations, color=(0, 0, 255), label_to_name=generator.label_to_name)
+
+ # draw regressed anchors in green to override most red annotations
+ # result is that annotations without anchors are red, with anchors are green
+ labels, boxes, _ = generator.compute_anchor_targets(image.shape, annotations, generator.num_classes())
+ draw_boxes(image, boxes[np.max(labels, axis=1) == 1], (0, 255, 0))
+
+ cv2.imshow('Image', image)
+ if cv2.waitKey() == ord('q'):
+ return False
+ return True
+
+
+def main(args=None):
+ # parse arguments
+ if args is None:
+ args = sys.argv[1:]
+ args = parse_args(args)
+
+ # create the generator
+ generator = create_generator(args)
+
+ # create the display window
+ cv2.namedWindow('Image', cv2.WINDOW_NORMAL)
+
+ if args.loop:
+ while run(generator, args):
+ pass
+ else:
+ run(generator, args)
+
+
+if __name__ == '__main__':
+ main()
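+# Example invocation (hypothetical paths; subcommand and flags as defined above).
+# Top-level options go before the dataset subcommand:
+#
+#   python debug.py --annotations --anchors csv annotations.csv classes.csv
+#
+# This opens an OpenCV window per image; press 'q' to stop, any other key to advance.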
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/bin/evaluate.py b/engine/object_detection_branch/retina_net/keras_retinanet/bin/evaluate.py
new file mode 100755
index 0000000..c1d66f4
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/bin/evaluate.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import argparse
+import os
+import sys
+
+import keras
+import tensorflow as tf
+
+# Allow relative imports when being executed as script.
+if __name__ == "__main__" and __package__ is None:
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
+ __package__ = "keras_retinanet.bin"
+
+# Change these to absolute imports if you copy this script outside the keras_retinanet package.
+from .. import models
+from ..preprocessing.csv_generator import CSVGenerator
+from ..preprocessing.pascal_voc import PascalVocGenerator
+from ..utils.eval import evaluate
+from ..utils.keras_version import check_keras_version
+
+
+def get_session():
+ """ Construct a modified tf session.
+ """
+ config = tf.ConfigProto()
+ config.gpu_options.allow_growth = True
+ return tf.Session(config=config)
+
+
+def create_generator(args):
+ """ Create generators for evaluation.
+ """
+ if args.dataset_type == 'coco':
+ # import here to prevent unnecessary dependency on cocoapi
+ from ..preprocessing.coco import CocoGenerator
+
+ validation_generator = CocoGenerator(
+ args.coco_path,
+ 'val2017',
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ elif args.dataset_type == 'pascal':
+ validation_generator = PascalVocGenerator(
+ args.pascal_path,
+ 'test',
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ elif args.dataset_type == 'csv':
+ validation_generator = CSVGenerator(
+ args.annotations,
+ args.classes,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ else:
+ raise ValueError('Invalid data type received: {}'.format(args.dataset_type))
+
+ return validation_generator
+
+
+def parse_args(args):
+ """ Parse the arguments.
+ """
+ parser = argparse.ArgumentParser(description='Evaluation script for a RetinaNet network.')
+ subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type')
+ subparsers.required = True
+
+ coco_parser = subparsers.add_parser('coco')
+ coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).')
+
+ pascal_parser = subparsers.add_parser('pascal')
+ pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).')
+
+ csv_parser = subparsers.add_parser('csv')
+ csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for evaluation.')
+ csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.')
+
+ parser.add_argument('model', help='Path to RetinaNet model.')
+ parser.add_argument('--convert-model', help='Convert the model to an inference model (ie. the input is a training model).', action='store_true')
+ parser.add_argument('--backbone', help='The backbone of the model.', default='resnet50')
+ parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).')
+ parser.add_argument('--score-threshold', help='Threshold on score to filter detections with (defaults to 0.05).', default=0.05, type=float)
+ parser.add_argument('--iou-threshold', help='IoU Threshold to count for a positive detection (defaults to 0.5).', default=0.5, type=float)
+ parser.add_argument('--max-detections', help='Max Detections per image (defaults to 100).', default=100, type=int)
+ parser.add_argument('--save-path', help='Path for saving images with detections (doesn\'t work for COCO).')
+ parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800)
+ parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333)
+
+ return parser.parse_args(args)
+
+
+def main(args=None):
+ # parse arguments
+ if args is None:
+ args = sys.argv[1:]
+ args = parse_args(args)
+
+ # make sure keras is the minimum required version
+ check_keras_version()
+
+ # optionally choose specific GPU
+ if args.gpu:
+ os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
+ keras.backend.tensorflow_backend.set_session(get_session())
+
+ # make save path if it doesn't exist
+ if args.save_path is not None and not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+
+ # create the generator
+ generator = create_generator(args)
+
+ # load the model
+ print('Loading model, this may take a second...')
+ model = models.load_model(args.model, backbone_name=args.backbone, convert=args.convert_model)
+
+ # print model summary
+ # print(model.summary())
+
+ # start evaluation
+ if args.dataset_type == 'coco':
+ from ..utils.coco_eval import evaluate_coco
+ evaluate_coco(generator, model, args.score_threshold)
+ else:
+ average_precisions = evaluate(
+ generator,
+ model,
+ iou_threshold=args.iou_threshold,
+ score_threshold=args.score_threshold,
+ max_detections=args.max_detections,
+ save_path=args.save_path
+ )
+
+ # print evaluation
+ for label, average_precision in average_precisions.items():
+ print(generator.label_to_name(label), '{:.4f}'.format(average_precision))
+ print('mAP: {:.4f}'.format(sum(average_precisions.values()) / len(average_precisions)))
+
+
+if __name__ == '__main__':
+ main()
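+# Example invocation (hypothetical paths): evaluate a training snapshot on a CSV dataset.
+# Top-level options go before the dataset subcommand and the model path comes last:
+#
+#   python evaluate.py --convert-model csv val_annotations.csv classes.csv snapshots/resnet50_csv_50.h5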
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/bin/train.py b/engine/object_detection_branch/retina_net/keras_retinanet/bin/train.py
new file mode 100755
index 0000000..5ab1585
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/bin/train.py
@@ -0,0 +1,481 @@
+#!/usr/bin/env python
+
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import argparse
+import functools
+import os
+import sys
+import warnings
+
+import keras
+import keras.preprocessing.image
+import tensorflow as tf
+from keras.utils import multi_gpu_model
+
+# Allow relative imports when being executed as script.
+if __name__ == "__main__" and __package__ is None:
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
+ __package__ = "keras_retinanet.bin"
+
+# Change these to absolute imports if you copy this script outside the keras_retinanet package.
+from .. import losses
+from .. import models
+from ..callbacks import RedirectModel
+from ..callbacks.eval import Evaluate
+from ..models.retinanet import retinanet_bbox
+from ..preprocessing.csv_generator import CSVGenerator
+from ..preprocessing.kitti import KittiGenerator
+from ..preprocessing.open_images import OpenImagesGenerator
+from ..preprocessing.pascal_voc import PascalVocGenerator
+from ..utils.anchors import make_shapes_callback, anchor_targets_bbox
+from ..utils.keras_version import check_keras_version
+from ..utils.model import freeze as freeze_model
+from ..utils.transform import random_transform_generator
+
+
+def makedirs(path):
+ # Intended behavior: try to create the directory,
+ # pass silently if it already exists, and re-raise otherwise.
+ # Provided for Python 2.7/3.n compatibility (os.makedirs has no exist_ok in 2.7).
+ try:
+ os.makedirs(path)
+ except OSError:
+ if not os.path.isdir(path):
+ raise
+
+
+def get_session():
+ """ Construct a modified tf session.
+ """
+ config = tf.ConfigProto()
+ config.gpu_options.allow_growth = True
+ return tf.Session(config=config)
+
+
+def model_with_weights(model, weights, skip_mismatch):
+ """ Load weights for model.
+
+ Args
+ model : The model to load weights for.
+ weights : The weights to load.
+ skip_mismatch : If True, skip layers whose weight shapes do not match those in the model.
+ """
+ if weights is not None:
+ model.load_weights(weights, by_name=True, skip_mismatch=skip_mismatch)
+ return model
+
+
+def create_models(backbone_retinanet, num_classes, weights, multi_gpu=0, freeze_backbone=False):
+ """ Creates three models (model, training_model, prediction_model).
+
+ Args
+ backbone_retinanet : A function to call to create a retinanet model with a given backbone.
+ num_classes : The number of classes to train.
+ weights : The weights to load into the model.
+ multi_gpu : The number of GPUs to use for training.
+ freeze_backbone : If True, disables learning for the backbone.
+
+ Returns
+ model : The base model. This is also the model that is saved in snapshots.
+ training_model : The training model. If multi_gpu=0, this is identical to model.
+ prediction_model : The model wrapped with utility functions to perform object detection (applies regression values and performs NMS).
+ """
+ modifier = freeze_model if freeze_backbone else None
+
+ # Keras recommends initialising a multi-gpu model on the CPU to ease weight sharing, and to prevent OOM errors.
+ # optionally wrap in a parallel model
+ if multi_gpu > 1:
+ with tf.device('/cpu:0'):
+ model = model_with_weights(backbone_retinanet(num_classes, modifier=modifier), weights=weights, skip_mismatch=True)
+ training_model = multi_gpu_model(model, gpus=multi_gpu)
+ else:
+ model = model_with_weights(backbone_retinanet(num_classes, modifier=modifier), weights=weights, skip_mismatch=True)
+ training_model = model
+
+ # make prediction model
+ prediction_model = retinanet_bbox(model=model)
+
+ # compile model
+ training_model.compile(
+ loss={
+ 'regression' : losses.smooth_l1(),
+ 'classification': losses.focal()
+ },
+ optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001)
+ )
+
+ return model, training_model, prediction_model
+
+
+def create_callbacks(model, training_model, prediction_model, validation_generator, args):
+ """ Creates the callbacks to use during training.
+
+ Args
+ model: The base model.
+ training_model: The model that is used for training.
+ prediction_model: The model that should be used for validation.
+ validation_generator: The generator for creating validation data.
+ args: argparse arguments object.
+
+ Returns:
+ A list of callbacks used for training.
+ """
+ callbacks = []
+
+ tensorboard_callback = None
+
+ if args.tensorboard_dir:
+ tensorboard_callback = keras.callbacks.TensorBoard(
+ log_dir = args.tensorboard_dir,
+ histogram_freq = 0,
+ batch_size = args.batch_size,
+ write_graph = True,
+ write_grads = False,
+ write_images = False,
+ embeddings_freq = 0,
+ embeddings_layer_names = None,
+ embeddings_metadata = None
+ )
+ callbacks.append(tensorboard_callback)
+
+ if args.evaluation and validation_generator:
+ if args.dataset_type == 'coco':
+ from ..callbacks.coco import CocoEval
+
+ # use prediction model for evaluation
+ evaluation = CocoEval(validation_generator, tensorboard=tensorboard_callback)
+ else:
+ evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback)
+ evaluation = RedirectModel(evaluation, prediction_model)
+ callbacks.append(evaluation)
+
+ # save the model
+ if args.snapshots:
+ # ensure the snapshot directory exists first; otherwise h5py will error after the first epoch.
+ makedirs(args.snapshot_path)
+ checkpoint = keras.callbacks.ModelCheckpoint(
+ os.path.join(
+ args.snapshot_path,
+ '{backbone}_{dataset_type}_{{epoch:02d}}.h5'.format(backbone=args.backbone, dataset_type=args.dataset_type)
+ ),
+ verbose=1,
+ # save_best_only=True,
+ # monitor="mAP",
+ # mode='max'
+ )
+ checkpoint = RedirectModel(checkpoint, model)
+ callbacks.append(checkpoint)
+
+ callbacks.append(keras.callbacks.ReduceLROnPlateau(
+ monitor = 'loss',
+ factor = 0.1,
+ patience = 2,
+ verbose = 1,
+ mode = 'auto',
+ epsilon = 0.0001,
+ cooldown = 0,
+ min_lr = 0
+ ))
+
+ return callbacks
+
+
+def create_generators(args):
+ """ Create generators for training and validation.
+ """
+ # create random transform generator for augmenting training data
+ if args.random_transform:
+ transform_generator = random_transform_generator(
+ min_rotation=-0.1,
+ max_rotation=0.1,
+ min_translation=(-0.1, -0.1),
+ max_translation=(0.1, 0.1),
+ min_shear=-0.1,
+ max_shear=0.1,
+ min_scaling=(0.9, 0.9),
+ max_scaling=(1.1, 1.1),
+ flip_x_chance=0.5,
+ flip_y_chance=0.5,
+ )
+ else:
+ transform_generator = random_transform_generator(flip_x_chance=0.5)
+
+ if args.dataset_type == 'coco':
+ # import here to prevent unnecessary dependency on cocoapi
+ from ..preprocessing.coco import CocoGenerator
+
+ train_generator = CocoGenerator(
+ args.coco_path,
+ 'train2017',
+ transform_generator=transform_generator,
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+
+ validation_generator = CocoGenerator(
+ args.coco_path,
+ 'val2017',
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ elif args.dataset_type == 'pascal':
+ train_generator = PascalVocGenerator(
+ args.pascal_path,
+ 'trainval',
+ transform_generator=transform_generator,
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+
+ validation_generator = PascalVocGenerator(
+ args.pascal_path,
+ 'test',
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ elif args.dataset_type == 'csv':
+ train_generator = CSVGenerator(
+ args.annotations,
+ args.classes,
+ transform_generator=transform_generator,
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+
+ if args.val_annotations:
+ validation_generator = CSVGenerator(
+ args.val_annotations,
+ args.classes,
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ else:
+ validation_generator = None
+ elif args.dataset_type == 'oid':
+ train_generator = OpenImagesGenerator(
+ args.main_dir,
+ subset='train',
+ version=args.version,
+ labels_filter=args.labels_filter,
+ annotation_cache_dir=args.annotation_cache_dir,
+ fixed_labels=args.fixed_labels,
+ transform_generator=transform_generator,
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+
+ validation_generator = OpenImagesGenerator(
+ args.main_dir,
+ subset='validation',
+ version=args.version,
+ labels_filter=args.labels_filter,
+ annotation_cache_dir=args.annotation_cache_dir,
+ fixed_labels=args.fixed_labels,
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ elif args.dataset_type == 'kitti':
+ train_generator = KittiGenerator(
+ args.kitti_path,
+ subset='train',
+ transform_generator=transform_generator,
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+
+ validation_generator = KittiGenerator(
+ args.kitti_path,
+ subset='val',
+ batch_size=args.batch_size,
+ image_min_side=args.image_min_side,
+ image_max_side=args.image_max_side
+ )
+ else:
+ raise ValueError('Invalid data type received: {}'.format(args.dataset_type))
+
+ return train_generator, validation_generator
+
+
+def check_args(parsed_args):
+ """ Function to check for inherent contradictions within parsed arguments.
+ For example, batch_size < num_gpus
+ Intended to raise errors prior to backend initialisation.
+
+ Args
+ parsed_args: parser.parse_args()
+
+ Returns
+ parsed_args
+ """
+
+ if parsed_args.multi_gpu > 1 and parsed_args.batch_size < parsed_args.multi_gpu:
+ raise ValueError(
+ "Batch size ({}) must be equal to or higher than the number of GPUs ({})".format(parsed_args.batch_size,
+ parsed_args.multi_gpu))
+
+ if parsed_args.multi_gpu > 1 and parsed_args.snapshot:
+ raise ValueError(
+ "Multi GPU training ({}) and resuming from snapshots ({}) is not supported.".format(parsed_args.multi_gpu,
+ parsed_args.snapshot))
+
+ if parsed_args.multi_gpu > 1 and not parsed_args.multi_gpu_force:
+ raise ValueError("Multi-GPU support is experimental, use at own risk! Run with --multi-gpu-force if you wish to continue.")
+
+ if 'resnet' not in parsed_args.backbone:
+ warnings.warn('Using experimental backbone {}. Only resnet50 has been properly tested.'.format(parsed_args.backbone))
+
+ return parsed_args
+
+
+def parse_args(args):
+ """ Parse the arguments.
+ """
+ parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')
+ subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type')
+ subparsers.required = True
+
+ coco_parser = subparsers.add_parser('coco')
+ coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).')
+
+ pascal_parser = subparsers.add_parser('pascal')
+ pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).')
+
+ kitti_parser = subparsers.add_parser('kitti')
+ kitti_parser.add_argument('kitti_path', help='Path to dataset directory (ie. /tmp/kitti).')
+
+ def csv_list(string):
+ return string.split(',')
+
+ oid_parser = subparsers.add_parser('oid')
+ oid_parser.add_argument('main_dir', help='Path to dataset directory.')
+ oid_parser.add_argument('--version', help='The current dataset version is v4.', default='v4')
+ oid_parser.add_argument('--labels-filter', help='A list of labels to filter.', type=csv_list, default=None)
+ oid_parser.add_argument('--annotation-cache-dir', help='Path to store annotation cache.', default='.')
+ oid_parser.add_argument('--fixed-labels', help='Use the exact specified labels.', default=False)
+
+ csv_parser = subparsers.add_parser('csv')
+ csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for training.')
+ csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.')
+ csv_parser.add_argument('--val-annotations', help='Path to CSV file containing annotations for validation (optional).')
+
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument('--snapshot', help='Resume training from a snapshot.')
+ group.add_argument('--imagenet-weights', help='Initialize the model with pretrained imagenet weights. This is the default behaviour.', action='store_const', const=True, default=True)
+ group.add_argument('--weights', help='Initialize the model with weights from a file.')
+ group.add_argument('--no-weights', help='Don\'t initialize the model with any weights.', dest='imagenet_weights', action='store_const', const=False)
+
+ parser.add_argument('--backbone', help='Backbone model used by retinanet.', default='resnet50', type=str)
+ parser.add_argument('--batch-size', help='Size of the batches.', default=1, type=int)
+ parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).')
+ parser.add_argument('--multi-gpu', help='Number of GPUs to use for parallel processing.', type=int, default=0)
+ parser.add_argument('--multi-gpu-force', help='Extra flag needed to enable (experimental) multi-gpu support.', action='store_true')
+ parser.add_argument('--epochs', help='Number of epochs to train.', type=int, default=50)
+ parser.add_argument('--steps', help='Number of steps per epoch.', type=int, default=10000)
+ parser.add_argument('--snapshot-path', help='Path to store snapshots of models during training (defaults to \'./snapshots\')', default='./snapshots')
+ parser.add_argument('--tensorboard-dir', help='Log directory for Tensorboard output', default='./logs')
+ parser.add_argument('--no-snapshots', help='Disable saving snapshots.', dest='snapshots', action='store_false')
+ parser.add_argument('--no-evaluation', help='Disable per epoch evaluation.', dest='evaluation', action='store_false')
+ parser.add_argument('--freeze-backbone', help='Freeze training of backbone layers.', action='store_true')
+ parser.add_argument('--random-transform', help='Randomly transform image and annotations.', action='store_true')
+ parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800)
+ parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333)
+
+ return check_args(parser.parse_args(args))
+
+
+def main(args=None):
+ # parse arguments
+ if args is None:
+ args = sys.argv[1:]
+ args = parse_args(args)
+
+ # create object that stores backbone information
+ backbone = models.backbone(args.backbone)
+
+ # make sure keras is the minimum required version
+ check_keras_version()
+
+ # optionally choose specific GPU
+ if args.gpu:
+ os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
+ keras.backend.tensorflow_backend.set_session(get_session())
+
+ # create the generators
+ train_generator, validation_generator = create_generators(args)
+
+ # create the model
+ if args.snapshot is not None:
+ print('Loading model, this may take a second...')
+ model = models.load_model(args.snapshot, backbone_name=args.backbone)
+ training_model = model
+ prediction_model = retinanet_bbox(model=model)
+ else:
+ weights = args.weights
+ # default to imagenet if nothing else is specified
+ if weights is None and args.imagenet_weights:
+ weights = backbone.download_imagenet()
+
+ print('Creating model, this may take a second...')
+ model, training_model, prediction_model = create_models(
+ backbone_retinanet=backbone.retinanet,
+ num_classes=train_generator.num_classes(),
+ weights=weights,
+ multi_gpu=args.multi_gpu,
+ freeze_backbone=args.freeze_backbone
+ )
+
+ # print model summary
+ print(model.summary())
+
+ # this lets the generator compute backbone layer shapes using the actual backbone model
+ if 'vgg' in args.backbone or 'densenet' in args.backbone:
+ compute_anchor_targets = functools.partial(anchor_targets_bbox, shapes_callback=make_shapes_callback(model))
+ train_generator.compute_anchor_targets = compute_anchor_targets
+ if validation_generator is not None:
+ validation_generator.compute_anchor_targets = compute_anchor_targets
+
+ # create the callbacks
+ callbacks = create_callbacks(
+ model,
+ training_model,
+ prediction_model,
+ validation_generator,
+ args,
+ )
+
+ # start training
+ training_model.fit_generator(
+ generator=train_generator,
+ steps_per_epoch=args.steps,
+ epochs=args.epochs,
+ verbose=1,
+ callbacks=callbacks,
+ )
+
+
+if __name__ == '__main__':
+ main()
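+# Example invocation (hypothetical paths): train on a CSV dataset with ImageNet-initialised
+# weights (the default). Top-level options go before the dataset subcommand, subcommand
+# options after it:
+#
+#   python train.py --backbone resnet50 --batch-size 2 --epochs 25 --steps 1000 \
+#       csv train_annotations.csv classes.csv --val-annotations val_annotations.csv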
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/__init__.py
new file mode 100644
index 0000000..7316c99
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/__init__.py
@@ -0,0 +1 @@
+from .common import * # noqa: F401,F403
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/coco.py b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/coco.py
new file mode 100644
index 0000000..4790f1f
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/coco.py
@@ -0,0 +1,62 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+from ..utils.coco_eval import evaluate_coco
+
+
+class CocoEval(keras.callbacks.Callback):
+ """ Performs COCO evaluation on each epoch.
+ """
+ def __init__(self, generator, tensorboard=None, threshold=0.05):
+ """ CocoEval callback intializer.
+
+ Args
+ generator : The generator used for creating validation data.
+ tensorboard : If given, the results will be written to tensorboard.
+ threshold : The score threshold to use.
+ """
+ self.generator = generator
+ self.threshold = threshold
+ self.tensorboard = tensorboard
+
+ super(CocoEval, self).__init__()
+
+ def on_epoch_end(self, epoch, logs=None):
+ logs = logs or {}
+
+ coco_tag = ['AP @[ IoU=0.50:0.95 | area= all | maxDets=100 ]',
+ 'AP @[ IoU=0.50 | area= all | maxDets=100 ]',
+ 'AP @[ IoU=0.75 | area= all | maxDets=100 ]',
+ 'AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
+ 'AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
+ 'AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
+ 'AR @[ IoU=0.50:0.95 | area= all | maxDets= 1 ]',
+ 'AR @[ IoU=0.50:0.95 | area= all | maxDets= 10 ]',
+ 'AR @[ IoU=0.50:0.95 | area= all | maxDets=100 ]',
+ 'AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
+ 'AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
+ 'AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ]']
+ coco_eval_stats = evaluate_coco(self.generator, self.model, self.threshold)
+ if coco_eval_stats is not None and self.tensorboard is not None and self.tensorboard.writer is not None:
+ import tensorflow as tf
+ summary = tf.Summary()
+ for index, result in enumerate(coco_eval_stats):
+ summary_value = summary.value.add()
+ summary_value.simple_value = result
+ summary_value.tag = '{}. {}'.format(index + 1, coco_tag[index])
+ self.tensorboard.writer.add_summary(summary, epoch)
+ logs[coco_tag[index]] = result
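+# Usage sketch (assumes a COCO validation generator and a prediction model built elsewhere):
+# the callback is normally wrapped in RedirectModel so evaluation runs on the prediction
+# model rather than the training model, mirroring create_callbacks in bin/train.py:
+#
+#   coco_eval = CocoEval(validation_generator, tensorboard=tensorboard_callback)
+#   coco_eval = RedirectModel(coco_eval, prediction_model)
+#   training_model.fit_generator(..., callbacks=[coco_eval])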
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/common.py b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/common.py
new file mode 100644
index 0000000..67c00e1
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/common.py
@@ -0,0 +1,46 @@
+import keras.callbacks
+
+
+class RedirectModel(keras.callbacks.Callback):
+ """Callback which wraps another callback, but executed on a different model.
+
+ ```python
+ model = keras.models.load_model('model.h5')
+ model_checkpoint = ModelCheckpoint(filepath='snapshot.h5')
+ parallel_model = multi_gpu_model(model, gpus=2)
+ parallel_model.fit(X_train, Y_train, callbacks=[RedirectModel(model_checkpoint, model)])
+ ```
+
+ Args
+ callback : callback to wrap.
+ model : model to use when executing callbacks.
+ """
+
+ def __init__(self,
+ callback,
+ model):
+ super(RedirectModel, self).__init__()
+
+ self.callback = callback
+ self.redirect_model = model
+
+ def on_epoch_begin(self, epoch, logs=None):
+ self.callback.on_epoch_begin(epoch, logs=logs)
+
+ def on_epoch_end(self, epoch, logs=None):
+ self.callback.on_epoch_end(epoch, logs=logs)
+
+ def on_batch_begin(self, batch, logs=None):
+ self.callback.on_batch_begin(batch, logs=logs)
+
+ def on_batch_end(self, batch, logs=None):
+ self.callback.on_batch_end(batch, logs=logs)
+
+ def on_train_begin(self, logs=None):
+ # overwrite the model with our custom model
+ self.callback.set_model(self.redirect_model)
+
+ self.callback.on_train_begin(logs=logs)
+
+ def on_train_end(self, logs=None):
+ self.callback.on_train_end(logs=logs)
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/eval.py b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/eval.py
new file mode 100644
index 0000000..c0e22c2
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/callbacks/eval.py
@@ -0,0 +1,75 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+from ..utils.eval import evaluate
+
+
+class Evaluate(keras.callbacks.Callback):
+ """ Evaluation callback for arbitrary datasets.
+ """
+
+ def __init__(self, generator, iou_threshold=0.5, score_threshold=0.05, max_detections=100, save_path=None, tensorboard=None, verbose=1):
+ """ Evaluate a given dataset using a given model at the end of every epoch during training.
+
+ # Arguments
+ generator : The generator that represents the dataset to evaluate.
+ iou_threshold : The IoU threshold used to decide whether a detection is positive or negative.
+ score_threshold : The score confidence threshold to use for detections.
+ max_detections : The maximum number of detections to use per image.
+ save_path : The path to save images with visualized detections to.
+ tensorboard : Instance of keras.callbacks.TensorBoard used to log the mAP value.
+ verbose : Set the verbosity level, by default this is set to 1.
+ """
+ self.generator = generator
+ self.iou_threshold = iou_threshold
+ self.score_threshold = score_threshold
+ self.max_detections = max_detections
+ self.save_path = save_path
+ self.tensorboard = tensorboard
+ self.verbose = verbose
+
+ super(Evaluate, self).__init__()
+
+ def on_epoch_end(self, epoch, logs=None):
+ logs = logs or {}
+
+ # run evaluation
+ average_precisions = evaluate(
+ self.generator,
+ self.model,
+ iou_threshold=self.iou_threshold,
+ score_threshold=self.score_threshold,
+ max_detections=self.max_detections,
+ save_path=self.save_path
+ )
+
+ self.mean_ap = sum(average_precisions.values()) / len(average_precisions)
+
+ if self.tensorboard is not None and self.tensorboard.writer is not None:
+ import tensorflow as tf
+ summary = tf.Summary()
+ summary_value = summary.value.add()
+ summary_value.simple_value = self.mean_ap
+ summary_value.tag = "mAP"
+ self.tensorboard.writer.add_summary(summary, epoch)
+
+ logs['mAP'] = self.mean_ap
+
+ if self.verbose == 1:
+ for label, average_precision in average_precisions.items():
+ print(self.generator.label_to_name(label), '{:.4f}'.format(average_precision))
+ print('mAP: {:.4f}'.format(self.mean_ap))
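+# Usage sketch (hypothetical generator and callback names): attach to training so that
+# logs['mAP'] becomes available to other callbacks such as ModelCheckpoint(monitor='mAP', mode='max'):
+#
+#   evaluation = RedirectModel(Evaluate(validation_generator), prediction_model)
+#   training_model.fit_generator(..., callbacks=[evaluation, checkpoint])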
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/initializers.py b/engine/object_detection_branch/retina_net/keras_retinanet/initializers.py
new file mode 100644
index 0000000..f41faf8
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/initializers.py
@@ -0,0 +1,39 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+
+import numpy as np
+import math
+
+
+class PriorProbability(keras.initializers.Initializer):
+ """ Apply a prior probability to the weights.
+ """
+
+ def __init__(self, probability=0.01):
+ self.probability = probability
+
+ def get_config(self):
+ return {
+ 'probability': self.probability
+ }
+
+ def __call__(self, shape, dtype=None):
+ # set bias to -log((1 - p)/p) for foreground
+ result = np.ones(shape, dtype=dtype) * -math.log((1 - self.probability) / self.probability)
+
+ return result
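+# Usage sketch (hypothetical hyper-parameters): PriorProbability is intended as the bias
+# initializer of the final classification convolution, so every anchor starts with a low
+# foreground probability (~0.01) as in the focal loss paper:
+#
+#   keras.layers.Conv2D(
+#       filters=num_classes * num_anchors,
+#       kernel_size=3,
+#       padding='same',
+#       kernel_initializer=keras.initializers.normal(mean=0.0, stddev=0.01, seed=None),
+#       bias_initializer=PriorProbability(probability=0.01),
+#   )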
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/layers/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/layers/__init__.py
new file mode 100644
index 0000000..5a8c7d3
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/layers/__init__.py
@@ -0,0 +1,2 @@
+from ._misc import RegressBoxes, UpsampleLike, Anchors, ClipBoxes # noqa: F401
+from .filter_detections import FilterDetections # noqa: F401
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/layers/_misc.py b/engine/object_detection_branch/retina_net/keras_retinanet/layers/_misc.py
new file mode 100644
index 0000000..603b8af
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/layers/_misc.py
@@ -0,0 +1,166 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+
+import keras
+
+from .. import backend
+from ..utils import anchors as utils_anchors
+
+
+class Anchors(keras.layers.Layer):
+ """ Keras layer for generating achors for a given shape.
+ """
+
+ def __init__(self, size, stride, ratios=None, scales=None, *args, **kwargs):
+ """ Initializer for an Anchors layer.
+
+ Args
+ size: The base size of the anchors to generate.
+ stride: The stride of the anchors to generate.
+ ratios: The ratios of the anchors to generate (defaults to [0.5, 1, 2]).
+ scales: The scales of the anchors to generate (defaults to [2^0, 2^(1/3), 2^(2/3)]).
+ """
+ self.size = size
+ self.stride = stride
+ self.ratios = ratios
+ self.scales = scales
+
+ if ratios is None:
+ self.ratios = np.array([0.5, 1, 2], keras.backend.floatx())
+ elif isinstance(ratios, list):
+ self.ratios = np.array(ratios)
+ if scales is None:
+ self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)], keras.backend.floatx())
+ elif isinstance(scales, list):
+ self.scales = np.array(scales)
+
+ self.num_anchors = len(self.ratios) * len(self.scales)
+ self.anchors = keras.backend.variable(utils_anchors.generate_anchors(
+ base_size=size,
+ ratios=self.ratios,
+ scales=self.scales,
+ ))
+
+ super(Anchors, self).__init__(*args, **kwargs)
+
+ def call(self, inputs, **kwargs):
+ features = inputs
+ features_shape = keras.backend.shape(features)[:3]
+
+ # generate proposals from bbox deltas and shifted anchors
+ anchors = backend.shift(features_shape[1:3], self.stride, self.anchors)
+ anchors = keras.backend.tile(keras.backend.expand_dims(anchors, axis=0), (features_shape[0], 1, 1))
+
+ return anchors
+
+ def compute_output_shape(self, input_shape):
+ if None not in input_shape[1:]:
+ total = np.prod(input_shape[1:3]) * self.num_anchors
+ return (input_shape[0], total, 4)
+ else:
+ return (input_shape[0], None, 4)
+
+ def get_config(self):
+ config = super(Anchors, self).get_config()
+ config.update({
+ 'size' : self.size,
+ 'stride' : self.stride,
+ 'ratios' : self.ratios.tolist(),
+ 'scales' : self.scales.tolist(),
+ })
+
+ return config
+
+
+class UpsampleLike(keras.layers.Layer):
+ """ Keras layer for upsampling a Tensor to be the same shape as another Tensor.
+ """
+
+ def call(self, inputs, **kwargs):
+ source, target = inputs
+ target_shape = keras.backend.shape(target)
+ return backend.resize_images(source, (target_shape[1], target_shape[2]), method='nearest')
+
+ def compute_output_shape(self, input_shape):
+ return (input_shape[0][0],) + input_shape[1][1:3] + (input_shape[0][-1],)
+
+
+class RegressBoxes(keras.layers.Layer):
+ """ Keras layer for applying regression values to boxes.
+ """
+
+ def __init__(self, mean=None, std=None, *args, **kwargs):
+ """ Initializer for the RegressBoxes layer.
+
+ Args
+ mean: The mean value of the regression values which was used for normalization.
+ std: The standard deviation of the regression values used for normalization.
+ """
+ if mean is None:
+ mean = np.array([0, 0, 0, 0])
+ if std is None:
+ std = np.array([0.2, 0.2, 0.2, 0.2])
+
+ if isinstance(mean, (list, tuple)):
+ mean = np.array(mean)
+ elif not isinstance(mean, np.ndarray):
+ raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean)))
+
+ if isinstance(std, (list, tuple)):
+ std = np.array(std)
+ elif not isinstance(std, np.ndarray):
+ raise ValueError('Expected std to be a np.ndarray, list or tuple. Received: {}'.format(type(std)))
+
+ self.mean = mean
+ self.std = std
+ super(RegressBoxes, self).__init__(*args, **kwargs)
+
+ def call(self, inputs, **kwargs):
+ anchors, regression = inputs
+ return backend.bbox_transform_inv(anchors, regression, mean=self.mean, std=self.std)
+
+ def compute_output_shape(self, input_shape):
+ return input_shape[0]
+
+ def get_config(self):
+ config = super(RegressBoxes, self).get_config()
+ config.update({
+ 'mean': self.mean.tolist(),
+ 'std' : self.std.tolist(),
+ })
+
+ return config
+
+
+class ClipBoxes(keras.layers.Layer):
+ """ Keras layer to clip box values to lie inside a given shape.
+ """
+
+ def call(self, inputs, **kwargs):
+ image, boxes = inputs
+ shape = keras.backend.cast(keras.backend.shape(image), keras.backend.floatx())
+
+ x1 = backend.clip_by_value(boxes[:, :, 0], 0, shape[2])
+ y1 = backend.clip_by_value(boxes[:, :, 1], 0, shape[1])
+ x2 = backend.clip_by_value(boxes[:, :, 2], 0, shape[2])
+ y2 = backend.clip_by_value(boxes[:, :, 3], 0, shape[1])
+
+ return keras.backend.stack([x1, y1, x2, y2], axis=2)
+
+ def compute_output_shape(self, input_shape):
+ return input_shape[1]
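+# Usage sketch (hypothetical tensor names): these layers are chained per pyramid level when
+# building the inference graph, roughly:
+#
+#   anchors = Anchors(size=32, stride=8)(P3)             # anchors for one pyramid level
+#   boxes   = RegressBoxes()([anchors, regression])      # apply predicted box deltas
+#   boxes   = ClipBoxes()([image_input, boxes])          # clip boxes to the image boundaries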
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/layers/filter_detections.py b/engine/object_detection_branch/retina_net/keras_retinanet/layers/filter_detections.py
new file mode 100644
index 0000000..1f062b4
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/layers/filter_detections.py
@@ -0,0 +1,201 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+
+from .. import backend
+
+
+def filter_detections(boxes, classification, other=[], nms=True, score_threshold=0.05, max_detections=300, nms_threshold=0.5):
+ """ Filter detections using the boxes and classification values.
+
+ Args
+ boxes : Tensor of shape (num_boxes, 4) containing the boxes in (x1, y1, x2, y2) format.
+ classification : Tensor of shape (num_boxes, num_classes) containing the classification scores.
+ other : List of tensors of shape (num_boxes, ...) to filter along with the boxes and classification scores.
+ nms : Flag to enable/disable non maximum suppression.
+ score_threshold : Threshold used to prefilter the boxes with.
+ max_detections : Maximum number of detections to keep.
+ nms_threshold : Threshold for the IoU value to determine when a box should be suppressed.
+
+ Returns
+ A list of [boxes, scores, labels, other[0], other[1], ...].
+ boxes is shaped (max_detections, 4) and contains the (x1, y1, x2, y2) of the non-suppressed boxes.
+ scores is shaped (max_detections,) and contains the scores of the predicted class.
+ labels is shaped (max_detections,) and contains the predicted label.
+ other[i] is shaped (max_detections, ...) and contains the filtered other[i] data.
+ In case there are fewer than max_detections detections, the tensors are padded with -1's.
+ """
+ all_indices = []
+
+ # perform per class filtering
+ for c in range(int(classification.shape[1])):
+ scores = classification[:, c]
+
+ # threshold based on score
+ indices = backend.where(keras.backend.greater(scores, score_threshold))
+
+ if nms:
+ filtered_boxes = backend.gather_nd(boxes, indices)
+ filtered_scores = keras.backend.gather(scores, indices)[:, 0]
+
+ # perform NMS
+ nms_indices = backend.non_max_suppression(filtered_boxes, filtered_scores, max_output_size=max_detections, iou_threshold=nms_threshold)
+
+ # filter indices based on NMS
+ indices = keras.backend.gather(indices, nms_indices)
+
+ # add indices to list of all indices
+ labels = c * keras.backend.ones((keras.backend.shape(indices)[0],), dtype='int64')
+ indices = keras.backend.stack([indices[:, 0], labels], axis=1)
+ all_indices.append(indices)
+
+ # concatenate indices to single tensor
+ indices = keras.backend.concatenate(all_indices, axis=0)
+
+ # select top k
+ scores = backend.gather_nd(classification, indices)
+ labels = indices[:, 1]
+ scores, top_indices = backend.top_k(scores, k=keras.backend.minimum(max_detections, keras.backend.shape(scores)[0]))
+
+ # filter input using the final set of indices
+ indices = keras.backend.gather(indices[:, 0], top_indices)
+ boxes = keras.backend.gather(boxes, indices)
+ labels = keras.backend.gather(labels, top_indices)
+ other_ = [keras.backend.gather(o, indices) for o in other]
+
+ # zero pad the outputs
+ pad_size = keras.backend.maximum(0, max_detections - keras.backend.shape(scores)[0])
+ boxes = backend.pad(boxes, [[0, pad_size], [0, 0]], constant_values=-1)
+ scores = backend.pad(scores, [[0, pad_size]], constant_values=-1)
+ labels = backend.pad(labels, [[0, pad_size]], constant_values=-1)
+ labels = keras.backend.cast(labels, 'int32')
+ other_ = [backend.pad(o, [[0, pad_size]] + [[0, 0] for _ in range(1, len(o.shape))], constant_values=-1) for o in other_]
+
+ # set shapes, since we know what they are
+ boxes.set_shape([max_detections, 4])
+ scores.set_shape([max_detections])
+ labels.set_shape([max_detections])
+ for o, s in zip(other_, [list(keras.backend.int_shape(o)) for o in other]):
+ o.set_shape([max_detections] + s[1:])
+
+ return [boxes, scores, labels] + other_
+
+
+class FilterDetections(keras.layers.Layer):
+ """ Keras layer for filtering detections using score threshold and NMS.
+ """
+
+ def __init__(
+ self,
+ nms = True,
+ nms_threshold = 0.5,
+ score_threshold = 0.05,
+ max_detections = 300,
+ parallel_iterations = 32,
+ **kwargs
+ ):
+ """ Filters detections using score threshold, NMS and selecting the top-k detections.
+
+ Args
+ nms : Flag to enable/disable NMS.
+ nms_threshold : Threshold for the IoU value to determine when a box should be suppressed.
+ score_threshold : Threshold used to prefilter the boxes with.
+ max_detections : Maximum number of detections to keep.
+ parallel_iterations : Number of batch items to process in parallel.
+ """
+ self.nms = nms
+ self.nms_threshold = nms_threshold
+ self.score_threshold = score_threshold
+ self.max_detections = max_detections
+ self.parallel_iterations = parallel_iterations
+ super(FilterDetections, self).__init__(**kwargs)
+
+ def call(self, inputs, **kwargs):
+ """ Constructs the NMS graph.
+
+ Args
+ inputs : List of [boxes, classification, other[0], other[1], ...] tensors.
+ """
+ boxes = inputs[0]
+ classification = inputs[1]
+ other = inputs[2:]
+
+ # wrap nms with our parameters
+ def _filter_detections(args):
+ boxes = args[0]
+ classification = args[1]
+ other = args[2]
+
+ return filter_detections(
+ boxes,
+ classification,
+ other,
+ nms=self.nms,
+ score_threshold=self.score_threshold,
+ max_detections=self.max_detections,
+ nms_threshold=self.nms_threshold,
+ )
+
+ # call filter_detections on each batch
+ outputs = backend.map_fn(
+ _filter_detections,
+ elems=[boxes, classification, other],
+ dtype=[keras.backend.floatx(), keras.backend.floatx(), 'int32'] + [o.dtype for o in other],
+ parallel_iterations=self.parallel_iterations
+ )
+
+ return outputs
+
+ def compute_output_shape(self, input_shape):
+ """ Computes the output shapes given the input shapes.
+
+ Args
+ input_shape : List of input shapes [boxes, classification, other[0], other[1], ...].
+
+ Returns
+ List of tuples representing the output shapes:
+ [filtered_boxes.shape, filtered_scores.shape, filtered_labels.shape, filtered_other[0].shape, filtered_other[1].shape, ...]
+ """
+ return [
+ (input_shape[0][0], self.max_detections, 4),
+ (input_shape[1][0], self.max_detections),
+ (input_shape[1][0], self.max_detections),
+ ] + [
+ tuple([input_shape[i][0], self.max_detections] + list(input_shape[i][2:])) for i in range(2, len(input_shape))
+ ]
+
+ def compute_mask(self, inputs, mask=None):
+ """ This is required in Keras when there is more than 1 output.
+ """
+ return (len(inputs) + 1) * [None]
+
+ def get_config(self):
+ """ Gets the configuration of this layer.
+
+ Returns
+ Dictionary containing the parameters of this layer.
+ """
+ config = super(FilterDetections, self).get_config()
+ config.update({
+ 'nms' : self.nms,
+ 'nms_threshold' : self.nms_threshold,
+ 'score_threshold' : self.score_threshold,
+ 'max_detections' : self.max_detections,
+ 'parallel_iterations' : self.parallel_iterations,
+ })
+
+ return config
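+# Usage sketch (hypothetical tensor names): applied once to the concatenated outputs of all
+# pyramid levels when building the prediction model:
+#
+#   detections = FilterDetections(nms=True, name='filtered_detections')([boxes, classification])
+#
+# which returns [boxes, scores, labels] padded or truncated to max_detections per image.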
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/losses.py b/engine/object_detection_branch/retina_net/keras_retinanet/losses.py
new file mode 100644
index 0000000..ad8c2eb
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/losses.py
@@ -0,0 +1,118 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+
+from . import backend
+
+
+def focal(alpha=0.25, gamma=2.0):
+ """ Create a functor for computing the focal loss.
+
+ Args
+ alpha: Scale the focal weight with alpha.
+ gamma: Take the power of the focal weight with gamma.
+
+ Returns
+ A functor that computes the focal loss using the alpha and gamma.
+ """
+ def _focal(y_true, y_pred):
+ """ Compute the focal loss given the target tensor and the predicted tensor.
+
+ As defined in https://arxiv.org/abs/1708.02002
+
+ Args
+ y_true: Tensor of target data from the generator with shape (B, N, num_classes).
+ y_pred: Tensor of predicted data from the network with shape (B, N, num_classes).
+
+ Returns
+ The focal loss of y_pred w.r.t. y_true.
+ """
+ labels = y_true
+ classification = y_pred
+
+ # filter out "ignore" anchors
+ anchor_state = keras.backend.max(labels, axis=2) # -1 for ignore, 0 for background, 1 for object
+ indices = backend.where(keras.backend.not_equal(anchor_state, -1))
+ labels = backend.gather_nd(labels, indices)
+ classification = backend.gather_nd(classification, indices)
+
+ # compute the focal loss
+ alpha_factor = keras.backend.ones_like(labels) * alpha
+ alpha_factor = backend.where(keras.backend.equal(labels, 1), alpha_factor, 1 - alpha_factor)
+ focal_weight = backend.where(keras.backend.equal(labels, 1), 1 - classification, classification)
+ focal_weight = alpha_factor * focal_weight ** gamma
+
+ cls_loss = focal_weight * keras.backend.binary_crossentropy(labels, classification)
+
+ # compute the normalizer: the number of positive anchors
+ normalizer = backend.where(keras.backend.equal(anchor_state, 1))
+ normalizer = keras.backend.cast(keras.backend.shape(normalizer)[0], keras.backend.floatx())
+ normalizer = keras.backend.maximum(1.0, normalizer)
+
+ return keras.backend.sum(cls_loss) / normalizer
+
+ return _focal
+
+
+def smooth_l1(sigma=3.0):
+ """ Create a smooth L1 loss functor.
+
+ Args
+ sigma: This argument defines the point where the loss changes from L2 to L1.
+
+ Returns
+ A functor for computing the smooth L1 loss given target data and predicted data.
+ """
+ sigma_squared = sigma ** 2
+
+ def _smooth_l1(y_true, y_pred):
+ """ Compute the smooth L1 loss of y_pred w.r.t. y_true.
+
+ Args
+ y_true: Tensor from the generator of shape (B, N, 5). The last value for each box is the state of the anchor (ignore, negative, positive).
+ y_pred: Tensor from the network of shape (B, N, 4).
+
+ Returns
+ The smooth L1 loss of y_pred w.r.t. y_true.
+ """
+ # separate target and state
+ regression = y_pred
+ regression_target = y_true[:, :, :4]
+ anchor_state = y_true[:, :, 4]
+
+ # filter out "ignore" anchors
+ indices = backend.where(keras.backend.equal(anchor_state, 1))
+ regression = backend.gather_nd(regression, indices)
+ regression_target = backend.gather_nd(regression_target, indices)
+
+ # compute smooth L1 loss
+ # f(x) = 0.5 * (sigma * x)^2 if |x| < 1 / sigma / sigma
+ # |x| - 0.5 / sigma / sigma otherwise
+ regression_diff = regression - regression_target
+ regression_diff = keras.backend.abs(regression_diff)
+ regression_loss = backend.where(
+ keras.backend.less(regression_diff, 1.0 / sigma_squared),
+ 0.5 * sigma_squared * keras.backend.pow(regression_diff, 2),
+ regression_diff - 0.5 / sigma_squared
+ )
+
+ # compute the normalizer: the number of positive anchors
+ normalizer = keras.backend.maximum(1, keras.backend.shape(indices)[0])
+ normalizer = keras.backend.cast(normalizer, dtype=keras.backend.floatx())
+ return keras.backend.sum(regression_loss) / normalizer
+
+ return _smooth_l1
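+# Usage sketch: the two loss functors are attached to the training model by output name,
+# exactly as done in create_models in bin/train.py above:
+#
+#   training_model.compile(
+#       loss={'regression': smooth_l1(), 'classification': focal()},
+#       optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001),
+#   )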
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/models/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/models/__init__.py
new file mode 100644
index 0000000..4cc31a0
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/models/__init__.py
@@ -0,0 +1,81 @@
+class Backbone(object):
+ """ This class stores additional information on backbones.
+ """
+ def __init__(self, backbone):
+ # a dictionary mapping custom layer names to the correct classes
+ from .. import layers
+ from .. import losses
+ from .. import initializers
+ self.custom_objects = {
+ 'UpsampleLike' : layers.UpsampleLike,
+ 'PriorProbability' : initializers.PriorProbability,
+ 'RegressBoxes' : layers.RegressBoxes,
+ 'FilterDetections' : layers.FilterDetections,
+ 'Anchors' : layers.Anchors,
+ 'ClipBoxes' : layers.ClipBoxes,
+ '_smooth_l1' : losses.smooth_l1(),
+ '_focal' : losses.focal(),
+ }
+
+ self.backbone = backbone
+ self.validate()
+
+ def retinanet(self, *args, **kwargs):
+ """ Returns a retinanet model using the correct backbone.
+ """
+ raise NotImplementedError('retinanet method not implemented.')
+
+ def download_imagenet(self):
+ """ Downloads ImageNet weights and returns path to weights file.
+ """
+ raise NotImplementedError('download_imagenet method not implemented.')
+
+ def validate(self):
+ """ Checks whether the backbone string is correct.
+ """
+ raise NotImplementedError('validate method not implemented.')
+
+
+def backbone(backbone_name):
+ """ Returns a backbone object for the given backbone.
+ """
+ if 'resnet' in backbone_name:
+ from .resnet import ResNetBackbone as b
+ elif 'mobilenet' in backbone_name:
+ from .mobilenet import MobileNetBackbone as b
+ elif 'vgg' in backbone_name:
+ from .vgg import VGGBackbone as b
+ elif 'densenet' in backbone_name:
+ from .densenet import DenseNetBackbone as b
+ else:
+        raise NotImplementedError('Backbone class for \'{}\' not implemented.'.format(backbone_name))
+
+ return b(backbone_name)
+
+
+def load_model(filepath, backbone_name='resnet50', convert=False, nms=True):
+ """ Loads a retinanet model using the correct custom objects.
+
+ # Arguments
+ filepath: one of the following:
+ - string, path to the saved model, or
+ - h5py.File object from which to load the model
+ backbone_name: Backbone with which the model was trained.
+ convert: Boolean, whether to convert the model to an inference model.
+ nms: Boolean, whether to add NMS filtering to the converted model. Only valid if convert=True.
+
+ # Returns
+ A keras.models.Model object.
+
+ # Raises
+ ImportError: if h5py is not available.
+ ValueError: In case of an invalid savefile.
+ """
+ import keras.models
+
+ model = keras.models.load_model(filepath, custom_objects=backbone(backbone_name).custom_objects)
+ if convert:
+ from .retinanet import retinanet_bbox
+ model = retinanet_bbox(model=model, nms=nms)
+
+ return model
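+
+# Usage sketch (illustrative; the snapshot path is a placeholder): load a trained snapshot
+# and convert it into an inference model with in-graph box decoding and NMS.
+#
+#   model = load_model('/path/to/snapshot.h5', backbone_name='resnet50', convert=True)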
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/models/densenet.py b/engine/object_detection_branch/retina_net/keras_retinanet/models/densenet.py
new file mode 100644
index 0000000..ba86bb9
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/models/densenet.py
@@ -0,0 +1,93 @@
+"""
+Copyright 2018 vidosits (https://github.com/vidosits/)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+from keras.applications.densenet import DenseNet, get_file
+
+from . import Backbone
+from . import retinanet
+
+allowed_backbones = {'densenet121': [6, 12, 24, 16], 'densenet169': [6, 12, 32, 32], 'densenet201': [6, 12, 48, 32]}
+
+
+class DenseNetBackbone(Backbone):
+ """ Describes backbone information and provides utility functions.
+ """
+
+ def retinanet(self, *args, **kwargs):
+ """ Returns a retinanet model using the correct backbone.
+ """
+ return densenet_retinanet(*args, backbone=self.backbone, **kwargs)
+
+ def download_imagenet(self):
+ """ Download pre-trained weights for the specified backbone name.
+ This name is in the format {backbone}_weights_tf_dim_ordering_tf_kernels_notop
+ where backbone is the densenet + number of layers (e.g. densenet121).
+ For more info check the explanation from the keras densenet script itself:
+ https://github.com/keras-team/keras/blob/master/keras/applications/densenet.py
+ """
+ origin = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/'
+ file_name = '{}_weights_tf_dim_ordering_tf_kernels_notop.h5'
+
+ # load weights
+ if keras.backend.image_data_format() == 'channels_first':
+ raise ValueError('Weights for "channels_first" format are not available.')
+
+ weights_url = origin + file_name.format(self.backbone)
+ return get_file(file_name.format(self.backbone), weights_url, cache_subdir='models')
+
+ def validate(self):
+ """ Checks whether the backbone string is correct.
+ """
+ backbone = self.backbone.split('_')[0]
+
+ if backbone not in allowed_backbones:
+ raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones.keys()))
+
+
+def densenet_retinanet(num_classes, backbone='densenet121', inputs=None, modifier=None, **kwargs):
+ """ Constructs a retinanet model using a densenet backbone.
+
+ Args
+ num_classes: Number of classes to predict.
+ backbone: Which backbone to use (one of ('densenet121', 'densenet169', 'densenet201')).
+ inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)).
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example).
+
+ Returns
+ RetinaNet model with a DenseNet backbone.
+ """
+ # choose default input
+ if inputs is None:
+ inputs = keras.layers.Input((None, None, 3))
+
+ blocks = allowed_backbones[backbone]
+ densenet = DenseNet(blocks=blocks, input_tensor=inputs, include_top=False, pooling=None, weights=None)
+
+ # get last conv layer from the end of each dense block
+ layer_outputs = [densenet.get_layer(name='conv{}_block{}_concat'.format(idx + 2, block_num)).output for idx, block_num in enumerate(blocks)]
+
+ # create the densenet backbone
+ densenet = keras.models.Model(inputs=inputs, outputs=layer_outputs[1:], name=densenet.name)
+
+ # invoke modifier if given
+ if modifier:
+ densenet = modifier(densenet)
+
+ # create the full model
+ model = retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=densenet.outputs, **kwargs)
+
+ return model
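+
+# Usage sketch (illustrative): build an untrained DenseNet-121 RetinaNet, e.g. for the
+# 80 COCO classes; pre-trained ImageNet weights would be fetched separately via
+# DenseNetBackbone('densenet121').download_imagenet().
+#
+#   model = densenet_retinanet(num_classes=80, backbone='densenet121')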
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/models/mobilenet.py b/engine/object_detection_branch/retina_net/keras_retinanet/models/mobilenet.py
new file mode 100644
index 0000000..02d0c28
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/models/mobilenet.py
@@ -0,0 +1,110 @@
+"""
+Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+from keras.applications.mobilenet import MobileNet, BASE_WEIGHT_PATH, get_file, relu6, DepthwiseConv2D
+
+from . import Backbone
+from . import retinanet
+
+
+class MobileNetBackbone(Backbone):
+ """ Describes backbone information and provides utility functions.
+ """
+
+ allowed_backbones = ['mobilenet128', 'mobilenet160', 'mobilenet192', 'mobilenet224']
+
+ def __init__(self, backbone):
+ super(MobileNetBackbone, self).__init__(backbone)
+
+ self.custom_objects.update({
+ 'relu6': relu6,
+ 'DepthwiseConv2D': DepthwiseConv2D
+ })
+
+ def retinanet(self, *args, **kwargs):
+ """ Returns a retinanet model using the correct backbone.
+ """
+ return mobilenet_retinanet(*args, backbone=self.backbone, **kwargs)
+
+ def download_imagenet(self):
+ """ Download pre-trained weights for the specified backbone name.
+ This name is in the format mobilenet{rows}_{alpha} where rows is the
+ imagenet shape dimension and 'alpha' controls the width of the network.
+ For more info check the explanation from the keras mobilenet script itself.
+ """
+
+ alpha = float(self.backbone.split('_')[1])
+ rows = int(self.backbone.split('_')[0].replace('mobilenet', ''))
+
+ # load weights
+ if keras.backend.image_data_format() == 'channels_first':
+ raise ValueError('Weights for "channels_last" format '
+ 'are not available.')
+ if alpha == 1.0:
+ alpha_text = '1_0'
+ elif alpha == 0.75:
+ alpha_text = '7_5'
+ elif alpha == 0.50:
+ alpha_text = '5_0'
+ else:
+ alpha_text = '2_5'
+
+ model_name = 'mobilenet_{}_{}_tf_no_top.h5'.format(alpha_text, rows)
+ weights_url = BASE_WEIGHT_PATH + model_name
+ weights_path = get_file(model_name, weights_url, cache_subdir='models')
+
+ return weights_path
+
+ def validate(self):
+ """ Checks whether the backbone string is correct.
+ """
+ backbone = self.backbone.split('_')[0]
+
+ if backbone not in MobileNetBackbone.allowed_backbones:
+ raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, MobileNetBackbone.allowed_backbones))
+
+
+def mobilenet_retinanet(num_classes, backbone='mobilenet224_1.0', inputs=None, modifier=None, **kwargs):
+ """ Constructs a retinanet model using a mobilenet backbone.
+
+ Args
+ num_classes: Number of classes to predict.
+ backbone: Which backbone to use (one of ('mobilenet128', 'mobilenet160', 'mobilenet192', 'mobilenet224')).
+ inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)).
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example).
+
+ Returns
+ RetinaNet model with a MobileNet backbone.
+ """
+ alpha = float(backbone.split('_')[1])
+
+ # choose default input
+ if inputs is None:
+ inputs = keras.layers.Input((None, None, 3))
+
+ mobilenet = MobileNet(input_tensor=inputs, alpha=alpha, include_top=False, pooling=None, weights=None)
+
+ # create the full model
+ layer_names = ['conv_pw_5_relu', 'conv_pw_11_relu', 'conv_pw_13_relu']
+ layer_outputs = [mobilenet.get_layer(name).output for name in layer_names]
+ mobilenet = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=mobilenet.name)
+
+ # invoke modifier if given
+ if modifier:
+ mobilenet = modifier(mobilenet)
+
+ return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=mobilenet.outputs, **kwargs)
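+
+# Usage sketch (illustrative): the backbone string encodes the input resolution and the
+# width multiplier, e.g. 'mobilenet224_1.0' -> rows=224, alpha=1.0, which
+# download_imagenet() maps onto the matching '*_tf_no_top.h5' weights file.
+#
+#   model = mobilenet_retinanet(num_classes=80, backbone='mobilenet224_1.0')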
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/models/resnet.py b/engine/object_detection_branch/retina_net/keras_retinanet/models/resnet.py
new file mode 100644
index 0000000..394e1bb
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/models/resnet.py
@@ -0,0 +1,114 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import keras
+import keras_resnet
+import keras_resnet.models
+
+from . import Backbone
+from . import retinanet
+
+
+class ResNetBackbone(Backbone):
+ """ Describes backbone information and provides utility functions.
+ """
+
+ def __init__(self, backbone):
+ super(ResNetBackbone, self).__init__(backbone)
+ self.custom_objects.update(keras_resnet.custom_objects)
+
+ def retinanet(self, *args, **kwargs):
+ """ Returns a retinanet model using the correct backbone.
+ """
+ return resnet_retinanet(*args, backbone=self.backbone, **kwargs)
+
+ def download_imagenet(self):
+ """ Downloads ImageNet weights and returns path to weights file.
+ """
+ resnet_filename = 'ResNet-{}-model.keras.h5'
+ resnet_resource = 'https://github.com/fizyr/keras-models/releases/download/v0.0.1/{}'.format(resnet_filename)
+ depth = int(self.backbone.replace('resnet', ''))
+
+ filename = resnet_filename.format(depth)
+ resource = resnet_resource.format(depth)
+ if depth == 50:
+ checksum = '3e9f4e4f77bbe2c9bec13b53ee1c2319'
+ elif depth == 101:
+ checksum = '05dc86924389e5b401a9ea0348a3213c'
+ elif depth == 152:
+ checksum = '6ee11ef2b135592f8031058820bb9e71'
+
+ return keras.applications.imagenet_utils.get_file(
+ filename,
+ resource,
+ cache_subdir='models',
+ md5_hash=checksum
+ )
+
+ def validate(self):
+ """ Checks whether the backbone string is correct.
+ """
+ allowed_backbones = ['resnet50', 'resnet101', 'resnet152']
+ backbone = self.backbone.split('_')[0]
+
+ if backbone not in allowed_backbones:
+ raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones))
+
+
+def resnet_retinanet(num_classes, backbone='resnet50', inputs=None, modifier=None, **kwargs):
+ """ Constructs a retinanet model using a resnet backbone.
+
+ Args
+ num_classes: Number of classes to predict.
+ backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')).
+ inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)).
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example).
+
+ Returns
+ RetinaNet model with a ResNet backbone.
+ """
+ # choose default input
+ if inputs is None:
+ inputs = keras.layers.Input(shape=(None, None, 3))
+
+ # create the resnet backbone
+ if backbone == 'resnet50':
+ resnet = keras_resnet.models.ResNet50(inputs, include_top=False, freeze_bn=True)
+ elif backbone == 'resnet101':
+ resnet = keras_resnet.models.ResNet101(inputs, include_top=False, freeze_bn=True)
+ elif backbone == 'resnet152':
+ resnet = keras_resnet.models.ResNet152(inputs, include_top=False, freeze_bn=True)
+ else:
+ raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone))
+
+ # invoke modifier if given
+ if modifier:
+ resnet = modifier(resnet)
+
+ # create the full model
+ return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=resnet.outputs[1:], **kwargs)
+
+
+def resnet50_retinanet(num_classes, inputs=None, **kwargs):
+ return resnet_retinanet(num_classes=num_classes, backbone='resnet50', inputs=inputs, **kwargs)
+
+
+def resnet101_retinanet(num_classes, inputs=None, **kwargs):
+ return resnet_retinanet(num_classes=num_classes, backbone='resnet101', inputs=inputs, **kwargs)
+
+
+def resnet152_retinanet(num_classes, inputs=None, **kwargs):
+ return resnet_retinanet(num_classes=num_classes, backbone='resnet152', inputs=inputs, **kwargs)
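+
+# Usage sketch (illustrative): the shortcut constructors above only fix the backbone
+# string, so the following two calls build the same graph.
+#
+#   model_a = resnet50_retinanet(num_classes=80)
+#   model_b = resnet_retinanet(num_classes=80, backbone='resnet50')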
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/models/retinanet.py b/engine/object_detection_branch/retina_net/keras_retinanet/models/retinanet.py
new file mode 100644
index 0000000..e4b110a
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/models/retinanet.py
@@ -0,0 +1,355 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+
+import keras
+
+from .. import initializers
+from .. import layers
+
+
+def default_classification_model(
+ num_classes,
+ num_anchors,
+ pyramid_feature_size=256,
+ prior_probability=0.01,
+ classification_feature_size=256,
+ name='classification_submodel'
+):
+ """ Creates the default regression submodel.
+
+ Args
+ num_classes : Number of classes to predict a score for at each feature level.
+ num_anchors : Number of anchors to predict classification scores for at each feature level.
+ pyramid_feature_size : The number of filters to expect from the feature pyramid levels.
+ classification_feature_size : The number of filters to use in the layers in the classification submodel.
+ name : The name of the submodel.
+
+ Returns
+ A keras.models.Model that predicts classes for each anchor.
+ """
+ options = {
+ 'kernel_size' : 3,
+ 'strides' : 1,
+ 'padding' : 'same',
+ }
+
+ inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size))
+ outputs = inputs
+ for i in range(4):
+ outputs = keras.layers.Conv2D(
+ filters=classification_feature_size,
+ activation='relu',
+ name='pyramid_classification_{}'.format(i),
+ kernel_initializer=keras.initializers.normal(mean=0.0, stddev=0.01, seed=None),
+ bias_initializer='zeros',
+ **options
+ )(outputs)
+
+ outputs = keras.layers.Conv2D(
+ filters=num_classes * num_anchors,
+ kernel_initializer=keras.initializers.zeros(),
+ bias_initializer=initializers.PriorProbability(probability=prior_probability),
+ name='pyramid_classification',
+ **options
+ )(outputs)
+
+ # reshape output and apply sigmoid
+ outputs = keras.layers.Reshape((-1, num_classes), name='pyramid_classification_reshape')(outputs)
+ outputs = keras.layers.Activation('sigmoid', name='pyramid_classification_sigmoid')(outputs)
+
+ return keras.models.Model(inputs=inputs, outputs=outputs, name=name)
+
+
+def default_regression_model(num_anchors, pyramid_feature_size=256, regression_feature_size=256, name='regression_submodel'):
+ """ Creates the default regression submodel.
+
+ Args
+ num_anchors : Number of anchors to regress for each feature level.
+ pyramid_feature_size : The number of filters to expect from the feature pyramid levels.
+ regression_feature_size : The number of filters to use in the layers in the regression submodel.
+ name : The name of the submodel.
+
+ Returns
+ A keras.models.Model that predicts regression values for each anchor.
+ """
+ # All new conv layers except the final one in the
+ # RetinaNet (classification) subnets are initialized
+ # with bias b = 0 and a Gaussian weight fill with stddev = 0.01.
+ options = {
+ 'kernel_size' : 3,
+ 'strides' : 1,
+ 'padding' : 'same',
+ 'kernel_initializer' : keras.initializers.normal(mean=0.0, stddev=0.01, seed=None),
+ 'bias_initializer' : 'zeros'
+ }
+
+ inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size))
+ outputs = inputs
+ for i in range(4):
+ outputs = keras.layers.Conv2D(
+ filters=regression_feature_size,
+ activation='relu',
+ name='pyramid_regression_{}'.format(i),
+ **options
+ )(outputs)
+
+ outputs = keras.layers.Conv2D(num_anchors * 4, name='pyramid_regression', **options)(outputs)
+ outputs = keras.layers.Reshape((-1, 4), name='pyramid_regression_reshape')(outputs)
+
+ return keras.models.Model(inputs=inputs, outputs=outputs, name=name)
+
+
+def __create_pyramid_features(C3, C4, C5, feature_size=256):
+ """ Creates the FPN layers on top of the backbone features.
+
+ Args
+ C3 : Feature stage C3 from the backbone.
+ C4 : Feature stage C4 from the backbone.
+ C5 : Feature stage C5 from the backbone.
+ feature_size : The feature size to use for the resulting feature levels.
+
+ Returns
+ A list of feature levels [P3, P4, P5, P6, P7].
+ """
+ # upsample C5 to get P5 from the FPN paper
+ P5 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C5_reduced')(C5)
+ P5_upsampled = layers.UpsampleLike(name='P5_upsampled')([P5, C4])
+ P5 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P5')(P5)
+
+ # add P5 elementwise to C4
+ P4 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C4_reduced')(C4)
+ P4 = keras.layers.Add(name='P4_merged')([P5_upsampled, P4])
+ P4_upsampled = layers.UpsampleLike(name='P4_upsampled')([P4, C3])
+ P4 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P4')(P4)
+
+ # add P4 elementwise to C3
+ P3 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C3_reduced')(C3)
+ P3 = keras.layers.Add(name='P3_merged')([P4_upsampled, P3])
+ P3 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P3')(P3)
+
+ # "P6 is obtained via a 3x3 stride-2 conv on C5"
+ P6 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P6')(C5)
+
+ # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6"
+ P7 = keras.layers.Activation('relu', name='C6_relu')(P6)
+ P7 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P7')(P7)
+
+ return [P3, P4, P5, P6, P7]
+
+
+class AnchorParameters:
+ """ The parameteres that define how anchors are generated.
+
+ Args
+ sizes : List of sizes to use. Each size corresponds to one feature level.
+        strides : List of strides to use. Each stride corresponds to one feature level.
+ ratios : List of ratios to use per location in a feature map.
+ scales : List of scales to use per location in a feature map.
+ """
+ def __init__(self, sizes, strides, ratios, scales):
+ self.sizes = sizes
+ self.strides = strides
+ self.ratios = ratios
+ self.scales = scales
+
+ def num_anchors(self):
+ return len(self.ratios) * len(self.scales)
+
+
+"""
+The default anchor parameters.
+"""
+AnchorParameters.default = AnchorParameters(
+ sizes = [32, 64, 128, 256, 512],
+ strides = [8, 16, 32, 64, 128],
+ ratios = np.array([0.5, 1, 2], keras.backend.floatx()),
+ scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)], keras.backend.floatx()),
+)
+
+
+def default_submodels(num_classes, num_anchors):
+ """ Create a list of default submodels used for object detection.
+
+    The default submodels contain a regression submodel and a classification submodel.
+
+ Args
+ num_classes : Number of classes to use.
+ num_anchors : Number of base anchors.
+
+ Returns
+        A list of tuples, where the first element is the name of the submodel and the second element is the submodel itself.
+ """
+ return [
+ ('regression', default_regression_model(num_anchors)),
+ ('classification', default_classification_model(num_classes, num_anchors))
+ ]
+
+
+def __build_model_pyramid(name, model, features):
+ """ Applies a single submodel to each FPN level.
+
+ Args
+ name : Name of the submodel.
+ model : The submodel to evaluate.
+ features : The FPN features.
+
+ Returns
+ A tensor containing the response from the submodel on the FPN features.
+ """
+ return keras.layers.Concatenate(axis=1, name=name)([model(f) for f in features])
+
+
+def __build_pyramid(models, features):
+ """ Applies all submodels to each FPN level.
+
+ Args
+        models : List of submodels to run on each pyramid level (by default only regression and classification).
+ features : The FPN features.
+
+ Returns
+ A list of tensors, one for each submodel.
+ """
+ return [__build_model_pyramid(n, m, features) for n, m in models]
+
+
+def __build_anchors(anchor_parameters, features):
+ """ Builds anchors for the shape of the features from FPN.
+
+ Args
+        anchor_parameters : Parameters that determine how anchors are generated.
+ features : The FPN features.
+
+ Returns
+ A tensor containing the anchors for the FPN features.
+
+ The shape is:
+ ```
+ (batch_size, num_anchors, 4)
+ ```
+ """
+ anchors = [
+ layers.Anchors(
+ size=anchor_parameters.sizes[i],
+ stride=anchor_parameters.strides[i],
+ ratios=anchor_parameters.ratios,
+ scales=anchor_parameters.scales,
+ name='anchors_{}'.format(i)
+ )(f) for i, f in enumerate(features)
+ ]
+
+ return keras.layers.Concatenate(axis=1, name='anchors')(anchors)
+
+
+def retinanet(
+ inputs,
+ backbone_layers,
+ num_classes,
+ num_anchors = 9,
+ create_pyramid_features = __create_pyramid_features,
+ submodels = None,
+ name = 'retinanet'
+):
+ """ Construct a RetinaNet model on top of a backbone.
+
+    This model is the minimum model necessary for training; anchors and box decoding are only appended by retinanet_bbox for inference.
+
+ Args
+ inputs : keras.layers.Input (or list of) for the input to the model.
+ num_classes : Number of classes to classify.
+ num_anchors : Number of base anchors.
+ create_pyramid_features : Functor for creating pyramid features given the features C3, C4, C5 from the backbone.
+ submodels : Submodels to run on each feature map (default is regression and classification submodels).
+ name : Name of the model.
+
+ Returns
+ A keras.models.Model which takes an image as input and outputs generated anchors and the result from each submodel on every pyramid level.
+
+ The order of the outputs is as defined in submodels:
+ ```
+ [
+ regression, classification, other[0], other[1], ...
+ ]
+ ```
+ """
+ if submodels is None:
+ submodels = default_submodels(num_classes, num_anchors)
+
+ C3, C4, C5 = backbone_layers
+
+ # compute pyramid features as per https://arxiv.org/abs/1708.02002
+ features = create_pyramid_features(C3, C4, C5)
+
+ # for all pyramid levels, run available submodels
+ pyramids = __build_pyramid(submodels, features)
+
+ return keras.models.Model(inputs=inputs, outputs=pyramids, name=name)
+
+
+def retinanet_bbox(
+ model = None,
+ anchor_parameters = AnchorParameters.default,
+ nms = True,
+ name = 'retinanet-bbox',
+ **kwargs
+):
+ """ Construct a RetinaNet model on top of a backbone and adds convenience functions to output boxes directly.
+
+ This model uses the minimum retinanet model and appends a few layers to compute boxes within the graph.
+ These layers include applying the regression values to the anchors and performing NMS.
+
+ Args
+ model : RetinaNet model to append bbox layers to. If None, it will create a RetinaNet model using **kwargs.
+ anchor_parameters : Struct containing configuration for anchor generation (sizes, strides, ratios, scales).
+ name : Name of the model.
+        **kwargs : Additional kwargs to pass to the minimal retinanet model.
+
+ Returns
+ A keras.models.Model which takes an image as input and outputs the detections on the image.
+
+ The order is defined as follows:
+ ```
+ [
+ boxes, scores, labels, other[0], other[1], ...
+ ]
+ ```
+ """
+ if model is None:
+ model = retinanet(num_anchors=anchor_parameters.num_anchors(), **kwargs)
+
+ # compute the anchors
+ features = [model.get_layer(p_name).output for p_name in ['P3', 'P4', 'P5', 'P6', 'P7']]
+ anchors = __build_anchors(anchor_parameters, features)
+
+ # we expect the anchors, regression and classification values as first output
+ regression = model.outputs[0]
+ classification = model.outputs[1]
+
+ # "other" can be any additional output from custom submodels, by default this will be []
+ other = model.outputs[2:]
+
+ # apply predicted regression to anchors
+ boxes = layers.RegressBoxes(name='boxes')([anchors, regression])
+ boxes = layers.ClipBoxes(name='clipped_boxes')([model.inputs[0], boxes])
+
+ # filter detections (apply NMS / score threshold / select top-k)
+ detections = layers.FilterDetections(nms=nms, name='filtered_detections')([boxes, classification] + other)
+
+ outputs = detections
+
+ # construct the model
+ return keras.models.Model(inputs=model.inputs, outputs=outputs, name=name)
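+
+# Usage sketch (illustrative): a minimal training model and its inference counterpart,
+# assuming the backbone layers come from one of the backbone modules in this package
+# (resnet_retinanet is defined in models/resnet.py).
+#
+#   training_model  = resnet_retinanet(num_classes=80, backbone='resnet50')
+#   inference_model = retinanet_bbox(model=training_model)  # appends anchors, box decoding and NMS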
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/models/vgg.py b/engine/object_detection_branch/retina_net/keras_retinanet/models/vgg.py
new file mode 100644
index 0000000..73c52f8
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/models/vgg.py
@@ -0,0 +1,92 @@
+"""
+Copyright 2017-2018 cgratie (https://github.com/cgratie/)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+import keras
+
+from . import Backbone
+from . import retinanet
+
+
+class VGGBackbone(Backbone):
+ """ Describes backbone information and provides utility functions.
+ """
+
+ def retinanet(self, *args, **kwargs):
+ """ Returns a retinanet model using the correct backbone.
+ """
+ return vgg_retinanet(*args, backbone=self.backbone, **kwargs)
+
+ def download_imagenet(self):
+ """ Downloads ImageNet weights and returns path to weights file.
+ Weights can be downloaded at https://github.com/fizyr/keras-models/releases .
+ """
+ if self.backbone == 'vgg16':
+ resource = keras.applications.vgg16.WEIGHTS_PATH_NO_TOP
+ checksum = '6d6bbae143d832006294945121d1f1fc'
+ elif self.backbone == 'vgg19':
+ resource = keras.applications.vgg19.WEIGHTS_PATH_NO_TOP
+ checksum = '253f8cb515780f3b799900260a226db6'
+ else:
+ raise ValueError("Backbone '{}' not recognized.".format(self.backbone))
+
+ return keras.applications.imagenet_utils.get_file(
+ '{}_weights_tf_dim_ordering_tf_kernels_notop.h5'.format(self.backbone),
+ resource,
+ cache_subdir='models',
+ file_hash=checksum
+ )
+
+ def validate(self):
+ """ Checks whether the backbone string is correct.
+ """
+ allowed_backbones = ['vgg16', 'vgg19']
+
+ if self.backbone not in allowed_backbones:
+ raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(self.backbone, allowed_backbones))
+
+
+def vgg_retinanet(num_classes, backbone='vgg16', inputs=None, modifier=None, **kwargs):
+ """ Constructs a retinanet model using a vgg backbone.
+
+ Args
+ num_classes: Number of classes to predict.
+ backbone: Which backbone to use (one of ('vgg16', 'vgg19')).
+ inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)).
+ modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example).
+
+ Returns
+ RetinaNet model with a VGG backbone.
+ """
+ # choose default input
+ if inputs is None:
+ inputs = keras.layers.Input(shape=(None, None, 3))
+
+ # create the vgg backbone
+ if backbone == 'vgg16':
+ vgg = keras.applications.VGG16(input_tensor=inputs, include_top=False)
+ elif backbone == 'vgg19':
+ vgg = keras.applications.VGG19(input_tensor=inputs, include_top=False)
+ else:
+ raise ValueError("Backbone '{}' not recognized.".format(backbone))
+
+ if modifier:
+ vgg = modifier(vgg)
+
+ # create the full model
+ layer_names = ["block3_pool", "block4_pool", "block5_pool"]
+ layer_outputs = [vgg.get_layer(name).output for name in layer_names]
+ return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=layer_outputs, **kwargs)
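+
+# Usage sketch (illustrative): the same model can also be obtained through the generic
+# factory in models/__init__.py.
+#
+#   model = backbone('vgg16').retinanet(num_classes=80)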
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/coco.py b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/coco.py
new file mode 100644
index 0000000..66287d7
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/coco.py
@@ -0,0 +1,144 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..preprocessing.generator import Generator
+from ..utils.image import read_image_bgr
+
+import os
+import numpy as np
+
+from pycocotools.coco import COCO
+
+
+class CocoGenerator(Generator):
+ """ Generate data from the COCO dataset.
+
+ See https://github.com/cocodataset/cocoapi/tree/master/PythonAPI for more information.
+ """
+
+ def __init__(self, data_dir, set_name, **kwargs):
+ """ Initialize a COCO data generator.
+
+ Args
+ data_dir: Path to where the COCO dataset is stored.
+ set_name: Name of the set to parse.
+ """
+ self.data_dir = data_dir
+ self.set_name = set_name
+ self.coco = COCO(os.path.join(data_dir, 'annotations', 'instances_' + set_name + '.json'))
+ self.image_ids = self.coco.getImgIds()
+
+ self.load_classes()
+
+ super(CocoGenerator, self).__init__(**kwargs)
+
+ def load_classes(self):
+ """ Loads the class to label mapping (and inverse) for COCO.
+ """
+ # load class names (name -> label)
+ categories = self.coco.loadCats(self.coco.getCatIds())
+ categories.sort(key=lambda x: x['id'])
+
+ self.classes = {}
+ self.coco_labels = {}
+ self.coco_labels_inverse = {}
+ for c in categories:
+ self.coco_labels[len(self.classes)] = c['id']
+ self.coco_labels_inverse[c['id']] = len(self.classes)
+ self.classes[c['name']] = len(self.classes)
+
+ # also load the reverse (label -> name)
+ self.labels = {}
+ for key, value in self.classes.items():
+ self.labels[value] = key
+
+ def size(self):
+ """ Size of the COCO dataset.
+ """
+ return len(self.image_ids)
+
+ def num_classes(self):
+ """ Number of classes in the dataset. For COCO this is 80.
+ """
+ return len(self.classes)
+
+ def name_to_label(self, name):
+ """ Map name to label.
+ """
+ return self.classes[name]
+
+ def label_to_name(self, label):
+ """ Map label to name.
+ """
+ return self.labels[label]
+
+ def coco_label_to_label(self, coco_label):
+ """ Map COCO label to the label as used in the network.
+ COCO has some gaps in the order of labels. The highest label is 90, but there are 80 classes.
+ """
+ return self.coco_labels_inverse[coco_label]
+
+ def coco_label_to_name(self, coco_label):
+ """ Map COCO label to name.
+ """
+ return self.label_to_name(self.coco_label_to_label(coco_label))
+
+ def label_to_coco_label(self, label):
+ """ Map label as used by the network to labels as used by COCO.
+ """
+ return self.coco_labels[label]
+
+ def image_aspect_ratio(self, image_index):
+ """ Compute the aspect ratio for an image with image_index.
+ """
+ image = self.coco.loadImgs(self.image_ids[image_index])[0]
+ return float(image['width']) / float(image['height'])
+
+ def load_image(self, image_index):
+ """ Load an image at the image_index.
+ """
+ image_info = self.coco.loadImgs(self.image_ids[image_index])[0]
+ path = os.path.join(self.data_dir, 'images', self.set_name, image_info['file_name'])
+ return read_image_bgr(path)
+
+ def load_annotations(self, image_index):
+ """ Load annotations for an image_index.
+ """
+ # get ground truth annotations
+ annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False)
+ annotations = np.zeros((0, 5))
+
+ # some images appear to miss annotations (like image with id 257034)
+ if len(annotations_ids) == 0:
+ return annotations
+
+ # parse annotations
+ coco_annotations = self.coco.loadAnns(annotations_ids)
+ for idx, a in enumerate(coco_annotations):
+ # some annotations have basically no width / height, skip them
+ if a['bbox'][2] < 1 or a['bbox'][3] < 1:
+ continue
+
+ annotation = np.zeros((1, 5))
+ annotation[0, :4] = a['bbox']
+ annotation[0, 4] = self.coco_label_to_label(a['category_id'])
+ annotations = np.append(annotations, annotation, axis=0)
+
+ # transform from [x, y, w, h] to [x1, y1, x2, y2]
+ annotations[:, 2] = annotations[:, 0] + annotations[:, 2]
+ annotations[:, 3] = annotations[:, 1] + annotations[:, 3]
+
+ return annotations
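+
+# Usage sketch (illustrative; the path and set name are placeholders): expects the layout
+# used above, i.e. annotations/instances_<set_name>.json and images/<set_name>/.
+#
+#   generator = CocoGenerator('/path/to/COCO', 'train2017', batch_size=1)
+#   inputs, targets = next(generator)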
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/csv_generator.py b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/csv_generator.py
new file mode 100644
index 0000000..39fd4a6
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/csv_generator.py
@@ -0,0 +1,214 @@
+"""
+Copyright 2017-2018 yhenon (https://github.com/yhenon/)
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .generator import Generator
+from ..utils.image import read_image_bgr
+
+import numpy as np
+from PIL import Image
+from six import raise_from
+
+import csv
+import sys
+import os.path
+
+
+def _parse(value, function, fmt):
+ """
+ Parse a string into a value, and format a nice ValueError if it fails.
+
+ Returns `function(value)`.
+    Any `ValueError` raised is caught and a new `ValueError` is raised
+ with message `fmt.format(e)`, where `e` is the caught `ValueError`.
+ """
+ try:
+ return function(value)
+ except ValueError as e:
+ raise_from(ValueError(fmt.format(e)), None)
+
+
+def _read_classes(csv_reader):
+ """ Parse the classes file given by csv_reader.
+ """
+ result = {}
+ for line, row in enumerate(csv_reader):
+ line += 1
+
+ try:
+ class_name, class_id = row
+ except ValueError:
+ raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None)
+ class_id = _parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line))
+
+ if class_name in result:
+ raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name))
+ result[class_name] = class_id
+ return result
+
+
+def _read_annotations(csv_reader, classes):
+ """ Read annotations from the csv_reader.
+ """
+ result = {}
+ for line, row in enumerate(csv_reader):
+ line += 1
+
+ try:
+ img_file, x1, y1, x2, y2, class_name = row[:6]
+ except ValueError:
+ raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None)
+
+ if img_file not in result:
+ result[img_file] = []
+
+ # If a row contains only an image path, it's an image without annotations.
+ if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''):
+ continue
+
+ x1 = _parse(x1, int, 'line {}: malformed x1: {{}}'.format(line))
+ y1 = _parse(y1, int, 'line {}: malformed y1: {{}}'.format(line))
+ x2 = _parse(x2, int, 'line {}: malformed x2: {{}}'.format(line))
+ y2 = _parse(y2, int, 'line {}: malformed y2: {{}}'.format(line))
+
+ # Check that the bounding box is valid.
+ if x2 <= x1:
+ raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
+ if y2 <= y1:
+ raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))
+
+ # check if the current class name is correctly present
+ if class_name not in classes:
+ raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes))
+
+ result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name})
+ return result
+
+
+def _open_for_csv(path):
+ """ Open a file with flags suitable for csv.reader.
+
+    For Python 2 this means opening with mode 'rb';
+    for Python 3 it means mode 'r' with universal newlines.
+ """
+ if sys.version_info[0] < 3:
+ return open(path, 'rb')
+ else:
+ return open(path, 'r', newline='')
+
+
+class CSVGenerator(Generator):
+ """ Generate data for a custom CSV dataset.
+
+ See https://github.com/fizyr/keras-retinanet#csv-datasets for more information.
+ """
+
+ def __init__(
+ self,
+ csv_data_file,
+ csv_class_file,
+ base_dir=None,
+ **kwargs
+ ):
+ """ Initialize a CSV data generator.
+
+ Args
+ csv_data_file: Path to the CSV annotations file.
+ csv_class_file: Path to the CSV classes file.
+ base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file).
+ """
+ self.image_names = []
+ self.image_data = {}
+ self.base_dir = base_dir
+
+ # Take base_dir from annotations file if not explicitly specified.
+ if self.base_dir is None:
+ self.base_dir = os.path.dirname(csv_data_file)
+
+ # parse the provided class file
+ try:
+ with _open_for_csv(csv_class_file) as file:
+ self.classes = _read_classes(csv.reader(file, delimiter=','))
+ except ValueError as e:
+ raise_from(ValueError('invalid CSV class file: {}: {}'.format(csv_class_file, e)), None)
+
+ self.labels = {}
+ for key, value in self.classes.items():
+ self.labels[value] = key
+
+ # csv with img_path, x1, y1, x2, y2, class_name
+ try:
+ with _open_for_csv(csv_data_file) as file:
+ self.image_data = _read_annotations(csv.reader(file, delimiter=','), self.classes)
+ except ValueError as e:
+ raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(csv_data_file, e)), None)
+ self.image_names = list(self.image_data.keys())
+
+ super(CSVGenerator, self).__init__(**kwargs)
+
+ def size(self):
+ """ Size of the dataset.
+ """
+ return len(self.image_names)
+
+ def num_classes(self):
+ """ Number of classes in the dataset.
+ """
+ return max(self.classes.values()) + 1
+
+ def name_to_label(self, name):
+ """ Map name to label.
+ """
+ return self.classes[name]
+
+ def label_to_name(self, label):
+ """ Map label to name.
+ """
+ return self.labels[label]
+
+ def image_path(self, image_index):
+ """ Returns the image path for image_index.
+ """
+ return os.path.join(self.base_dir, self.image_names[image_index])
+
+ def image_aspect_ratio(self, image_index):
+ """ Compute the aspect ratio for an image with image_index.
+ """
+ # PIL is fast for metadata
+ image = Image.open(self.image_path(image_index))
+ return float(image.width) / float(image.height)
+
+ def load_image(self, image_index):
+ """ Load an image at the image_index.
+ """
+ return read_image_bgr(self.image_path(image_index))
+
+ def load_annotations(self, image_index):
+ """ Load annotations for an image_index.
+ """
+ path = self.image_names[image_index]
+ annots = self.image_data[path]
+ boxes = np.zeros((len(annots), 5))
+
+ for idx, annot in enumerate(annots):
+ class_name = annot['class']
+ boxes[idx, 0] = float(annot['x1'])
+ boxes[idx, 1] = float(annot['y1'])
+ boxes[idx, 2] = float(annot['x2'])
+ boxes[idx, 3] = float(annot['y2'])
+ boxes[idx, 4] = self.name_to_label(class_name)
+
+ return boxes
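+
+# Usage sketch (illustrative; file names are placeholders): the annotations CSV holds one
+# box per row as 'img_file,x1,y1,x2,y2,class_name' and the classes CSV maps
+# 'class_name,class_id', as parsed by _read_annotations and _read_classes above.
+#
+#   generator = CSVGenerator('annotations.csv', 'classes.csv', batch_size=1)
+#   image = generator.load_image(0)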
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/generator.py b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/generator.py
new file mode 100644
index 0000000..dec47d3
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/generator.py
@@ -0,0 +1,298 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+import random
+import threading
+import warnings
+
+import keras
+
+from ..utils.anchors import anchor_targets_bbox, bbox_transform
+from ..utils.image import (
+ TransformParameters,
+ adjust_transform_for_image,
+ apply_transform,
+ preprocess_image,
+ resize_image,
+)
+from ..utils.transform import transform_aabb
+
+
+class Generator(object):
+ """ Abstract generator class.
+ """
+
+ def __init__(
+ self,
+ transform_generator = None,
+ batch_size=1,
+ group_method='ratio', # one of 'none', 'random', 'ratio'
+ shuffle_groups=True,
+ image_min_side=800,
+ image_max_side=1333,
+ transform_parameters=None,
+ compute_anchor_targets=anchor_targets_bbox,
+ ):
+ """ Initialize Generator object.
+
+ Args
+ transform_generator : A generator used to randomly transform images and annotations.
+ batch_size : The size of the batches to generate.
+ group_method : Determines how images are grouped together (defaults to 'ratio', one of ('none', 'random', 'ratio')).
+ shuffle_groups : If True, shuffles the groups each epoch.
+ image_min_side : After resizing the minimum side of an image is equal to image_min_side.
+ image_max_side : If after resizing the maximum side is larger than image_max_side, scales down further so that the max side is equal to image_max_side.
+ transform_parameters : The transform parameters used for data augmentation.
+ compute_anchor_targets : Function handler for computing the targets of anchors for an image and its annotations.
+ """
+ self.transform_generator = transform_generator
+ self.batch_size = int(batch_size)
+ self.group_method = group_method
+ self.shuffle_groups = shuffle_groups
+ self.image_min_side = image_min_side
+ self.image_max_side = image_max_side
+ self.transform_parameters = transform_parameters or TransformParameters()
+ self.compute_anchor_targets = compute_anchor_targets
+
+ self.group_index = 0
+ self.lock = threading.Lock()
+
+ self.group_images()
+
+ def size(self):
+ """ Size of the dataset.
+ """
+ raise NotImplementedError('size method not implemented')
+
+ def num_classes(self):
+ """ Number of classes in the dataset.
+ """
+ raise NotImplementedError('num_classes method not implemented')
+
+ def name_to_label(self, name):
+ """ Map name to label.
+ """
+ raise NotImplementedError('name_to_label method not implemented')
+
+ def label_to_name(self, label):
+ """ Map label to name.
+ """
+ raise NotImplementedError('label_to_name method not implemented')
+
+ def image_aspect_ratio(self, image_index):
+ """ Compute the aspect ratio for an image with image_index.
+ """
+ raise NotImplementedError('image_aspect_ratio method not implemented')
+
+ def load_image(self, image_index):
+ """ Load an image at the image_index.
+ """
+ raise NotImplementedError('load_image method not implemented')
+
+ def load_annotations(self, image_index):
+ """ Load annotations for an image_index.
+ """
+ raise NotImplementedError('load_annotations method not implemented')
+
+ def load_annotations_group(self, group):
+ """ Load annotations for all images in group.
+ """
+ return [self.load_annotations(image_index) for image_index in group]
+
+ def filter_annotations(self, image_group, annotations_group, group):
+ """ Filter annotations by removing those that are outside of the image bounds or whose width/height < 0.
+ """
+ # test all annotations
+ for index, (image, annotations) in enumerate(zip(image_group, annotations_group)):
+ assert(isinstance(annotations, np.ndarray)), '\'load_annotations\' should return a list of numpy arrays, received: {}'.format(type(annotations))
+
+ # test x2 < x1 | y2 < y1 | x1 < 0 | y1 < 0 | x2 <= 0 | y2 <= 0 | x2 >= image.shape[1] | y2 >= image.shape[0]
+ invalid_indices = np.where(
+ (annotations[:, 2] <= annotations[:, 0]) |
+ (annotations[:, 3] <= annotations[:, 1]) |
+ (annotations[:, 0] < 0) |
+ (annotations[:, 1] < 0) |
+ (annotations[:, 2] > image.shape[1]) |
+ (annotations[:, 3] > image.shape[0])
+ )[0]
+
+ # delete invalid indices
+ if len(invalid_indices):
+ warnings.warn('Image with id {} (shape {}) contains the following invalid boxes: {}.'.format(
+ group[index],
+ image.shape,
+ [annotations[invalid_index, :] for invalid_index in invalid_indices]
+ ))
+ annotations_group[index] = np.delete(annotations, invalid_indices, axis=0)
+
+ return image_group, annotations_group
+
+ def load_image_group(self, group):
+ """ Load images for all images in a group.
+ """
+ return [self.load_image(image_index) for image_index in group]
+
+ def random_transform_group_entry(self, image, annotations):
+ """ Randomly transforms image and annotation.
+ """
+ # randomly transform both image and annotations
+ if self.transform_generator:
+ transform = adjust_transform_for_image(next(self.transform_generator), image, self.transform_parameters.relative_translation)
+ image = apply_transform(transform, image, self.transform_parameters)
+
+ # Transform the bounding boxes in the annotations.
+ annotations = annotations.copy()
+ for index in range(annotations.shape[0]):
+ annotations[index, :4] = transform_aabb(transform, annotations[index, :4])
+
+ return image, annotations
+
+ def resize_image(self, image):
+ """ Resize an image using image_min_side and image_max_side.
+ """
+ return resize_image(image, min_side=self.image_min_side, max_side=self.image_max_side)
+
+ def preprocess_image(self, image):
+ """ Preprocess an image (e.g. subtracts ImageNet mean).
+ """
+ return preprocess_image(image)
+
+ def preprocess_group_entry(self, image, annotations):
+ """ Preprocess image and its annotations.
+ """
+ # preprocess the image
+ image = self.preprocess_image(image)
+
+ # randomly transform image and annotations
+ image, annotations = self.random_transform_group_entry(image, annotations)
+
+ # resize image
+ image, image_scale = self.resize_image(image)
+
+ # apply resizing to annotations too
+ annotations[:, :4] *= image_scale
+
+ return image, annotations
+
+ def preprocess_group(self, image_group, annotations_group):
+ """ Preprocess each image and its annotations in its group.
+ """
+ for index, (image, annotations) in enumerate(zip(image_group, annotations_group)):
+ # preprocess a single group entry
+ image, annotations = self.preprocess_group_entry(image, annotations)
+
+ # copy processed data back to group
+ image_group[index] = image
+ annotations_group[index] = annotations
+
+ return image_group, annotations_group
+
+ def group_images(self):
+ """ Order the images according to self.order and makes groups of self.batch_size.
+ """
+ # determine the order of the images
+ order = list(range(self.size()))
+ if self.group_method == 'random':
+ random.shuffle(order)
+ elif self.group_method == 'ratio':
+ order.sort(key=lambda x: self.image_aspect_ratio(x))
+
+ # divide into groups, one group = one batch
+ self.groups = [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)]
+
+ def compute_inputs(self, image_group):
+ """ Compute inputs for the network using an image_group.
+ """
+ # get the max image shape
+ max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3))
+
+ # construct an image batch object
+ image_batch = np.zeros((self.batch_size,) + max_shape, dtype=keras.backend.floatx())
+
+ # copy all images to the upper left part of the image batch object
+ for image_index, image in enumerate(image_group):
+ image_batch[image_index, :image.shape[0], :image.shape[1], :image.shape[2]] = image
+
+ return image_batch
+
+ def compute_targets(self, image_group, annotations_group):
+ """ Compute target outputs for the network using images and their annotations.
+ """
+ # get the max image shape
+ max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3))
+
+ # compute labels and regression targets
+ labels_group = [None] * self.batch_size
+ regression_group = [None] * self.batch_size
+ for index, (image, annotations) in enumerate(zip(image_group, annotations_group)):
+ # compute regression targets
+ labels_group[index], annotations, anchors = self.compute_anchor_targets(
+ max_shape,
+ annotations,
+ self.num_classes(),
+ mask_shape=image.shape,
+ )
+ regression_group[index] = bbox_transform(anchors, annotations)
+
+ # append anchor states to regression targets (necessary for filtering 'ignore', 'positive' and 'negative' anchors)
+ anchor_states = np.max(labels_group[index], axis=1, keepdims=True)
+ regression_group[index] = np.append(regression_group[index], anchor_states, axis=1)
+
+ labels_batch = np.zeros((self.batch_size,) + labels_group[0].shape, dtype=keras.backend.floatx())
+ regression_batch = np.zeros((self.batch_size,) + regression_group[0].shape, dtype=keras.backend.floatx())
+
+ # copy all labels and regression values to the batch blob
+ for index, (labels, regression) in enumerate(zip(labels_group, regression_group)):
+ labels_batch[index, ...] = labels
+ regression_batch[index, ...] = regression
+
+ return [regression_batch, labels_batch]
+
+ def compute_input_output(self, group):
+ """ Compute inputs and target outputs for the network.
+ """
+ # load images and annotations
+ image_group = self.load_image_group(group)
+ annotations_group = self.load_annotations_group(group)
+
+ # check validity of annotations
+ image_group, annotations_group = self.filter_annotations(image_group, annotations_group, group)
+
+ # perform preprocessing steps
+ image_group, annotations_group = self.preprocess_group(image_group, annotations_group)
+
+ # compute network inputs
+ inputs = self.compute_inputs(image_group)
+
+ # compute network targets
+ targets = self.compute_targets(image_group, annotations_group)
+
+ return inputs, targets
+
+ def __next__(self):
+ return self.next()
+
+ def next(self):
+ # advance the group index
+ with self.lock:
+ if self.group_index == 0 and self.shuffle_groups:
+ # shuffle groups at start of epoch
+ random.shuffle(self.groups)
+ group = self.groups[self.group_index]
+ self.group_index = (self.group_index + 1) % len(self.groups)
+
+ return self.compute_input_output(group)
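+
+# Subclassing sketch (illustrative, with made-up single-image data): a concrete generator
+# only implements the dataset-specific hooks below; grouping, augmentation, resizing and
+# anchor-target computation are inherited from Generator.
+#
+#   class SingleImageGenerator(Generator):
+#       def size(self):                            return 1
+#       def num_classes(self):                     return 1
+#       def name_to_label(self, name):             return 0
+#       def label_to_name(self, label):            return 'object'
+#       def image_aspect_ratio(self, image_index): return 1.0
+#       def load_image(self, image_index):         return np.zeros((800, 800, 3))
+#       def load_annotations(self, image_index):   return np.array([[10.0, 10.0, 100.0, 100.0, 0.0]])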
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/kitti.py b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/kitti.py
new file mode 100644
index 0000000..2db88f1
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/kitti.py
@@ -0,0 +1,151 @@
+"""
+Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import csv
+import os.path
+
+import numpy as np
+from PIL import Image
+
+from .generator import Generator
+from ..utils.image import read_image_bgr
+
+kitti_classes = {
+ 'Car': 0,
+ 'Van': 1,
+ 'Truck': 2,
+ 'Pedestrian': 3,
+ 'Person_sitting': 4,
+ 'Cyclist': 5,
+ 'Tram': 6,
+ 'Misc': 7,
+ 'DontCare': 7
+}
+
+
+class KittiGenerator(Generator):
+ """ Generate data for a KITTI dataset.
+
+ See http://www.cvlibs.net/datasets/kitti/ for more information.
+ """
+
+ def __init__(
+ self,
+ base_dir,
+ subset='train',
+ **kwargs
+ ):
+ """ Initialize a KITTI data generator.
+
+ Args
+            base_dir: Directory of the KITTI dataset, containing the '<subset>/labels' and '<subset>/images' folders.
+ subset: The subset to generate data for (defaults to 'train').
+ """
+ self.base_dir = base_dir
+
+ label_dir = os.path.join(self.base_dir, subset, 'labels')
+ image_dir = os.path.join(self.base_dir, subset, 'images')
+
+ """
+ 1 type Describes the type of object: 'Car', 'Van', 'Truck',
+ 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
+ 'Misc' or 'DontCare'
+ 1 truncated Float from 0 (non-truncated) to 1 (truncated), where
+ truncated refers to the object leaving image boundaries
+ 1 occluded Integer (0,1,2,3) indicating occlusion state:
+ 0 = fully visible, 1 = partly occluded
+ 2 = largely occluded, 3 = unknown
+ 1 alpha Observation angle of object, ranging [-pi..pi]
+ 4 bbox 2D bounding box of object in the image (0-based index):
+ contains left, top, right, bottom pixel coordinates
+ 3 dimensions 3D object dimensions: height, width, length (in meters)
+ 3 location 3D object location x,y,z in camera coordinates (in meters)
+ 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi]
+ """
+
+ self.id_to_labels = {}
+ for label, id in kitti_classes.items():
+ self.id_to_labels[id] = label
+
+ self.image_data = dict()
+ self.images = []
+ for i, fn in enumerate(os.listdir(label_dir)):
+ label_fp = os.path.join(label_dir, fn)
+ image_fp = os.path.join(image_dir, fn.replace('.txt', '.png'))
+
+ self.images.append(image_fp)
+
+ fieldnames = ['type', 'truncated', 'occluded', 'alpha', 'left', 'top', 'right', 'bottom', 'dh', 'dw', 'dl',
+ 'lx', 'ly', 'lz', 'ry']
+ with open(label_fp, 'r') as csv_file:
+ reader = csv.DictReader(csv_file, delimiter=' ', fieldnames=fieldnames)
+ boxes = []
+ for line, row in enumerate(reader):
+ label = row['type']
+ cls_id = kitti_classes[label]
+
+ annotation = {'cls_id': cls_id, 'x1': row['left'], 'x2': row['right'], 'y2': row['bottom'], 'y1': row['top']}
+ boxes.append(annotation)
+
+ self.image_data[i] = boxes
+
+ super(KittiGenerator, self).__init__(**kwargs)
+
+ def size(self):
+ """ Size of the dataset.
+ """
+ return len(self.images)
+
+ def num_classes(self):
+ """ Number of classes in the dataset.
+ """
+ return max(kitti_classes.values()) + 1
+
+ def name_to_label(self, name):
+ """ Map name to label.
+ """
+ raise NotImplementedError()
+
+ def label_to_name(self, label):
+ """ Map label to name.
+ """
+ return self.id_to_labels[label]
+
+ def image_aspect_ratio(self, image_index):
+ """ Compute the aspect ratio for an image with image_index.
+ """
+ # PIL is fast for metadata
+ image = Image.open(self.images[image_index])
+ return float(image.width) / float(image.height)
+
+ def load_image(self, image_index):
+ """ Load an image at the image_index.
+ """
+ return read_image_bgr(self.images[image_index])
+
+ def load_annotations(self, image_index):
+ """ Load annotations for an image_index.
+ """
+ annotations = self.image_data[image_index]
+
+ boxes = np.zeros((len(annotations), 5))
+ for idx, ann in enumerate(annotations):
+ boxes[idx, 0] = float(ann['x1'])
+ boxes[idx, 1] = float(ann['y1'])
+ boxes[idx, 2] = float(ann['x2'])
+ boxes[idx, 3] = float(ann['y2'])
+ boxes[idx, 4] = int(ann['cls_id'])
+ return boxes
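
For reference, this is roughly how one line of a KITTI label file (the field layout documented in the generator's docstring above) becomes an annotation dict; the sample line and the printed result are purely illustrative.

```python
import csv
import io

# Illustrative only; field layout follows the KITTI label format described above.
kitti_classes = {'Car': 0, 'Van': 1, 'Truck': 2, 'Pedestrian': 3,
                 'Person_sitting': 4, 'Cyclist': 5, 'Tram': 6, 'Misc': 7, 'DontCare': 7}
fieldnames = ['type', 'truncated', 'occluded', 'alpha', 'left', 'top', 'right', 'bottom',
              'dh', 'dw', 'dl', 'lx', 'ly', 'lz', 'ry']

sample_line = 'Pedestrian 0.00 0 -0.20 712.40 143.00 810.73 307.92 1.89 0.48 1.20 1.84 1.47 8.41 0.01'
row = next(csv.DictReader(io.StringIO(sample_line), delimiter=' ', fieldnames=fieldnames))

# Values stay strings at this point; load_annotations() casts them to float later.
annotation = {'cls_id': kitti_classes[row['type']],
              'x1': row['left'], 'x2': row['right'],
              'y1': row['top'], 'y2': row['bottom']}
print(annotation)  # {'cls_id': 3, 'x1': '712.40', 'x2': '810.73', 'y1': '143.00', 'y2': '307.92'}
```
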
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/open_images.py b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/open_images.py
new file mode 100644
index 0000000..8b3ef7f
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/open_images.py
@@ -0,0 +1,264 @@
+"""
+Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import csv
+import json
+import os
+import warnings
+
+import numpy as np
+from PIL import Image
+
+from .generator import Generator
+from ..utils.image import read_image_bgr
+
+
+def get_labels(metadata_dir, version='v4'):
+ if version == 'v4':
+ boxable_classes_descriptions = os.path.join(metadata_dir, 'class-descriptions-boxable.csv')
+ id_to_labels = {}
+ cls_index = {}
+
+ i = 0
+ with open(boxable_classes_descriptions) as f:
+ for row in csv.reader(f):
+ # make sure the csv row is not empty (usually the last one)
+ if len(row):
+ label = row[0]
+ description = row[1].replace("\"", "").replace("'", "").replace('`', '')
+
+ id_to_labels[i] = description
+ cls_index[label] = i
+
+ i += 1
+ else:
+ trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt')
+ description_path = os.path.join(metadata_dir, 'class-descriptions.csv')
+
+ description_table = {}
+ with open(description_path) as f:
+ for row in csv.reader(f):
+ # make sure the csv row is not empty (usually the last one)
+ if len(row):
+ description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '')
+
+        with open(trainable_classes_path, 'r') as f:
+            trainable_classes = f.read().splitlines()
+
+ id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)])
+ cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)])
+
+ return id_to_labels, cls_index
+
+
+def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'):
+ if version == 'v4':
+ annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset))
+ else:
+ annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv')
+
+ cnt = 0
+ with open(annotations_path, 'r') as csv_file:
+ reader = csv.DictReader(csv_file,
+ fieldnames=['ImageID', 'Source', 'LabelName',
+ 'Confidence', 'XMin', 'XMax', 'YMin',
+ 'YMax'])
+ next(reader)
+ for _ in reader:
+ cnt += 1
+
+ id_annotations = dict()
+ with open(annotations_path, 'r') as csv_file:
+ reader = csv.DictReader(csv_file,
+ fieldnames=['ImageID', 'Source', 'LabelName',
+ 'Confidence', 'XMin', 'XMax', 'YMin',
+ 'YMax'])
+ next(reader)
+
+ images_sizes = {}
+ for line, row in enumerate(reader):
+ frame = row['ImageID']
+ class_name = row['LabelName']
+
+ if class_name not in cls_index:
+ continue
+
+ cls_id = cls_index[class_name]
+
+ img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg')
+ if frame in images_sizes:
+ width, height = images_sizes[frame]
+ else:
+ try:
+ with Image.open(img_path) as img:
+ width, height = img.width, img.height
+ images_sizes[frame] = (width, height)
+ except Exception:
+ continue
+
+ x1 = float(row['XMin'])
+ x2 = float(row['XMax'])
+ y1 = float(row['YMin'])
+ y2 = float(row['YMax'])
+
+ x1_int = int(round(x1 * width))
+ x2_int = int(round(x2 * width))
+ y1_int = int(round(y1 * height))
+ y2_int = int(round(y2 * height))
+
+ # Check that the bounding box is valid.
+ if x2 <= x1:
+ raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
+ if y2 <= y1:
+ raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))
+
+ if y2_int == y1_int:
+ warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1))
+ continue
+
+ if x2_int == x1_int:
+ warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1))
+ continue
+
+ img_id = row['ImageID']
+ annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2}
+
+ if img_id in id_annotations:
+ annotations = id_annotations[img_id]
+ annotations['boxes'].append(annotation)
+ else:
+ id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]}
+ return id_annotations
+
+
+class OpenImagesGenerator(Generator):
+ def __init__(
+ self, main_dir, subset, version='v4',
+ labels_filter=None, annotation_cache_dir='.',
+ fixed_labels=False,
+ **kwargs
+ ):
+ if version == 'v4':
+ metadata = '2018_04'
+ elif version == 'v3':
+ metadata = '2017_11'
+ else:
+ raise NotImplementedError('There is currently no implementation for versions older than v3')
+
+ self.base_dir = os.path.join(main_dir, 'images', subset)
+ metadata_dir = os.path.join(main_dir, metadata)
+ annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json')
+
+ self.id_to_labels, cls_index = get_labels(metadata_dir, version=version)
+
+ if os.path.exists(annotation_cache_json):
+ with open(annotation_cache_json, 'r') as f:
+ self.annotations = json.loads(f.read())
+ else:
+ self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version=version)
+ json.dump(self.annotations, open(annotation_cache_json, "w"))
+
+ if labels_filter is not None:
+ self.id_to_labels, self.annotations = self.__filter_data(labels_filter, fixed_labels)
+
+ self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)])
+
+ super(OpenImagesGenerator, self).__init__(**kwargs)
+
+ def __filter_data(self, labels_filter, fixed_labels):
+ """
+        Restrict the dataset to a subset of the labels.
+        :param labels_filter: Labels to keep, e.g. labels_filter = ['Helmet', 'Hat', 'Analog television']
+        :param fixed_labels: If True, keep only the exact labels in labels_filter; if False, also keep
+                             related sub-labels such as 'bicycle helmet', 'welding helmet', 'ski helmet' etc.
+        :return: The filtered (id_to_labels, annotations) pair.
+ """
+
+ labels_to_id = dict([(l, i) for i, l in enumerate(labels_filter)])
+
+ sub_labels_to_id = {}
+ if fixed_labels:
+            # keep only the exact labels themselves; no sub-labels are added
+ sub_labels_to_id = labels_to_id
+ else:
+ for l in labels_filter:
+ label = str.lower(l)
+ for v in [v for v in self.id_to_labels.values() if label in str.lower(v)]:
+ sub_labels_to_id[v] = labels_to_id[l]
+
+ filtered_annotations = {}
+ for k in self.annotations:
+ img_ann = self.annotations[k]
+
+ filtered_boxes = []
+ for ann in img_ann['boxes']:
+ cls_id = ann['cls_id']
+ label = self.id_to_labels[cls_id]
+ if label in sub_labels_to_id:
+ ann['cls_id'] = sub_labels_to_id[label]
+ filtered_boxes.append(ann)
+
+ if len(filtered_boxes) > 0:
+ filtered_annotations[k] = {'w': img_ann['w'], 'h': img_ann['h'], 'boxes': filtered_boxes}
+
+ id_to_labels = dict([(labels_to_id[k], k) for k in labels_to_id])
+ return id_to_labels, filtered_annotations
+
+ def size(self):
+ return len(self.annotations)
+
+ def num_classes(self):
+ return len(self.id_to_labels)
+
+ def name_to_label(self, name):
+ raise NotImplementedError()
+
+ def label_to_name(self, label):
+ return self.id_to_labels[label]
+
+ def image_aspect_ratio(self, image_index):
+ img_annotations = self.annotations[self.id_to_image_id[image_index]]
+ height, width = img_annotations['h'], img_annotations['w']
+ return float(width) / float(height)
+
+ def image_path(self, image_index):
+ path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg')
+ return path
+
+ def load_image(self, image_index):
+ return read_image_bgr(self.image_path(image_index))
+
+ def load_annotations(self, image_index):
+ image_annotations = self.annotations[self.id_to_image_id[image_index]]
+
+ labels = image_annotations['boxes']
+ height, width = image_annotations['h'], image_annotations['w']
+
+ boxes = np.zeros((len(labels), 5))
+ for idx, ann in enumerate(labels):
+ cls_id = ann['cls_id']
+ x1 = ann['x1'] * width
+ x2 = ann['x2'] * width
+ y1 = ann['y1'] * height
+ y2 = ann['y2'] * height
+
+ boxes[idx, 0] = x1
+ boxes[idx, 1] = y1
+ boxes[idx, 2] = x2
+ boxes[idx, 3] = y2
+ boxes[idx, 4] = cls_id
+
+ return boxes
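
Open Images ships box coordinates normalised to [0, 1]; `load_annotations` above multiplies them by the cached image width/height to recover pixel coordinates. A small worked example with made-up numbers:

```python
import numpy as np

# Made-up annotation in the same dict layout the generator caches.
width, height = 1024, 768
ann = {'cls_id': 12, 'x1': 0.25, 'x2': 0.75, 'y1': 0.10, 'y2': 0.60}

box = np.array([ann['x1'] * width,    # 256.0
                ann['y1'] * height,   #  76.8
                ann['x2'] * width,    # 768.0
                ann['y2'] * height,   # 460.8
                ann['cls_id']])
print(box)  # roughly [256.   76.8  768.  460.8  12.]
```
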
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/pascal_voc.py b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/pascal_voc.py
new file mode 100644
index 0000000..d12d796
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/preprocessing/pascal_voc.py
@@ -0,0 +1,187 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ..preprocessing.generator import Generator
+from ..utils.image import read_image_bgr
+
+import os
+import numpy as np
+from six import raise_from
+from PIL import Image
+
+try:
+ import xml.etree.cElementTree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+voc_classes = {
+ 'aeroplane' : 0,
+ 'bicycle' : 1,
+ 'bird' : 2,
+ 'boat' : 3,
+ 'bottle' : 4,
+ 'bus' : 5,
+ 'car' : 6,
+ 'cat' : 7,
+ 'chair' : 8,
+ 'cow' : 9,
+ 'diningtable' : 10,
+ 'dog' : 11,
+ 'horse' : 12,
+ 'motorbike' : 13,
+ 'person' : 14,
+ 'pottedplant' : 15,
+ 'sheep' : 16,
+ 'sofa' : 17,
+ 'train' : 18,
+ 'tvmonitor' : 19
+}
+
+
+def _findNode(parent, name, debug_name=None, parse=None):
+ if debug_name is None:
+ debug_name = name
+
+ result = parent.find(name)
+ if result is None:
+ raise ValueError('missing element \'{}\''.format(debug_name))
+ if parse is not None:
+ try:
+ return parse(result.text)
+ except ValueError as e:
+ raise_from(ValueError('illegal value for \'{}\': {}'.format(debug_name, e)), None)
+ return result
+
+
+class PascalVocGenerator(Generator):
+ """ Generate data for a Pascal VOC dataset.
+
+ See http://host.robots.ox.ac.uk/pascal/VOC/ for more information.
+ """
+
+ def __init__(
+ self,
+ data_dir,
+ set_name,
+ classes=voc_classes,
+ image_extension='.jpg',
+ skip_truncated=False,
+ skip_difficult=False,
+ **kwargs
+ ):
+ """ Initialize a Pascal VOC data generator.
+
+ Args
+            data_dir: Root directory of the VOC dataset (containing 'ImageSets', 'JPEGImages' and 'Annotations').
+            set_name: Name of the image set to load, e.g. 'trainval' or 'test'.
+ """
+ self.data_dir = data_dir
+ self.set_name = set_name
+ self.classes = classes
+ self.image_names = [l.strip().split(None, 1)[0] for l in open(os.path.join(data_dir, 'ImageSets', 'Main', set_name + '.txt')).readlines()]
+ self.image_extension = image_extension
+ self.skip_truncated = skip_truncated
+ self.skip_difficult = skip_difficult
+
+ self.labels = {}
+ for key, value in self.classes.items():
+ self.labels[value] = key
+
+ super(PascalVocGenerator, self).__init__(**kwargs)
+
+ def size(self):
+ """ Size of the dataset.
+ """
+ return len(self.image_names)
+
+ def num_classes(self):
+ """ Number of classes in the dataset.
+ """
+ return len(self.classes)
+
+ def name_to_label(self, name):
+ """ Map name to label.
+ """
+ return self.classes[name]
+
+ def label_to_name(self, label):
+ """ Map label to name.
+ """
+ return self.labels[label]
+
+ def image_aspect_ratio(self, image_index):
+ """ Compute the aspect ratio for an image with image_index.
+ """
+ path = os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension)
+ image = Image.open(path)
+ return float(image.width) / float(image.height)
+
+ def load_image(self, image_index):
+ """ Load an image at the image_index.
+ """
+ path = os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension)
+ return read_image_bgr(path)
+
+ def __parse_annotation(self, element):
+ """ Parse an annotation given an XML element.
+ """
+ truncated = _findNode(element, 'truncated', parse=int)
+ difficult = _findNode(element, 'difficult', parse=int)
+
+ class_name = _findNode(element, 'name').text
+ if class_name not in self.classes:
+ raise ValueError('class name \'{}\' not found in classes: {}'.format(class_name, list(self.classes.keys())))
+
+ box = np.zeros((1, 5))
+ box[0, 4] = self.name_to_label(class_name)
+
+ bndbox = _findNode(element, 'bndbox')
+ box[0, 0] = _findNode(bndbox, 'xmin', 'bndbox.xmin', parse=float) - 1
+ box[0, 1] = _findNode(bndbox, 'ymin', 'bndbox.ymin', parse=float) - 1
+ box[0, 2] = _findNode(bndbox, 'xmax', 'bndbox.xmax', parse=float) - 1
+ box[0, 3] = _findNode(bndbox, 'ymax', 'bndbox.ymax', parse=float) - 1
+
+ return truncated, difficult, box
+
+ def __parse_annotations(self, xml_root):
+ """ Parse all annotations under the xml_root.
+ """
+ boxes = np.zeros((0, 5))
+ for i, element in enumerate(xml_root.iter('object')):
+ try:
+ truncated, difficult, box = self.__parse_annotation(element)
+ except ValueError as e:
+ raise_from(ValueError('could not parse object #{}: {}'.format(i, e)), None)
+
+ if truncated and self.skip_truncated:
+ continue
+ if difficult and self.skip_difficult:
+ continue
+ boxes = np.append(boxes, box, axis=0)
+
+ return boxes
+
+ def load_annotations(self, image_index):
+ """ Load annotations for an image_index.
+ """
+ filename = self.image_names[image_index] + '.xml'
+ try:
+ tree = ET.parse(os.path.join(self.data_dir, 'Annotations', filename))
+ return self.__parse_annotations(tree.getroot())
+ except ET.ParseError as e:
+ raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None)
+ except ValueError as e:
+ raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None)
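
The `__parse_annotation` helper above subtracts 1 from every coordinate because Pascal VOC annotations use 1-based pixel indices. A minimal sketch of the same parsing applied to a hand-written `<object>` element (class id 11 is 'dog' in `voc_classes`):

```python
import numpy as np
import xml.etree.ElementTree as ET

xml_snippet = """
<object>
  <name>dog</name>
  <truncated>0</truncated>
  <difficult>0</difficult>
  <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
</object>
"""
element = ET.fromstring(xml_snippet)
bndbox = element.find('bndbox')

box = np.zeros((1, 5))
box[0, 0] = float(bndbox.find('xmin').text) - 1   # shift VOC's 1-based indices to 0-based
box[0, 1] = float(bndbox.find('ymin').text) - 1
box[0, 2] = float(bndbox.find('xmax').text) - 1
box[0, 3] = float(bndbox.find('ymax').text) - 1
box[0, 4] = 11                                    # voc_classes['dog']
print(box)  # [[ 47. 239. 194. 370.  11.]]
```
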
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/__init__.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/anchors.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/anchors.py
new file mode 100644
index 0000000..abb5536
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/anchors.py
@@ -0,0 +1,300 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+
+
+def anchor_targets_bbox(
+ image_shape,
+ annotations,
+ num_classes,
+ mask_shape=None,
+ negative_overlap=0.4,
+ positive_overlap=0.5,
+ **kwargs
+):
+ """ Generate anchor targets for bbox detection.
+
+ Args
+ image_shape: Shape of the image.
+ annotations: np.array of shape (N, 5) for (x1, y1, x2, y2, label).
+ num_classes: Number of classes to predict.
+ mask_shape: If the image is padded with zeros, mask_shape can be used to mark the relevant part of the image.
+ negative_overlap: IoU overlap for negative anchors (all anchors with overlap < negative_overlap are negative).
+        positive_overlap: IoU overlap for positive anchors (all anchors with overlap >= positive_overlap are positive).
+
+ Returns
+        labels: np.array of shape (A, num_classes) where each column consists of -1 for ignore, 0 for negative and 1 for positive for a certain class.
+ annotations: np.array of shape (A, 5) for (x1, y1, x2, y2, label) containing the annotations corresponding to each anchor or 0 if there is no corresponding anchor.
+ anchors: np.array of shape (A, 4) for (x1, y1, x2, y2) containing the anchor boxes.
+ """
+ anchors = anchors_for_shape(image_shape, **kwargs)
+
+    # label: 1 is positive, 0 is negative, -1 is don't care
+ labels = np.ones((anchors.shape[0], num_classes)) * -1
+
+ if annotations.shape[0]:
+ # obtain indices of gt annotations with the greatest overlap
+ overlaps = compute_overlap(anchors, annotations)
+ argmax_overlaps_inds = np.argmax(overlaps, axis=1)
+ max_overlaps = overlaps[np.arange(overlaps.shape[0]), argmax_overlaps_inds]
+
+ # assign bg labels first so that positive labels can clobber them
+ labels[max_overlaps < negative_overlap, :] = 0
+
+ # compute box regression targets
+ annotations = annotations[argmax_overlaps_inds]
+
+ # fg label: above threshold IOU
+ positive_indices = max_overlaps >= positive_overlap
+ labels[positive_indices, :] = 0
+ labels[positive_indices, annotations[positive_indices, 4].astype(int)] = 1
+ else:
+ # no annotations? then everything is background
+ labels[:] = 0
+ annotations = np.zeros((anchors.shape[0], annotations.shape[1]))
+
+ # ignore annotations outside of image
+ mask_shape = image_shape if mask_shape is None else mask_shape
+ anchors_centers = np.vstack([(anchors[:, 0] + anchors[:, 2]) / 2, (anchors[:, 1] + anchors[:, 3]) / 2]).T
+ indices = np.logical_or(anchors_centers[:, 0] >= mask_shape[1], anchors_centers[:, 1] >= mask_shape[0])
+ labels[indices, :] = -1
+
+ return labels, annotations, anchors
+
+
+def layer_shapes(image_shape, model):
+ """Compute layer shapes given input image shape and the model.
+
+ Args
+ image_shape: The shape of the image.
+ model: The model to use for computing how the image shape is transformed in the pyramid.
+
+ Returns
+ A dictionary mapping layer names to image shapes.
+ """
+ shape = {
+ model.layers[0].name: (None,) + image_shape,
+ }
+
+ for layer in model.layers[1:]:
+ nodes = layer._inbound_nodes
+ for node in nodes:
+ inputs = [shape[lr.name] for lr in node.inbound_layers]
+ if not inputs:
+ continue
+ shape[layer.name] = layer.compute_output_shape(inputs[0] if len(inputs) == 1 else inputs)
+
+ return shape
+
+
+def make_shapes_callback(model):
+ """ Make a function for getting the shape of the pyramid levels.
+ """
+ def get_shapes(image_shape, pyramid_levels):
+ shape = layer_shapes(image_shape, model)
+ image_shapes = [shape["P{}".format(level)][1:3] for level in pyramid_levels]
+ return image_shapes
+
+ return get_shapes
+
+
+def guess_shapes(image_shape, pyramid_levels):
+ """Guess shapes based on pyramid levels.
+
+ Args
+ image_shape: The shape of the image.
+ pyramid_levels: A list of what pyramid levels are used.
+
+ Returns
+ A list of image shapes at each pyramid level.
+ """
+ image_shape = np.array(image_shape[:2])
+ image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels]
+ return image_shapes
+
+
+def anchors_for_shape(
+ image_shape,
+ pyramid_levels=None,
+ ratios=None,
+ scales=None,
+ strides=None,
+ sizes=None,
+ shapes_callback=None,
+):
+ """ Generators anchors for a given shape.
+
+ Args
+ image_shape: The shape of the image.
+ pyramid_levels: List of ints representing which pyramids to use (defaults to [3, 4, 5, 6, 7]).
+ ratios: List of ratios with which anchors are generated (defaults to [0.5, 1, 2]).
+ scales: List of scales with which anchors are generated (defaults to [2^0, 2^(1/3), 2^(2/3)]).
+ strides: Stride per pyramid level, defines how the pyramids are constructed.
+ sizes: Sizes of the anchors per pyramid level.
+ shapes_callback: Function to call for getting the shape of the image at different pyramid levels.
+
+ Returns
+ np.array of shape (N, 4) containing the (x1, y1, x2, y2) coordinates for the anchors.
+ """
+ if pyramid_levels is None:
+ pyramid_levels = [3, 4, 5, 6, 7]
+ if strides is None:
+ strides = [2 ** x for x in pyramid_levels]
+ if sizes is None:
+ sizes = [2 ** (x + 2) for x in pyramid_levels]
+ if ratios is None:
+ ratios = np.array([0.5, 1, 2])
+ if scales is None:
+ scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
+
+ if shapes_callback is None:
+ shapes_callback = guess_shapes
+ image_shapes = shapes_callback(image_shape, pyramid_levels)
+
+ # compute anchors over all pyramid levels
+ all_anchors = np.zeros((0, 4))
+ for idx, p in enumerate(pyramid_levels):
+ anchors = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales)
+ shifted_anchors = shift(image_shapes[idx], strides[idx], anchors)
+ all_anchors = np.append(all_anchors, shifted_anchors, axis=0)
+
+ return all_anchors
+
+
+def shift(shape, stride, anchors):
+ """ Produce shifted anchors based on shape of the map and stride size.
+
+ Args
+ shape : Shape to shift the anchors over.
+ stride : Stride to shift the anchors with over the shape.
+ anchors: The anchors to apply at each location.
+ """
+ shift_x = (np.arange(0, shape[1]) + 0.5) * stride
+ shift_y = (np.arange(0, shape[0]) + 0.5) * stride
+
+ shift_x, shift_y = np.meshgrid(shift_x, shift_y)
+
+ shifts = np.vstack((
+ shift_x.ravel(), shift_y.ravel(),
+ shift_x.ravel(), shift_y.ravel()
+ )).transpose()
+
+ # add A anchors (1, A, 4) to
+ # cell K shifts (K, 1, 4) to get
+ # shift anchors (K, A, 4)
+ # reshape to (K*A, 4) shifted anchors
+ A = anchors.shape[0]
+ K = shifts.shape[0]
+ all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
+ all_anchors = all_anchors.reshape((K * A, 4))
+
+ return all_anchors
+
+
+def generate_anchors(base_size=16, ratios=None, scales=None):
+ """
+ Generate anchor (reference) windows by enumerating aspect ratios X
+ scales w.r.t. a reference window.
+ """
+
+ if ratios is None:
+ ratios = np.array([0.5, 1, 2])
+
+ if scales is None:
+ scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
+
+ num_anchors = len(ratios) * len(scales)
+
+ # initialize output anchors
+ anchors = np.zeros((num_anchors, 4))
+
+ # scale base_size
+ anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T
+
+ # compute areas of anchors
+ areas = anchors[:, 2] * anchors[:, 3]
+
+ # correct for ratios
+ anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales)))
+ anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales))
+
+ # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2)
+ anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
+ anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
+
+ return anchors
+
+
+def bbox_transform(anchors, gt_boxes, mean=None, std=None):
+ """Compute bounding-box regression targets for an image."""
+
+ if mean is None:
+ mean = np.array([0, 0, 0, 0])
+ if std is None:
+ std = np.array([0.2, 0.2, 0.2, 0.2])
+
+ if isinstance(mean, (list, tuple)):
+ mean = np.array(mean)
+ elif not isinstance(mean, np.ndarray):
+ raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean)))
+
+ if isinstance(std, (list, tuple)):
+ std = np.array(std)
+ elif not isinstance(std, np.ndarray):
+ raise ValueError('Expected std to be a np.ndarray, list or tuple. Received: {}'.format(type(std)))
+
+ anchor_widths = anchors[:, 2] - anchors[:, 0]
+ anchor_heights = anchors[:, 3] - anchors[:, 1]
+
+ targets_dx1 = (gt_boxes[:, 0] - anchors[:, 0]) / anchor_widths
+ targets_dy1 = (gt_boxes[:, 1] - anchors[:, 1]) / anchor_heights
+ targets_dx2 = (gt_boxes[:, 2] - anchors[:, 2]) / anchor_widths
+ targets_dy2 = (gt_boxes[:, 3] - anchors[:, 3]) / anchor_heights
+
+ targets = np.stack((targets_dx1, targets_dy1, targets_dx2, targets_dy2))
+ targets = targets.T
+
+ targets = (targets - mean) / std
+
+ return targets
+
+
+def compute_overlap(a, b):
+ """
+ Args
+
+ a: (N, 4) ndarray of float
+ b: (K, 4) ndarray of float
+
+ Returns
+        overlaps: (N, K) ndarray of IoU overlap between the boxes in a and the boxes in b
+ """
+ area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
+
+ iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])
+ ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])
+
+ iw = np.maximum(iw, 0)
+ ih = np.maximum(ih, 0)
+
+ ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih
+
+ ua = np.maximum(ua, np.finfo(float).eps)
+
+ intersection = iw * ih
+
+ return intersection / ua
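
To get a feel for the anchor machinery above: with the default 3 ratios and 3 scales there are 9 reference anchors per feature-map location, and the default pyramid levels P3-P7 determine how many locations exist. A small sketch, assuming the module is importable under the path introduced by this diff:

```python
import numpy as np
from engine.object_detection_branch.retina_net.keras_retinanet.utils.anchors import (
    anchors_for_shape, compute_overlap, generate_anchors)

# 3 ratios x 3 scales = 9 reference (x1, y1, x2, y2) anchors per location.
print(generate_anchors(base_size=32).shape)    # (9, 4)

# For a 512x512 input and levels P3-P7: 9 * (64^2 + 32^2 + 16^2 + 8^2 + 4^2) = 49104 anchors.
print(anchors_for_shape((512, 512, 3)).shape)  # (49104, 4)

# Pairwise IoU between one box and two ground-truth boxes.
a = np.array([[0.0, 0.0, 10.0, 10.0]])
b = np.array([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 15.0, 15.0]])
print(compute_overlap(a, b))                   # [[1.         0.14285714]]
```
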
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/coco_eval.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/coco_eval.py
new file mode 100644
index 0000000..298f277
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/coco_eval.py
@@ -0,0 +1,91 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import print_function
+
+from pycocotools.cocoeval import COCOeval
+
+import numpy as np
+import json
+
+
+def evaluate_coco(generator, model, threshold=0.05):
+ """ Use the pycocotools to evaluate a COCO model on a dataset.
+
+ Args
+ generator : The generator for generating the evaluation data.
+ model : The model to evaluate.
+ threshold : The score threshold to use.
+ """
+ # start collecting results
+ results = []
+ image_ids = []
+ for index in range(generator.size()):
+ image = generator.load_image(index)
+ image = generator.preprocess_image(image)
+ image, scale = generator.resize_image(image)
+
+ # run network
+ boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))
+
+ # correct boxes for image scale
+ boxes /= scale
+
+ # change to (x, y, w, h) (MS COCO standard)
+ boxes[:, :, 2] -= boxes[:, :, 0]
+ boxes[:, :, 3] -= boxes[:, :, 1]
+
+ # compute predicted labels and scores
+ for box, score, label in zip(boxes[0], scores[0], labels[0]):
+ # scores are sorted, so we can break
+ if score < threshold:
+ break
+
+ # append detection for each positively labeled class
+ image_result = {
+ 'image_id' : generator.image_ids[index],
+ 'category_id' : generator.label_to_coco_label(label),
+ 'score' : float(score),
+ 'bbox' : box.tolist(),
+ }
+
+ # append detection to results
+ results.append(image_result)
+
+ # append image to list of processed images
+ image_ids.append(generator.image_ids[index])
+
+ # print progress
+ print('{}/{}'.format(index + 1, generator.size()), end='\r')
+
+ if not len(results):
+ return
+
+ # write output
+ json.dump(results, open('{}_bbox_results.json'.format(generator.set_name), 'w'), indent=4)
+ json.dump(image_ids, open('{}_processed_image_ids.json'.format(generator.set_name), 'w'), indent=4)
+
+ # load results in COCO evaluation tool
+ coco_true = generator.coco
+ coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(generator.set_name))
+
+ # run COCO evaluation
+ coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
+ coco_eval.params.imgIds = image_ids
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
+ return coco_eval.stats
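
One detail worth noting in `evaluate_coco` above: the network outputs corner boxes (x1, y1, x2, y2), while the COCO results format expects (x, y, width, height), hence the in-place subtraction. With made-up numbers:

```python
import numpy as np

boxes = np.array([[[50.0, 40.0, 150.0, 120.0],
                   [10.0, 10.0,  30.0,  80.0]]])  # shape (1, N, 4): (x1, y1, x2, y2)
boxes[:, :, 2] -= boxes[:, :, 0]                  # width  = x2 - x1
boxes[:, :, 3] -= boxes[:, :, 1]                  # height = y2 - y1
print(boxes[0])  # [[ 50.  40. 100.  80.]
                 #  [ 10.  10.  20.  70.]]
```
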
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/colors.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/colors.py
new file mode 100644
index 0000000..4f40f70
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/colors.py
@@ -0,0 +1,118 @@
+import warnings
+
+
+def label_color(label):
+ """ Return a color from a set of predefined colors. Contains 80 colors in total.
+
+ Args
+ label: The label to get the color for.
+
+ Returns
+ A list of three values representing a RGB color.
+
+    If no color is defined for a certain label, the color green is returned and a warning is raised.
+ """
+ if label < len(colors):
+
+ if label == 0:
+ return (214, 86, 100)
+ else:
+ return colors[label]
+ else:
+ warnings.warn('Label {} has no color, returning default.'.format(label))
+ return (0, 255, 0)
+
+
+"""
+Generated using:
+
+```
+colors = [list((matplotlib.colors.hsv_to_rgb([x, 1.0, 1.0]) * 255).astype(int)) for x in np.arange(0, 1, 1.0 / 80)]
+shuffle(colors)
+pprint(colors)
+```
+"""
+colors = [
+ [31 , 0 , 255] ,
+ [0 , 159 , 255] ,
+ [255 , 95 , 0] ,
+ [255 , 19 , 0] ,
+ [255 , 0 , 0] ,
+ [255 , 38 , 0] ,
+ [0 , 255 , 25] ,
+ [255 , 0 , 133] ,
+ [255 , 172 , 0] ,
+ [108 , 0 , 255] ,
+ [0 , 82 , 255] ,
+ [0 , 255 , 6] ,
+ [255 , 0 , 152] ,
+ [223 , 0 , 255] ,
+ [12 , 0 , 255] ,
+ [0 , 255 , 178] ,
+ [108 , 255 , 0] ,
+ [184 , 0 , 255] ,
+ [255 , 0 , 76] ,
+ [146 , 255 , 0] ,
+ [51 , 0 , 255] ,
+ [0 , 197 , 255] ,
+ [255 , 248 , 0] ,
+ [255 , 0 , 19] ,
+ [255 , 0 , 38] ,
+ [89 , 255 , 0] ,
+ [127 , 255 , 0] ,
+ [255 , 153 , 0] ,
+ [0 , 255 , 255] ,
+ [0 , 255 , 216] ,
+ [0 , 255 , 121] ,
+ [255 , 0 , 248] ,
+ [70 , 0 , 255] ,
+ [0 , 255 , 159] ,
+ [0 , 216 , 255] ,
+ [0 , 6 , 255] ,
+ [0 , 63 , 255] ,
+ [31 , 255 , 0] ,
+ [255 , 57 , 0] ,
+ [255 , 0 , 210] ,
+ [0 , 255 , 102] ,
+ [242 , 255 , 0] ,
+ [255 , 191 , 0] ,
+ [0 , 255 , 63] ,
+ [255 , 0 , 95] ,
+ [146 , 0 , 255] ,
+ [184 , 255 , 0] ,
+ [255 , 114 , 0] ,
+ [0 , 255 , 235] ,
+ [255 , 229 , 0] ,
+ [0 , 178 , 255] ,
+ [255 , 0 , 114] ,
+ [255 , 0 , 57] ,
+ [0 , 140 , 255] ,
+ [0 , 121 , 255] ,
+ [12 , 255 , 0] ,
+ [255 , 210 , 0] ,
+ [0 , 255 , 44] ,
+ [165 , 255 , 0] ,
+ [0 , 25 , 255] ,
+ [0 , 255 , 140] ,
+ [0 , 101 , 255] ,
+ [0 , 255 , 82] ,
+ [223 , 255 , 0] ,
+ [242 , 0 , 255] ,
+ [89 , 0 , 255] ,
+ [165 , 0 , 255] ,
+ [70 , 255 , 0] ,
+ [255 , 0 , 172] ,
+ [255 , 76 , 0] ,
+ [203 , 255 , 0] ,
+ [204 , 0 , 255] ,
+ [255 , 0 , 229] ,
+ [255 , 133 , 0] ,
+ [127 , 0 , 255] ,
+ [0 , 235 , 255] ,
+ [0 , 255 , 197] ,
+ [255 , 0 , 191] ,
+ [0 , 44 , 255] ,
+ [50 , 255 , 0]
+]
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/eval.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/eval.py
new file mode 100644
index 0000000..845a04b
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/eval.py
@@ -0,0 +1,226 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import print_function
+
+from .anchors import compute_overlap
+from .visualization import draw_detections, draw_annotations
+
+import numpy as np
+import os
+
+import cv2
+
+
+def _compute_ap(recall, precision):
+ """ Compute the average precision, given the recall and precision curves.
+
+ Code originally from https://github.com/rbgirshick/py-faster-rcnn.
+
+ # Arguments
+ recall: The recall curve (list).
+ precision: The precision curve (list).
+ # Returns
+ The average precision as computed in py-faster-rcnn.
+ """
+ # correct AP calculation
+ # first append sentinel values at the end
+ mrec = np.concatenate(([0.], recall, [1.]))
+ mpre = np.concatenate(([0.], precision, [0.]))
+
+ # compute the precision envelope
+ for i in range(mpre.size - 1, 0, -1):
+ mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+ # to calculate area under PR curve, look for points
+ # where X axis (recall) changes value
+ i = np.where(mrec[1:] != mrec[:-1])[0]
+
+ # and sum (\Delta recall) * prec
+ ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+ return ap
+
+
+def _get_detections(generator, model, score_threshold=0.05, max_detections=100, save_path=None):
+ """ Get the detections from the model using the generator.
+
+ The result is a list of lists such that the size is:
+ all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes]
+
+ # Arguments
+ generator : The generator used to run images through the model.
+ model : The model to run on the images.
+ score_threshold : The score confidence threshold to use.
+ max_detections : The maximum number of detections to use per image.
+ save_path : The path to save the images with visualized detections to.
+ # Returns
+ A list of lists containing the detections for each image in the generator.
+ """
+ all_detections = [[None for i in range(generator.num_classes())] for j in range(generator.size())]
+
+ for i in range(generator.size()):
+ raw_image = generator.load_image(i)
+ image = generator.preprocess_image(raw_image.copy())
+ image, scale = generator.resize_image(image)
+
+ # run network
+ boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))
+
+ # correct boxes for image scale
+ boxes /= scale
+
+ # select indices which have a score above the threshold
+ indices = np.where(scores[0, :] > score_threshold)[0]
+
+ # select those scores
+ scores = scores[0][indices]
+
+ # find the order with which to sort the scores
+ scores_sort = np.argsort(-scores)[:max_detections]
+
+ # select detections
+ image_boxes = boxes[0, indices[scores_sort], :]
+ image_scores = scores[scores_sort]
+ image_labels = labels[0, indices[scores_sort]]
+ image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)
+
+ if save_path is not None:
+ draw_annotations(raw_image, generator.load_annotations(i), label_to_name=generator.label_to_name)
+ draw_detections(raw_image, image_boxes, image_scores, image_labels, label_to_name=generator.label_to_name)
+
+ cv2.imwrite(os.path.join(save_path, '{}.png'.format(i)), raw_image)
+
+ # copy detections to all_detections
+ for label in range(generator.num_classes()):
+ all_detections[i][label] = image_detections[image_detections[:, -1] == label, :-1]
+
+ print('{}/{}'.format(i + 1, generator.size()), end='\r')
+
+ return all_detections
+
+
+def _get_annotations(generator):
+ """ Get the ground truth annotations from the generator.
+
+ The result is a list of lists such that the size is:
+ all_detections[num_images][num_classes] = annotations[num_detections, 5]
+
+ # Arguments
+ generator : The generator used to retrieve ground truth annotations.
+ # Returns
+ A list of lists containing the annotations for each image in the generator.
+ """
+ all_annotations = [[None for i in range(generator.num_classes())] for j in range(generator.size())]
+
+ for i in range(generator.size()):
+ # load the annotations
+ annotations = generator.load_annotations(i)
+
+ # copy detections to all_annotations
+ for label in range(generator.num_classes()):
+ all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()
+
+ print('{}/{}'.format(i + 1, generator.size()), end='\r')
+
+ return all_annotations
+
+
+def evaluate(
+ generator,
+ model,
+ iou_threshold=0.5,
+ score_threshold=0.05,
+ max_detections=100,
+ save_path=None
+):
+ """ Evaluate a given dataset using a given model.
+
+ # Arguments
+ generator : The generator that represents the dataset to evaluate.
+ model : The model to evaluate.
+ iou_threshold : The threshold used to consider when a detection is positive or negative.
+ score_threshold : The score confidence threshold to use for detections.
+ max_detections : The maximum number of detections to use per image.
+ save_path : The path to save images with visualized detections to.
+ # Returns
+ A dict mapping class names to mAP scores.
+ """
+ # gather all detections and annotations
+ all_detections = _get_detections(generator, model, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path)
+ all_annotations = _get_annotations(generator)
+ average_precisions = {}
+
+ # all_detections = pickle.load(open('all_detections.pkl', 'rb'))
+ # all_annotations = pickle.load(open('all_annotations.pkl', 'rb'))
+ # pickle.dump(all_detections, open('all_detections.pkl', 'wb'))
+ # pickle.dump(all_annotations, open('all_annotations.pkl', 'wb'))
+
+ # process detections and annotations
+ for label in range(generator.num_classes()):
+ false_positives = np.zeros((0,))
+ true_positives = np.zeros((0,))
+ scores = np.zeros((0,))
+ num_annotations = 0.0
+
+ for i in range(generator.size()):
+ detections = all_detections[i][label]
+ annotations = all_annotations[i][label]
+ num_annotations += annotations.shape[0]
+ detected_annotations = []
+
+ for d in detections:
+ scores = np.append(scores, d[4])
+
+ if annotations.shape[0] == 0:
+ false_positives = np.append(false_positives, 1)
+ true_positives = np.append(true_positives, 0)
+ continue
+
+ overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
+ assigned_annotation = np.argmax(overlaps, axis=1)
+ max_overlap = overlaps[0, assigned_annotation]
+
+ if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
+ false_positives = np.append(false_positives, 0)
+ true_positives = np.append(true_positives, 1)
+ detected_annotations.append(assigned_annotation)
+ else:
+ false_positives = np.append(false_positives, 1)
+ true_positives = np.append(true_positives, 0)
+
+ # no annotations -> AP for this class is 0 (is this correct?)
+ if num_annotations == 0:
+ average_precisions[label] = 0
+ continue
+
+ # sort by score
+ indices = np.argsort(-scores)
+ false_positives = false_positives[indices]
+ true_positives = true_positives[indices]
+
+ # compute false positives and true positives
+ false_positives = np.cumsum(false_positives)
+ true_positives = np.cumsum(true_positives)
+
+ # compute recall and precision
+ recall = true_positives / num_annotations
+ precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)
+
+ # compute average precision
+ average_precision = _compute_ap(recall, precision)
+ average_precisions[label] = average_precision
+
+ return average_precisions
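
The mAP computation above reduces to `_compute_ap` applied to cumulative precision/recall curves. A tiny worked example for one class, with three detections already sorted by score (TP, FP, TP) and two ground-truth boxes, mirroring the interpolation step above:

```python
import numpy as np

true_positives  = np.cumsum([1, 0, 1])   # [1, 1, 2]
false_positives = np.cumsum([0, 1, 0])   # [0, 1, 1]
num_annotations = 2.0

recall    = true_positives / num_annotations                        # [0.5, 0.5, 1.0]
precision = true_positives / np.maximum(true_positives + false_positives,
                                        np.finfo(np.float64).eps)   # [1.0, 0.5, 0.667]

# Same interpolated-AP computation as _compute_ap above.
mrec = np.concatenate(([0.0], recall, [1.0]))
mpre = np.concatenate(([0.0], precision, [0.0]))
for i in range(mpre.size - 1, 0, -1):
    mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
changes = np.where(mrec[1:] != mrec[:-1])[0]
ap = np.sum((mrec[changes + 1] - mrec[changes]) * mpre[changes + 1])
print(ap)  # 0.8333...
```
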
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/image.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/image.py
new file mode 100644
index 0000000..1e8dd3d
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/image.py
@@ -0,0 +1,200 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import division
+import keras
+import numpy as np
+import cv2
+from PIL import Image
+
+from .transform import change_transform_origin
+
+
+def read_image_bgr(path):
+ """ Read an image in BGR format.
+
+ Args
+ path: Path to the image.
+ """
+ image = np.asarray(Image.open(path).convert('RGB'))
+ return image[:, :, ::-1].copy()
+
+
+def preprocess_image(x):
+ """ Preprocess an image by subtracting the ImageNet mean.
+
+ Args
+ x: np.array of shape (None, None, 3) or (3, None, None).
+
+ Returns
+ The input with the ImageNet mean subtracted.
+ """
+ # mostly identical to "https://github.com/fchollet/keras/blob/master/keras/applications/imagenet_utils.py"
+ # except for converting RGB -> BGR since we assume BGR already
+ x = x.astype(keras.backend.floatx())
+ if keras.backend.image_data_format() == 'channels_first':
+ if x.ndim == 3:
+ x[0, :, :] -= 103.939
+ x[1, :, :] -= 116.779
+ x[2, :, :] -= 123.68
+ else:
+ x[:, 0, :, :] -= 103.939
+ x[:, 1, :, :] -= 116.779
+ x[:, 2, :, :] -= 123.68
+ else:
+ x[..., 0] -= 103.939
+ x[..., 1] -= 116.779
+ x[..., 2] -= 123.68
+
+ return x
+
+
+def adjust_transform_for_image(transform, image, relative_translation):
+ """ Adjust a transformation for a specific image.
+
+ The translation of the matrix will be scaled with the size of the image.
+    The linear part of the transformation will be adjusted so that the origin of the transformation is at the center of the image.
+ """
+ height, width, channels = image.shape
+
+ result = transform
+
+ # Scale the translation with the image size if specified.
+ if relative_translation:
+ result[0:2, 2] *= [width, height]
+
+ # Move the origin of transformation.
+ result = change_transform_origin(transform, (0.5 * width, 0.5 * height))
+
+ return result
+
+
+class TransformParameters:
+ """ Struct holding parameters determining how to apply a transformation to an image.
+
+ Args
+ fill_mode: One of: 'constant', 'nearest', 'reflect', 'wrap'
+ interpolation: One of: 'nearest', 'linear', 'cubic', 'area', 'lanczos4'
+ cval: Fill value to use with fill_mode='constant'
+ data_format: Same as for keras.preprocessing.image.apply_transform
+ relative_translation: If true (the default), interpret translation as a factor of the image size.
+ If false, interpret it as absolute pixels.
+ """
+ def __init__(
+ self,
+ fill_mode = 'nearest',
+ interpolation = 'linear',
+ cval = 0,
+ data_format = None,
+ relative_translation = True,
+ ):
+ self.fill_mode = fill_mode
+ self.cval = cval
+ self.interpolation = interpolation
+ self.relative_translation = relative_translation
+
+ if data_format is None:
+ data_format = keras.backend.image_data_format()
+ self.data_format = data_format
+
+ if data_format == 'channels_first':
+ self.channel_axis = 0
+ elif data_format == 'channels_last':
+ self.channel_axis = 2
+ else:
+ raise ValueError("invalid data_format, expected 'channels_first' or 'channels_last', got '{}'".format(data_format))
+
+ def cvBorderMode(self):
+ if self.fill_mode == 'constant':
+ return cv2.BORDER_CONSTANT
+ if self.fill_mode == 'nearest':
+ return cv2.BORDER_REPLICATE
+ if self.fill_mode == 'reflect':
+ return cv2.BORDER_REFLECT_101
+ if self.fill_mode == 'wrap':
+ return cv2.BORDER_WRAP
+
+ def cvInterpolation(self):
+ if self.interpolation == 'nearest':
+ return cv2.INTER_NEAREST
+ if self.interpolation == 'linear':
+ return cv2.INTER_LINEAR
+ if self.interpolation == 'cubic':
+ return cv2.INTER_CUBIC
+ if self.interpolation == 'area':
+ return cv2.INTER_AREA
+ if self.interpolation == 'lanczos4':
+ return cv2.INTER_LANCZOS4
+
+
+def apply_transform(matrix, image, params):
+ """
+ Apply a transformation to an image.
+
+ The origin of transformation is at the top left corner of the image.
+
+ The matrix is interpreted such that a point (x, y) on the original image is moved to transform * (x, y) in the generated image.
+ Mathematically speaking, that means that the matrix is a transformation from the transformed image space to the original image space.
+
+ Args
+        matrix: A homogeneous 3 by 3 matrix representing the transformation to apply.
+ image: The image to transform.
+ params: The transform parameters (see TransformParameters)
+ """
+ if params.channel_axis != 2:
+ image = np.moveaxis(image, params.channel_axis, 2)
+
+ output = cv2.warpAffine(
+ image,
+ matrix[:2, :],
+ dsize = (image.shape[1], image.shape[0]),
+ flags = params.cvInterpolation(),
+ borderMode = params.cvBorderMode(),
+ borderValue = params.cval,
+ )
+
+ if params.channel_axis != 2:
+ output = np.moveaxis(output, 2, params.channel_axis)
+ return output
+
+
+def resize_image(img, min_side=800, max_side=1333):
+ """ Resize an image such that the size is constrained to min_side and max_side.
+
+ Args
+ min_side: The image's min side will be equal to min_side after resizing.
+ max_side: If after resizing the image's max side is above max_side, resize until the max side is equal to max_side.
+
+ Returns
+ A resized image.
+ """
+ (rows, cols, _) = img.shape
+
+ smallest_side = min(rows, cols)
+
+ # rescale the image so the smallest side is min_side
+ scale = min_side / smallest_side
+
+ # check if the largest side is now greater than max_side, which can happen
+ # when images have a large aspect ratio
+ largest_side = max(rows, cols)
+ if largest_side * scale > max_side:
+ scale = max_side / largest_side
+
+ # resize the image with the computed scale
+ img = cv2.resize(img, None, fx=scale, fy=scale)
+
+ return img, scale
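
`resize_image` above first scales the smallest side up to `min_side`, then caps the scale so the largest side never exceeds `max_side`. Worked through for a hypothetical 480x1920 image:

```python
rows, cols = 480, 1920
min_side, max_side = 800, 1333

scale = min_side / min(rows, cols)        # 800 / 480 = 1.667
if max(rows, cols) * scale > max_side:    # 1920 * 1.667 = 3200 > 1333, so cap the scale
    scale = max_side / max(rows, cols)    # 1333 / 1920 = 0.694

print(round(scale, 3), int(rows * scale), int(cols * scale))  # 0.694 333 1333
```
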
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/keras_version.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/keras_version.py
new file mode 100644
index 0000000..1423f77
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/keras_version.py
@@ -0,0 +1,55 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import print_function
+
+import keras
+import sys
+
+minimum_keras_version = 2, 1, 3
+
+
+def keras_version():
+ """ Get the Keras version.
+
+ Returns
+ tuple of (major, minor, patch).
+ """
+ return tuple(map(int, keras.__version__.split('.')))
+
+
+def keras_version_ok():
+ """ Check if the current Keras version is higher than the minimum version.
+ """
+ return keras_version() >= minimum_keras_version
+
+
+def assert_keras_version():
+ """ Assert that the Keras version is up to date.
+ """
+ detected = keras.__version__
+ required = '.'.join(map(str, minimum_keras_version))
+ assert(keras_version() >= minimum_keras_version), 'You are using keras version {}. The minimum required version is {}.'.format(detected, required)
+
+
+def check_keras_version():
+ """ Check that the Keras version is up to date. If it isn't, print an error message and exit the script.
+ """
+ try:
+ assert_keras_version()
+ except AssertionError as e:
+ print(e, file=sys.stderr)
+ sys.exit(1)
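
The version check above relies on Python's element-wise tuple comparison, for example:

```python
print(tuple(map(int, '2.2.4'.split('.'))))               # (2, 2, 4)
print(tuple(map(int, '2.2.4'.split('.'))) >= (2, 1, 3))  # True
print((2, 0, 8) >= (2, 1, 3))                            # False
```
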
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/model.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/model.py
new file mode 100644
index 0000000..702262c
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/model.py
@@ -0,0 +1,28 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+
+def freeze(model):
+ """ Set all layers in a model to non-trainable.
+
+ The weights for these layers will not be updated during training.
+
+ This function modifies the given model in-place,
+ but it also returns the modified model to allow easy chaining with other functions.
+ """
+ for layer in model.layers:
+ layer.trainable = False
+ return model
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/transform.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/transform.py
new file mode 100644
index 0000000..1cc637c
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/transform.py
@@ -0,0 +1,273 @@
+import numpy as np
+
+DEFAULT_PRNG = np.random
+
+
+def colvec(*args):
+ """ Create a numpy array representing a column vector. """
+ return np.array([args]).T
+
+
+def transform_aabb(transform, aabb):
+ """ Apply a transformation to an axis aligned bounding box.
+
+ The result is a new AABB in the same coordinate system as the original AABB.
+ The new AABB contains all corner points of the original AABB after applying the given transformation.
+
+ Args
+ transform: The transformation to apply.
+        aabb:      The AABB as a tuple (x1, y1, x2, y2), with (x1, y1) the minimum and (x2, y2) the maximum corner.
+ Returns
+ The new AABB as tuple (x1, y1, x2, y2)
+ """
+ x1, y1, x2, y2 = aabb
+ # Transform all 4 corners of the AABB.
+ points = transform.dot([
+ [x1, x2, x1, x2],
+ [y1, y2, y2, y1],
+ [1, 1, 1, 1 ],
+ ])
+
+ # Extract the min and max corners again.
+ min_corner = points.min(axis=1)
+ max_corner = points.max(axis=1)
+
+ return [min_corner[0], min_corner[1], max_corner[0], max_corner[1]]
+
+
+def _random_vector(min, max, prng=DEFAULT_PRNG):
+ """ Construct a random vector between min and max.
+ Args
+ min: the minimum value for each component
+        max: the maximum value for each component
+        prng: the pseudo-random number generator to use.
+ """
+ min = np.array(min)
+ max = np.array(max)
+ assert min.shape == max.shape
+ assert len(min.shape) == 1
+ return prng.uniform(min, max)
+
+
+def rotation(angle):
+ """ Construct a homogeneous 2D rotation matrix.
+ Args
+ angle: the angle in radians
+ Returns
+ the rotation matrix as 3 by 3 numpy array
+ """
+ return np.array([
+ [np.cos(angle), -np.sin(angle), 0],
+ [np.sin(angle), np.cos(angle), 0],
+ [0, 0, 1]
+ ])
+
+
+def random_rotation(min, max, prng=DEFAULT_PRNG):
+ """ Construct a random rotation between -max and max.
+ Args
+ min: a scalar for the minimum absolute angle in radians
+ max: a scalar for the maximum absolute angle in radians
+ prng: the pseudo-random number generator to use.
+ Returns
+ a homogeneous 3 by 3 rotation matrix
+ """
+ return rotation(prng.uniform(min, max))
+
+
+def translation(translation):
+ """ Construct a homogeneous 2D translation matrix.
+ # Arguments
+ translation: the translation 2D vector
+ # Returns
+ the translation matrix as 3 by 3 numpy array
+ """
+ return np.array([
+ [1, 0, translation[0]],
+ [0, 1, translation[1]],
+ [0, 0, 1]
+ ])
+
+
+def random_translation(min, max, prng=DEFAULT_PRNG):
+ """ Construct a random 2D translation between min and max.
+ Args
+ min: a 2D vector with the minimum translation for each dimension
+ max: a 2D vector with the maximum translation for each dimension
+ prng: the pseudo-random number generator to use.
+ Returns
+ a homogeneous 3 by 3 translation matrix
+ """
+ return translation(_random_vector(min, max, prng))
+
+
+def shear(angle):
+ """ Construct a homogeneous 2D shear matrix.
+ Args
+ angle: the shear angle in radians
+ Returns
+ the shear matrix as 3 by 3 numpy array
+ """
+ return np.array([
+ [1, -np.sin(angle), 0],
+ [0, np.cos(angle), 0],
+ [0, 0, 1]
+ ])
+
+
+def random_shear(min, max, prng=DEFAULT_PRNG):
+ """ Construct a random 2D shear matrix with shear angle between -max and max.
+ Args
+ min: the minimum shear angle in radians.
+ max: the maximum shear angle in radians.
+ prng: the pseudo-random number generator to use.
+ Returns
+ a homogeneous 3 by 3 shear matrix
+ """
+ return shear(prng.uniform(min, max))
+
+
+def scaling(factor):
+ """ Construct a homogeneous 2D scaling matrix.
+ Args
+ factor: a 2D vector for X and Y scaling
+ Returns
+ the zoom matrix as 3 by 3 numpy array
+ """
+ return np.array([
+ [factor[0], 0, 0],
+ [0, factor[1], 0],
+ [0, 0, 1]
+ ])
+
+
+def random_scaling(min, max, prng=DEFAULT_PRNG):
+ """ Construct a random 2D scale matrix between -max and max.
+ Args
+ min: a 2D vector containing the minimum scaling factor for X and Y.
+ min: a 2D vector containing The maximum scaling factor for X and Y.
+ prng: the pseudo-random number generator to use.
+ Returns
+ a homogeneous 3 by 3 scaling matrix
+ """
+ return scaling(_random_vector(min, max, prng))
+
+
+def random_flip(flip_x_chance, flip_y_chance, prng=DEFAULT_PRNG):
+ """ Construct a transformation randomly containing X/Y flips (or not).
+ Args
+ flip_x_chance: The chance that the result will contain a flip along the X axis.
+ flip_y_chance: The chance that the result will contain a flip along the Y axis.
+ prng: The pseudo-random number generator to use.
+ Returns
+ a homogeneous 3 by 3 transformation matrix
+ """
+ flip_x = prng.uniform(0, 1) < flip_x_chance
+ flip_y = prng.uniform(0, 1) < flip_y_chance
+ # 1 - 2 * bool gives 1 for False and -1 for True.
+ return scaling((1 - 2 * flip_x, 1 - 2 * flip_y))
+
+
+def change_transform_origin(transform, center):
+ """ Create a new transform representing the same transformation,
+ only with the origin of the linear part changed.
+ Args
+ transform: the transformation matrix
+ center: the new origin of the transformation
+ Returns
+ translate(center) * transform * translate(-center)
+ """
+ center = np.array(center)
+ return np.linalg.multi_dot([translation(center), transform, translation(-center)])
+
+
+def random_transform(
+ min_rotation=0,
+ max_rotation=0,
+ min_translation=(0, 0),
+ max_translation=(0, 0),
+ min_shear=0,
+ max_shear=0,
+ min_scaling=(1, 1),
+ max_scaling=(1, 1),
+ flip_x_chance=0,
+ flip_y_chance=0,
+ prng=DEFAULT_PRNG
+):
+ """ Create a random transformation.
+
+ The transformation consists of the following operations in this order (from left to right):
+ * rotation
+ * translation
+ * shear
+ * scaling
+ * flip x (if applied)
+ * flip y (if applied)
+
+ Note that by default, the data generators in `keras_retinanet.preprocessing.generators` interpret the translation
+    as a factor of the image size. So an X translation of 0.1 would translate the image by 10% of its width.
+ Set `relative_translation` to `False` in the `TransformParameters` of a data generator to have it interpret
+ the translation directly as pixel distances instead.
+
+ Args
+ min_rotation: The minimum rotation in radians for the transform as scalar.
+ max_rotation: The maximum rotation in radians for the transform as scalar.
+ min_translation: The minimum translation for the transform as 2D column vector.
+ max_translation: The maximum translation for the transform as 2D column vector.
+ min_shear: The minimum shear angle for the transform in radians.
+ max_shear: The maximum shear angle for the transform in radians.
+ min_scaling: The minimum scaling for the transform as 2D column vector.
+ max_scaling: The maximum scaling for the transform as 2D column vector.
+ flip_x_chance: The chance (0 to 1) that a transform will contain a flip along X direction.
+ flip_y_chance: The chance (0 to 1) that a transform will contain a flip along Y direction.
+ prng: The pseudo-random number generator to use.
+ """
+ return np.linalg.multi_dot([
+ random_rotation(min_rotation, max_rotation, prng),
+ random_translation(min_translation, max_translation, prng),
+ random_shear(min_shear, max_shear, prng),
+ random_scaling(min_scaling, max_scaling, prng),
+ random_flip(flip_x_chance, flip_y_chance, prng)
+ ])
+
+
+def random_transform_generator(prng=None, **kwargs):
+ """ Create a random transform generator.
+
+ Uses a dedicated, newly created, properly seeded PRNG by default instead of the global DEFAULT_PRNG.
+
+ The transformation consists of the following operations in this order (from left to right):
+ * rotation
+ * translation
+ * shear
+ * scaling
+ * flip x (if applied)
+ * flip y (if applied)
+
+ Note that by default, the data generators in `keras_retinanet.preprocessing.generators` interpret the translation
+    as a factor of the image size. So an X translation of 0.1 would translate the image by 10% of its width.
+ Set `relative_translation` to `False` in the `TransformParameters` of a data generator to have it interpret
+ the translation directly as pixel distances instead.
+
+ Args
+ min_rotation: The minimum rotation in radians for the transform as scalar.
+ max_rotation: The maximum rotation in radians for the transform as scalar.
+ min_translation: The minimum translation for the transform as 2D column vector.
+ max_translation: The maximum translation for the transform as 2D column vector.
+ min_shear: The minimum shear angle for the transform in radians.
+ max_shear: The maximum shear angle for the transform in radians.
+ min_scaling: The minimum scaling for the transform as 2D column vector.
+ max_scaling: The maximum scaling for the transform as 2D column vector.
+ flip_x_chance: The chance (0 to 1) that a transform will contain a flip along X direction.
+ flip_y_chance: The chance (0 to 1) that a transform will contain a flip along Y direction.
+ prng: The pseudo-random number generator to use.
+ """
+
+ if prng is None:
+ # RandomState automatically seeds using the best available method.
+ prng = np.random.RandomState()
+
+ while True:
+ yield random_transform(prng=prng, **kwargs)
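For orientation, here is a minimal usage sketch of `random_transform_generator` as defined above (the seed and parameter values are illustrative, and the import path assumes this repository's layout):

```python
import numpy as np

from engine.object_detection_branch.retina_net.keras_retinanet.utils.transform import (
    random_transform_generator,
)

# Reproducible generator yielding 3x3 homogeneous transformation matrices.
generator = random_transform_generator(
    prng=np.random.RandomState(42),
    min_rotation=-0.1, max_rotation=0.1,
    min_translation=(-0.1, -0.1), max_translation=(0.1, 0.1),
    min_scaling=(0.9, 0.9), max_scaling=(1.1, 1.1),
    flip_x_chance=0.5,
)

transform = next(generator)
print(transform.shape)  # (3, 3)
```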
diff --git a/engine/object_detection_branch/retina_net/keras_retinanet/utils/visualization.py b/engine/object_detection_branch/retina_net/keras_retinanet/utils/visualization.py
new file mode 100644
index 0000000..23d3097
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/keras_retinanet/utils/visualization.py
@@ -0,0 +1,102 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import cv2
+import numpy as np
+
+from .colors import label_color
+
+
+def draw_box(image, box, color, thickness=2):
+ """ Draws a box on an image with a given color.
+
+ # Arguments
+ image : The image to draw on.
+ box : A list of 4 elements (x1, y1, x2, y2).
+ color : The color of the box.
+ thickness : The thickness of the lines to draw a box with.
+ """
+ b = np.array(box).astype(int)
+ cv2.rectangle(image, (b[0], b[1]), (b[2], b[3]), color, thickness, cv2.LINE_AA)
+
+
+def draw_caption(image, box, caption):
+ """ Draws a caption above the box in an image.
+
+ # Arguments
+ image : The image to draw on.
+ box : A list of 4 elements (x1, y1, x2, y2).
+ caption : String containing the text to draw.
+ """
+ b = np.array(box).astype(int)
+ # cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2)
+ # cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
+
+ cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (53, 42, 146), 2)
+
+
+def draw_boxes(image, boxes, color, thickness=2):
+ """ Draws boxes on an image with a given color.
+
+ # Arguments
+ image : The image to draw on.
+ boxes : A [N, 4] matrix (x1, y1, x2, y2).
+ color : The color of the boxes.
+ thickness : The thickness of the lines to draw boxes with.
+ """
+ for b in boxes:
+ draw_box(image, b, color, thickness=thickness)
+
+
+def draw_detections(image, boxes, scores, labels, color=None, label_to_name=None, score_threshold=0.5):
+ """ Draws detections in an image.
+
+ # Arguments
+ image : The image to draw on.
+ boxes : A [N, 4] matrix (x1, y1, x2, y2).
+ scores : A list of N classification scores.
+ labels : A list of N labels.
+ color : The color of the boxes. By default the color from keras_retinanet.utils.colors.label_color will be used.
+ label_to_name : (optional) Functor for mapping a label to a name.
+ score_threshold : Threshold used for determining what detections to draw.
+ """
+ selection = np.where(scores > score_threshold)[0]
+
+ for i in selection:
+ c = color if color is not None else label_color(labels[i])
+ draw_box(image, boxes[i, :], color=c)
+
+ # draw labels
+        caption = '{}: {:.2f}'.format(label_to_name(labels[i]) if label_to_name else labels[i], scores[i])
+ draw_caption(image, boxes[i, :], caption)
+
+
+def draw_annotations(image, annotations, color=(0, 255, 0), label_to_name=None):
+ """ Draws annotations in an image.
+
+ # Arguments
+ image : The image to draw on.
+ annotations : A [N, 5] matrix (x1, y1, x2, y2, label).
+        color           : The color of the boxes (green by default). Pass None to color each box by its label via keras_retinanet.utils.colors.label_color.
+ label_to_name : (optional) Functor for mapping a label to a name.
+ """
+ for a in annotations:
+ label = a[4]
+ c = color if color is not None else label_color(label)
+ caption = '{}'.format(label_to_name(label) if label_to_name else label)
+ draw_caption(image, a, caption)
+
+ draw_box(image, a, color=c)
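A small, self-contained sketch of how the drawing helpers above can be combined on a blank image (the box coordinates, caption and output file name are made up):

```python
import cv2
import numpy as np

from engine.object_detection_branch.retina_net.keras_retinanet.utils.visualization import (
    draw_box,
    draw_caption,
)

image = np.zeros((480, 640, 3), dtype=np.uint8)  # blank BGR canvas
box = [50, 60, 200, 220]                         # (x1, y1, x2, y2)

draw_box(image, box, color=(0, 255, 0))          # green rectangle
draw_caption(image, box, 'person: 0.97')         # label drawn above the box
cv2.imwrite('annotated.png', image)
```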
diff --git a/engine/object_detection_branch/retina_net/setup.cfg b/engine/object_detection_branch/retina_net/setup.cfg
new file mode 100644
index 0000000..75bf52f
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/setup.cfg
@@ -0,0 +1,11 @@
+# ignore:
+# E201 whitespace after '['
+# E202 whitespace before ']'
+# E203 whitespace before ':'
+# E221 multiple spaces before operator
+# E241 multiple spaces after ','
+# E251 unexpected spaces around keyword / parameter equals
+# E501 line too long (85 > 79 characters)
+[tool:pytest]
+flake8-max-line-length = 127
+flake8-ignore = E201 E202 E203 E221 E241 E251 E402 E501
diff --git a/engine/object_detection_branch/retina_net/setup.py b/engine/object_detection_branch/retina_net/setup.py
new file mode 100644
index 0000000..d3f4e2a
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/setup.py
@@ -0,0 +1,22 @@
+import setuptools
+
+setuptools.setup(
+ name='keras-retinanet',
+ version='0.3.1',
+ description='Keras implementation of RetinaNet object detection.',
+ url='https://github.com/fizyr/keras-retinanet',
+ author='Hans Gaiser',
+ author_email='h.gaiser@fizyr.com',
+ maintainer='Hans Gaiser',
+ maintainer_email='h.gaiser@fizyr.com',
+ packages=setuptools.find_packages(),
+ install_requires=['keras', 'keras-resnet', 'six', 'scipy'],
+ entry_points = {
+ 'console_scripts': [
+ 'retinanet-train=keras_retinanet.bin.train:main',
+ 'retinanet-evaluate=keras_retinanet.bin.evaluate:main',
+ 'retinanet-debug=keras_retinanet.bin.debug:main',
+ 'retinanet-convert-model=keras_retinanet.bin.convert_model:main',
+ ],
+ }
+)
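The console scripts registered above wrap module-level `main` functions, so the same functionality can also be invoked programmatically; a sketch mirroring the CSV training arguments used in the test suite further below (the CSV paths are placeholders):

```python
from keras_retinanet.bin import train

# Equivalent of: retinanet-train --epochs=1 --steps=1 --no-weights --no-snapshots csv ...
train.main([
    '--epochs=1',
    '--steps=1',
    '--no-weights',
    '--no-snapshots',
    'csv',
    'path/to/annotations.csv',
    'path/to/classes.csv',
])
```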
diff --git a/engine/object_detection_branch/retina_net/single_img_inference.py b/engine/object_detection_branch/retina_net/single_img_inference.py
new file mode 100644
index 0000000..57e3cfe
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/single_img_inference.py
@@ -0,0 +1,129 @@
+from __future__ import print_function
+
+import matplotlib.pyplot as plt
+# import miscellaneous modules
+import numpy as np
+
+import cv2
+from engine.object_detection_branch.retina_net.keras_retinanet import models
+from keras.utils.data_utils import get_file
+
+from engine.object_detection_branch.retina_net.keras_retinanet.utils.visualization import draw_box, draw_caption
+from engine.object_detection_branch.retina_net.keras_retinanet.utils.colors import label_color
+
+from engine.object_detection_branch.retina_net.keras_retinanet.utils.image import read_image_bgr, preprocess_image, resize_image
+
+MODEL_PATH = 'https://github.com/GKalliatakis/Keras-EMOTIC-resources/releases/download/v1.0.2/resnet50_coco_best_v2.1.0.h5'
+
+
+def RetinaNet_single_img_detection(img_path,
+ imshow = False):
+
+ # load the downloaded/trained model
+ model_path = get_file('resnet50_coco_best_v2.h5',
+ MODEL_PATH,
+ cache_subdir='EMOTIC/object_detectors')
+
+ # load RetinaNet model
+ model = models.load_model(model_path, backbone_name='resnet50')
+
+ # load label to names mapping
+ labels_to_names = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
+ 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
+ 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+ 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
+ 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+ 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat',
+ 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
+ 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
+ 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
+ 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv',
+ 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave',
+ 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase',
+ 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}
+
+ # load image
+ image = read_image_bgr(img_path)
+
+
+ if imshow:
+ draw = image.copy()
+ draw = cv2.cvtColor(draw, cv2.COLOR_BGR2RGB)
+
+ # preprocess image for network
+ image = preprocess_image(image)
+ image, scale = resize_image(image)
+
+ # process image
+ boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))
+
+ # correct for image scale
+ boxes /= scale
+
+
+ persons_counter = 0
+    # loop over the detections to count how many `person` objects score above 0.5
+ for box, score, label in zip(boxes[0], scores[0], labels[0]):
+ # scores are sorted so we can break
+ if score < 0.5:
+ break
+
+ # decode predicted labels
+ decoded_label = "{}".format(labels_to_names[label])
+
+ if decoded_label == 'person':
+ # print('[INFO] `person` was detected')
+ persons_counter += 1
+
+ # b = box.astype(int)
+
+ # TODO must handle the cases where no `person` object class was detected
+ # else:
+ # print ('[INFO] No `person` was detected')
+
+
+ final_array = np.empty([persons_counter, 4])
+ counter = 0
+
+ for box, score, label in zip(boxes[0], scores[0], labels[0]):
+
+        # stop once boxes for all counted `person` detections have been stored
+        if counter >= persons_counter:
+            break
+
+ # scores are sorted so we can break
+ if score < 0.5:
+ break
+
+ # decode predicted labels
+ decoded_label = "{}".format(labels_to_names[label])
+
+ if decoded_label == 'person':
+
+ b = box.astype(int)
+ final_array[counter][0] = b[0]
+ final_array[counter][1] = b[1]
+ final_array[counter][2] = b[2]
+ final_array[counter][3] = b[3]
+
+ counter += 1
+
+ if imshow:
+ color = label_color(label)
+ draw_box(draw, b, color=color)
+ caption = "{} {:.3f}".format(labels_to_names[label], score)
+ draw_caption(draw, b, caption)
+
+
+
+ if imshow:
+ plt.figure(figsize=(20, 12))
+ plt.axis('off')
+ plt.imshow(draw)
+ plt.show()
+
+
+ return final_array, persons_counter
+
+
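Illustrative call of the detector above (the image path is hypothetical); it returns an [N, 4] array of `person` boxes in (x1, y1, x2, y2) order together with the person count:

```python
from engine.object_detection_branch.retina_net.single_img_inference import (
    RetinaNet_single_img_detection,
)

boxes, num_persons = RetinaNet_single_img_detection('examples/street.jpg', imshow=False)
print('Detected {} person(s)'.format(num_persons))
for x1, y1, x2, y2 in boxes:
    print('person box: ({:.0f}, {:.0f}, {:.0f}, {:.0f})'.format(x1, y1, x2, y2))
```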
diff --git a/engine/object_detection_branch/retina_net/tests/__init__.py b/engine/object_detection_branch/retina_net/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/tests/backend/__init__.py b/engine/object_detection_branch/retina_net/tests/backend/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/tests/backend/test_common.py b/engine/object_detection_branch/retina_net/tests/backend/test_common.py
new file mode 100644
index 0000000..a89ea2b
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/backend/test_common.py
@@ -0,0 +1,124 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+
+import keras
+import keras_retinanet.backend
+
+
+def test_bbox_transform_inv():
+ boxes = np.array([[
+ [100, 100, 200, 200],
+ [100, 100, 300, 300],
+ [100, 100, 200, 300],
+ [100, 100, 300, 200],
+ [80, 120, 200, 200],
+ [80, 120, 300, 300],
+ [80, 120, 200, 300],
+ [80, 120, 300, 200],
+ ]])
+ boxes = keras.backend.variable(boxes)
+
+ deltas = np.array([[
+ [0 , 0 , 0 , 0 ],
+ [0 , 0.1, 0 , 0 ],
+ [-0.3, 0 , 0 , 0 ],
+ [0.2 , 0.2, 0 , 0 ],
+ [0 , 0 , 0.1 , 0 ],
+ [0 , 0 , 0 , -0.3],
+ [0 , 0 , 0.2 , 0.2 ],
+ [0.1 , 0.2, -0.3, 0.4 ],
+ ]])
+ deltas = keras.backend.variable(deltas)
+
+ expected = np.array([[
+ [100 , 100 , 200 , 200 ],
+ [100 , 104 , 300 , 300 ],
+ [ 94 , 100 , 200 , 300 ],
+ [108 , 104 , 300 , 200 ],
+ [ 80 , 120 , 202.4 , 200 ],
+ [ 80 , 120 , 300 , 289.2],
+ [ 80 , 120 , 204.8 , 307.2],
+ [ 84.4, 123.2, 286.8 , 206.4]
+ ]])
+
+ result = keras_retinanet.backend.bbox_transform_inv(boxes, deltas)
+ result = keras.backend.eval(result)
+
+ np.testing.assert_array_almost_equal(result, expected, decimal=2)
+
+
+def test_shift():
+ shape = (2, 3)
+ stride = 8
+
+ anchors = np.array([
+ [-8, -8, 8, 8],
+ [-16, -16, 16, 16],
+ [-12, -12, 12, 12],
+ [-12, -16, 12, 16],
+ [-16, -12, 16, 12]
+ ], dtype=keras.backend.floatx())
+
+ expected = [
+ # anchors for (0, 0)
+ [4 - 8, 4 - 8, 4 + 8, 4 + 8],
+ [4 - 16, 4 - 16, 4 + 16, 4 + 16],
+ [4 - 12, 4 - 12, 4 + 12, 4 + 12],
+ [4 - 12, 4 - 16, 4 + 12, 4 + 16],
+ [4 - 16, 4 - 12, 4 + 16, 4 + 12],
+
+ # anchors for (0, 1)
+ [12 - 8, 4 - 8, 12 + 8, 4 + 8],
+ [12 - 16, 4 - 16, 12 + 16, 4 + 16],
+ [12 - 12, 4 - 12, 12 + 12, 4 + 12],
+ [12 - 12, 4 - 16, 12 + 12, 4 + 16],
+ [12 - 16, 4 - 12, 12 + 16, 4 + 12],
+
+ # anchors for (0, 2)
+ [20 - 8, 4 - 8, 20 + 8, 4 + 8],
+ [20 - 16, 4 - 16, 20 + 16, 4 + 16],
+ [20 - 12, 4 - 12, 20 + 12, 4 + 12],
+ [20 - 12, 4 - 16, 20 + 12, 4 + 16],
+ [20 - 16, 4 - 12, 20 + 16, 4 + 12],
+
+ # anchors for (1, 0)
+ [4 - 8, 12 - 8, 4 + 8, 12 + 8],
+ [4 - 16, 12 - 16, 4 + 16, 12 + 16],
+ [4 - 12, 12 - 12, 4 + 12, 12 + 12],
+ [4 - 12, 12 - 16, 4 + 12, 12 + 16],
+ [4 - 16, 12 - 12, 4 + 16, 12 + 12],
+
+ # anchors for (1, 1)
+ [12 - 8, 12 - 8, 12 + 8, 12 + 8],
+ [12 - 16, 12 - 16, 12 + 16, 12 + 16],
+ [12 - 12, 12 - 12, 12 + 12, 12 + 12],
+ [12 - 12, 12 - 16, 12 + 12, 12 + 16],
+ [12 - 16, 12 - 12, 12 + 16, 12 + 12],
+
+ # anchors for (1, 2)
+ [20 - 8, 12 - 8, 20 + 8, 12 + 8],
+ [20 - 16, 12 - 16, 20 + 16, 12 + 16],
+ [20 - 12, 12 - 12, 20 + 12, 12 + 12],
+ [20 - 12, 12 - 16, 20 + 12, 12 + 16],
+ [20 - 16, 12 - 12, 20 + 16, 12 + 12],
+ ]
+
+ result = keras_retinanet.backend.shift(shape, stride, anchors)
+ result = keras.backend.eval(result)
+
+ np.testing.assert_array_equal(result, expected)
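The expected values in `test_bbox_transform_inv` follow from the box parameterisation used here (assuming the default normalisation of mean 0 and std 0.2): each delta is multiplied by the std and by the box width (x-coordinates) or height (y-coordinates), then added to the corresponding coordinate. A quick numeric check of the sixth row:

```python
import numpy as np

box = np.array([80.0, 120.0, 300.0, 300.0])   # x1, y1, x2, y2
delta = np.array([0.0, 0.0, 0.0, -0.3])
width, height = box[2] - box[0], box[3] - box[1]

pred = box + 0.2 * delta * np.array([width, height, width, height])
print(pred)  # [ 80.  120.  300.  289.2] -- matches the expected row above
```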
diff --git a/engine/object_detection_branch/retina_net/tests/bin/__init__.py b/engine/object_detection_branch/retina_net/tests/bin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/tests/bin/test_train.py b/engine/object_detection_branch/retina_net/tests/bin/test_train.py
new file mode 100644
index 0000000..fba0c52
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/bin/test_train.py
@@ -0,0 +1,82 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import warnings
+
+import keras_retinanet.bin.train
+
+
+def test_coco():
+ # ignore warnings in this test
+ warnings.simplefilter('ignore')
+
+ # run training / evaluation
+ keras_retinanet.bin.train.main([
+ '--epochs=1',
+ '--steps=1',
+ '--no-weights',
+ '--no-snapshots',
+ 'coco',
+ 'tests/test-data/coco',
+ ])
+
+
+def test_pascal():
+ # ignore warnings in this test
+ warnings.simplefilter('ignore')
+
+ # run training / evaluation
+ keras_retinanet.bin.train.main([
+ '--epochs=1',
+ '--steps=1',
+ '--no-weights',
+ '--no-snapshots',
+ 'pascal',
+ 'tests/test-data/pascal',
+ ])
+
+
+def test_csv():
+ # ignore warnings in this test
+ warnings.simplefilter('ignore')
+
+ # run training / evaluation
+ keras_retinanet.bin.train.main([
+ '--epochs=1',
+ '--steps=1',
+ '--no-weights',
+ '--no-snapshots',
+ 'csv',
+ 'tests/test-data/csv/annotations.csv',
+ 'tests/test-data/csv/classes.csv',
+ ])
+
+
+def test_vgg():
+ # ignore warnings in this test
+ warnings.simplefilter('ignore')
+
+ # run training / evaluation
+ keras_retinanet.bin.train.main([
+ '--backbone=vgg16',
+ '--epochs=1',
+ '--steps=1',
+ '--no-weights',
+ '--no-snapshots',
+ '--freeze-backbone',
+ 'coco',
+ 'tests/test-data/coco',
+ ])
diff --git a/engine/object_detection_branch/retina_net/tests/layers/__init__.py b/engine/object_detection_branch/retina_net/tests/layers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/tests/layers/test_filter_detections.py b/engine/object_detection_branch/retina_net/tests/layers/test_filter_detections.py
new file mode 100644
index 0000000..fd1018f
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/layers/test_filter_detections.py
@@ -0,0 +1,172 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+
+import keras
+import keras_retinanet.layers
+
+
+class TestFilterDetections(object):
+ def test_simple(self):
+ # create simple FilterDetections layer
+ filter_detections_layer = keras_retinanet.layers.FilterDetections()
+
+ # create simple input
+ boxes = np.array([[
+ [0, 0, 10, 10],
+ [0, 0, 10, 10], # this will be suppressed
+ ]], dtype=keras.backend.floatx())
+ boxes = keras.backend.variable(boxes)
+
+ classification = np.array([[
+ [0, 0.9], # this will be suppressed
+ [0, 1],
+ ]], dtype=keras.backend.floatx())
+ classification = keras.backend.variable(classification)
+
+ # compute output
+ actual_boxes, actual_scores, actual_labels = filter_detections_layer.call([boxes, classification])
+ actual_boxes = keras.backend.eval(actual_boxes)
+ actual_scores = keras.backend.eval(actual_scores)
+ actual_labels = keras.backend.eval(actual_labels)
+
+ # define expected output
+ expected_boxes = -1 * np.ones((1, 300, 4), dtype=keras.backend.floatx())
+ expected_boxes[0, 0, :] = [0, 0, 10, 10]
+
+ expected_scores = -1 * np.ones((1, 300), dtype=keras.backend.floatx())
+ expected_scores[0, 0] = 1
+
+ expected_labels = -1 * np.ones((1, 300), dtype=keras.backend.floatx())
+ expected_labels[0, 0] = 1
+
+ # assert actual and expected are equal
+ np.testing.assert_array_equal(actual_boxes, expected_boxes)
+ np.testing.assert_array_equal(actual_scores, expected_scores)
+ np.testing.assert_array_equal(actual_labels, expected_labels)
+
+ def test_simple_with_other(self):
+ # create simple FilterDetections layer
+ filter_detections_layer = keras_retinanet.layers.FilterDetections()
+
+ # create simple input
+ boxes = np.array([[
+ [0, 0, 10, 10],
+ [0, 0, 10, 10], # this will be suppressed
+ ]], dtype=keras.backend.floatx())
+ boxes = keras.backend.variable(boxes)
+
+ classification = np.array([[
+ [0, 0.9], # this will be suppressed
+ [0, 1],
+ ]], dtype=keras.backend.floatx())
+ classification = keras.backend.variable(classification)
+
+ other = []
+ other.append(np.array([[
+ [0, 1234], # this will be suppressed
+ [0, 5678],
+ ]], dtype=keras.backend.floatx()))
+ other.append(np.array([[
+ 5678, # this will be suppressed
+ 1234,
+ ]], dtype=keras.backend.floatx()))
+ other = [keras.backend.variable(o) for o in other]
+
+ # compute output
+ actual = filter_detections_layer.call([boxes, classification] + other)
+ actual_boxes = keras.backend.eval(actual[0])
+ actual_scores = keras.backend.eval(actual[1])
+ actual_labels = keras.backend.eval(actual[2])
+ actual_other = [keras.backend.eval(a) for a in actual[3:]]
+
+ # define expected output
+ expected_boxes = -1 * np.ones((1, 300, 4), dtype=keras.backend.floatx())
+ expected_boxes[0, 0, :] = [0, 0, 10, 10]
+
+ expected_scores = -1 * np.ones((1, 300), dtype=keras.backend.floatx())
+ expected_scores[0, 0] = 1
+
+ expected_labels = -1 * np.ones((1, 300), dtype=keras.backend.floatx())
+ expected_labels[0, 0] = 1
+
+ expected_other = []
+ expected_other.append(-1 * np.ones((1, 300, 2), dtype=keras.backend.floatx()))
+ expected_other[-1][0, 0, :] = [0, 5678]
+ expected_other.append(-1 * np.ones((1, 300), dtype=keras.backend.floatx()))
+ expected_other[-1][0, 0] = 1234
+
+ # assert actual and expected are equal
+ np.testing.assert_array_equal(actual_boxes, expected_boxes)
+ np.testing.assert_array_equal(actual_scores, expected_scores)
+ np.testing.assert_array_equal(actual_labels, expected_labels)
+
+ for a, e in zip(actual_other, expected_other):
+ np.testing.assert_array_equal(a, e)
+
+ def test_mini_batch(self):
+ # create simple FilterDetections layer
+ filter_detections_layer = keras_retinanet.layers.FilterDetections()
+
+ # create input with batch_size=2
+ boxes = np.array([
+ [
+ [0, 0, 10, 10], # this will be suppressed
+ [0, 0, 10, 10],
+ ],
+ [
+ [100, 100, 150, 150],
+ [100, 100, 150, 150], # this will be suppressed
+ ],
+ ], dtype=keras.backend.floatx())
+ boxes = keras.backend.variable(boxes)
+
+ classification = np.array([
+ [
+ [0, 0.9], # this will be suppressed
+ [0, 1],
+ ],
+ [
+ [1, 0],
+ [0.9, 0], # this will be suppressed
+ ],
+ ], dtype=keras.backend.floatx())
+ classification = keras.backend.variable(classification)
+
+ # compute output
+ actual_boxes, actual_scores, actual_labels = filter_detections_layer.call([boxes, classification])
+ actual_boxes = keras.backend.eval(actual_boxes)
+ actual_scores = keras.backend.eval(actual_scores)
+ actual_labels = keras.backend.eval(actual_labels)
+
+ # define expected output
+ expected_boxes = -1 * np.ones((2, 300, 4), dtype=keras.backend.floatx())
+ expected_boxes[0, 0, :] = [0, 0, 10, 10]
+ expected_boxes[1, 0, :] = [100, 100, 150, 150]
+
+ expected_scores = -1 * np.ones((2, 300), dtype=keras.backend.floatx())
+ expected_scores[0, 0] = 1
+ expected_scores[1, 0] = 1
+
+ expected_labels = -1 * np.ones((2, 300), dtype=keras.backend.floatx())
+ expected_labels[0, 0] = 1
+ expected_labels[1, 0] = 0
+
+ # assert actual and expected are equal
+ np.testing.assert_array_equal(actual_boxes, expected_boxes)
+ np.testing.assert_array_equal(actual_scores, expected_scores)
+ np.testing.assert_array_equal(actual_labels, expected_labels)
diff --git a/engine/object_detection_branch/retina_net/tests/layers/test_misc.py b/engine/object_detection_branch/retina_net/tests/layers/test_misc.py
new file mode 100644
index 0000000..a91f659
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/layers/test_misc.py
@@ -0,0 +1,211 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+
+import keras
+import keras_retinanet.layers
+
+
+class TestAnchors(object):
+ def test_simple(self):
+ # create simple Anchors layer
+ anchors_layer = keras_retinanet.layers.Anchors(
+ size=32,
+ stride=8,
+ ratios=np.array([1], keras.backend.floatx()),
+ scales=np.array([1], keras.backend.floatx()),
+ )
+
+ # create fake features input (only shape is used anyway)
+ features = np.zeros((1, 2, 2, 1024), dtype=keras.backend.floatx())
+ features = keras.backend.variable(features)
+
+ # call the Anchors layer
+ anchors = anchors_layer.call(features)
+ anchors = keras.backend.eval(anchors)
+
+ # expected anchor values
+ expected = np.array([[
+ [-12, -12, 20, 20],
+ [-4 , -12, 28, 20],
+ [-12, -4 , 20, 28],
+ [-4 , -4 , 28, 28],
+ ]], dtype=keras.backend.floatx())
+
+ # test anchor values
+ np.testing.assert_array_equal(anchors, expected)
+
+    # same as test_simple, but with batch_size=2
+ def test_mini_batch(self):
+ # create simple Anchors layer
+ anchors_layer = keras_retinanet.layers.Anchors(
+ size=32,
+ stride=8,
+ ratios=np.array([1], dtype=keras.backend.floatx()),
+ scales=np.array([1], dtype=keras.backend.floatx()),
+ )
+
+ # create fake features input with batch_size=2
+ features = np.zeros((2, 2, 2, 1024), dtype=keras.backend.floatx())
+ features = keras.backend.variable(features)
+
+ # call the Anchors layer
+ anchors = anchors_layer.call(features)
+ anchors = keras.backend.eval(anchors)
+
+ # expected anchor values
+ expected = np.array([[
+ [-12, -12, 20, 20],
+ [-4 , -12, 28, 20],
+ [-12, -4 , 20, 28],
+ [-4 , -4 , 28, 28],
+ ]], dtype=keras.backend.floatx())
+ expected = np.tile(expected, (2, 1, 1))
+
+ # test anchor values
+ np.testing.assert_array_equal(anchors, expected)
+
+
+class TestUpsampleLike(object):
+ def test_simple(self):
+ # create simple UpsampleLike layer
+ upsample_like_layer = keras_retinanet.layers.UpsampleLike()
+
+ # create input source
+ source = np.zeros((1, 2, 2, 1), dtype=keras.backend.floatx())
+ source = keras.backend.variable(source)
+ target = np.zeros((1, 5, 5, 1), dtype=keras.backend.floatx())
+ expected = target
+ target = keras.backend.variable(target)
+
+ # compute output
+ actual = upsample_like_layer.call([source, target])
+ actual = keras.backend.eval(actual)
+
+ np.testing.assert_array_equal(actual, expected)
+
+ def test_mini_batch(self):
+ # create simple UpsampleLike layer
+ upsample_like_layer = keras_retinanet.layers.UpsampleLike()
+
+ # create input source
+ source = np.zeros((2, 2, 2, 1), dtype=keras.backend.floatx())
+ source = keras.backend.variable(source)
+
+ target = np.zeros((2, 5, 5, 1), dtype=keras.backend.floatx())
+ expected = target
+ target = keras.backend.variable(target)
+
+ # compute output
+ actual = upsample_like_layer.call([source, target])
+ actual = keras.backend.eval(actual)
+
+ np.testing.assert_array_equal(actual, expected)
+
+
+class TestRegressBoxes(object):
+ def test_simple(self):
+ mean = [0, 0, 0, 0]
+ std = [0.2, 0.2, 0.2, 0.2]
+
+ # create simple RegressBoxes layer
+ regress_boxes_layer = keras_retinanet.layers.RegressBoxes(mean=mean, std=std)
+
+ # create input
+ anchors = np.array([[
+ [0 , 0 , 10 , 10 ],
+ [50, 50, 100, 100],
+ [20, 20, 40 , 40 ],
+ ]], dtype=keras.backend.floatx())
+ anchors = keras.backend.variable(anchors)
+
+ regression = np.array([[
+ [0 , 0 , 0 , 0 ],
+ [0.1, 0.1, 0 , 0 ],
+ [0 , 0 , 0.1, 0.1],
+ ]], dtype=keras.backend.floatx())
+ regression = keras.backend.variable(regression)
+
+ # compute output
+ actual = regress_boxes_layer.call([anchors, regression])
+ actual = keras.backend.eval(actual)
+
+ # compute expected output
+ expected = np.array([[
+ [0 , 0 , 10 , 10 ],
+ [51, 51, 100 , 100 ],
+ [20, 20, 40.4, 40.4],
+ ]], dtype=keras.backend.floatx())
+
+ np.testing.assert_array_almost_equal(actual, expected, decimal=2)
+
+    # same as test_simple, but with batch_size=2
+ def test_mini_batch(self):
+ mean = [0, 0, 0, 0]
+ std = [0.2, 0.2, 0.2, 0.2]
+
+ # create simple RegressBoxes layer
+ regress_boxes_layer = keras_retinanet.layers.RegressBoxes(mean=mean, std=std)
+
+ # create input
+ anchors = np.array([
+ [
+ [0 , 0 , 10 , 10 ], # 1
+ [50, 50, 100, 100], # 2
+ [20, 20, 40 , 40 ], # 3
+ ],
+ [
+ [20, 20, 40 , 40 ], # 3
+ [0 , 0 , 10 , 10 ], # 1
+ [50, 50, 100, 100], # 2
+ ],
+ ], dtype=keras.backend.floatx())
+ anchors = keras.backend.variable(anchors)
+
+ regression = np.array([
+ [
+ [0 , 0 , 0 , 0 ], # 1
+ [0.1, 0.1, 0 , 0 ], # 2
+ [0 , 0 , 0.1, 0.1], # 3
+ ],
+ [
+ [0 , 0 , 0.1, 0.1], # 3
+ [0 , 0 , 0 , 0 ], # 1
+ [0.1, 0.1, 0 , 0 ], # 2
+ ],
+ ], dtype=keras.backend.floatx())
+ regression = keras.backend.variable(regression)
+
+ # compute output
+ actual = regress_boxes_layer.call([anchors, regression])
+ actual = keras.backend.eval(actual)
+
+ # compute expected output
+ expected = np.array([
+ [
+ [0 , 0 , 10 , 10 ], # 1
+ [51, 51, 100 , 100 ], # 2
+ [20, 20, 40.4, 40.4], # 3
+ ],
+ [
+ [20, 20, 40.4, 40.4], # 3
+ [0 , 0 , 10 , 10 ], # 1
+ [51, 51, 100 , 100 ], # 2
+ ],
+ ], dtype=keras.backend.floatx())
+
+ np.testing.assert_array_almost_equal(actual, expected, decimal=2)
diff --git a/engine/object_detection_branch/retina_net/tests/models/__init__.py b/engine/object_detection_branch/retina_net/tests/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/tests/models/test_densenet.py b/engine/object_detection_branch/retina_net/tests/models/test_densenet.py
new file mode 100644
index 0000000..ac7f60a
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/models/test_densenet.py
@@ -0,0 +1,52 @@
+"""
+Copyright 2018 vidosits (https://github.com/vidosits/)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+import warnings
+
+import keras
+import pytest
+from keras_retinanet import losses
+from keras_retinanet.models.densenet import DenseNetBackbone
+
+parameters = ['densenet121']
+
+
+@pytest.mark.parametrize("backbone", parameters)
+def test_backbone(backbone):
+ # ignore warnings in this test
+ warnings.simplefilter('ignore')
+
+ num_classes = 10
+
+ inputs = np.zeros((1, 200, 400, 3), dtype=np.float32)
+ targets = [np.zeros((1, 14814, 5), dtype=np.float32), np.zeros((1, 14814, num_classes))]
+
+ inp = keras.layers.Input(inputs[0].shape)
+
+ densenet_backbone = DenseNetBackbone(backbone)
+ model = densenet_backbone.retinanet(num_classes=num_classes, inputs=inp)
+ model.summary()
+
+ # compile model
+ model.compile(
+ loss={
+ 'regression': losses.smooth_l1(),
+ 'classification': losses.focal()
+ },
+ optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001))
+
+ model.fit(inputs, targets, batch_size=1)
diff --git a/engine/object_detection_branch/retina_net/tests/models/test_mobilenet.py b/engine/object_detection_branch/retina_net/tests/models/test_mobilenet.py
new file mode 100644
index 0000000..dd29b06
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/models/test_mobilenet.py
@@ -0,0 +1,57 @@
+"""
+Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+import warnings
+
+import keras
+import pytest
+from keras_retinanet import losses
+from keras_retinanet.models.mobilenet import MobileNetBackbone
+
+alphas = ['1.0']
+parameters = []
+
+for backbone in MobileNetBackbone.allowed_backbones:
+ for alpha in alphas:
+ parameters.append((backbone, alpha))
+
+
+@pytest.mark.parametrize("backbone, alpha", parameters)
+def test_backbone(backbone, alpha):
+ # ignore warnings in this test
+ warnings.simplefilter('ignore')
+
+ num_classes = 10
+
+ inputs = np.zeros((1, 1024, 363, 3), dtype=np.float32)
+ targets = [np.zeros((1, 70776, 5), dtype=np.float32), np.zeros((1, 70776, num_classes))]
+
+ inp = keras.layers.Input(inputs[0].shape)
+
+    mobilenet_backbone = MobileNetBackbone(backbone='{}_{}'.format(backbone, alpha))
+ training_model = mobilenet_backbone.retinanet(num_classes=num_classes, inputs=inp)
+ training_model.summary()
+
+ # compile model
+ training_model.compile(
+ loss={
+ 'regression': losses.smooth_l1(),
+ 'classification': losses.focal()
+ },
+ optimizer=keras.optimizers.adam(lr=1e-5, clipnorm=0.001))
+
+ training_model.fit(inputs, targets, batch_size=1)
diff --git a/engine/object_detection_branch/retina_net/tests/preprocessing/__init__.py b/engine/object_detection_branch/retina_net/tests/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/tests/preprocessing/test_csv_generator.py b/engine/object_detection_branch/retina_net/tests/preprocessing/test_csv_generator.py
new file mode 100644
index 0000000..8524818
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/preprocessing/test_csv_generator.py
@@ -0,0 +1,216 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import csv
+
+import pytest
+try:
+ from io import StringIO
+except ImportError:
+ from stringio import StringIO
+
+from keras_retinanet.preprocessing import csv_generator
+
+
+def csv_str(string):
+ if str == bytes:
+ string = string.decode('utf-8')
+ return csv.reader(StringIO(string))
+
+
+def annotation(x1, y1, x2, y2, class_name):
+ return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2, 'class': class_name}
+
+
+def test_read_classes():
+ assert csv_generator._read_classes(csv_str('')) == {}
+ assert csv_generator._read_classes(csv_str('a,1')) == {'a': 1}
+ assert csv_generator._read_classes(csv_str('a,1\nb,2')) == {'a': 1, 'b': 2}
+
+
+def test_read_classes_wrong_format():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_classes(csv_str('a,b,c'))
+ except ValueError as e:
+ assert str(e).startswith('line 1: format should be')
+ raise
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_classes(csv_str('a,1\nb,c,d'))
+ except ValueError as e:
+ assert str(e).startswith('line 2: format should be')
+ raise
+
+
+def test_read_classes_malformed_class_id():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_classes(csv_str('a,b'))
+ except ValueError as e:
+ assert str(e).startswith("line 1: malformed class ID:")
+ raise
+
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_classes(csv_str('a,1\nb,c'))
+ except ValueError as e:
+ assert str(e).startswith('line 2: malformed class ID:')
+ raise
+
+
+def test_read_classes_duplicate_name():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_classes(csv_str('a,1\nb,2\na,3'))
+ except ValueError as e:
+ assert str(e).startswith('line 3: duplicate class name')
+ raise
+
+
+def test_read_annotations():
+ classes = {'a': 1, 'b': 2, 'c': 4, 'd': 10}
+ annotations = csv_generator._read_annotations(csv_str(
+ 'a.png,0,1,2,3,a' '\n'
+ 'b.png,4,5,6,7,b' '\n'
+ 'c.png,8,9,10,11,c' '\n'
+ 'd.png,12,13,14,15,d' '\n'
+ ), classes)
+ assert annotations == {
+ 'a.png': [annotation( 0, 1, 2, 3, 'a')],
+ 'b.png': [annotation( 4, 5, 6, 7, 'b')],
+ 'c.png': [annotation( 8, 9, 10, 11, 'c')],
+ 'd.png': [annotation(12, 13, 14, 15, 'd')],
+ }
+
+
+def test_read_annotations_multiple():
+ classes = {'a': 1, 'b': 2, 'c': 4, 'd': 10}
+ annotations = csv_generator._read_annotations(csv_str(
+ 'a.png,0,1,2,3,a' '\n'
+ 'b.png,4,5,6,7,b' '\n'
+ 'a.png,8,9,10,11,c' '\n'
+ ), classes)
+ assert annotations == {
+ 'a.png': [
+ annotation(0, 1, 2, 3, 'a'),
+ annotation(8, 9, 10, 11, 'c'),
+ ],
+ 'b.png': [annotation(4, 5, 6, 7, 'b')],
+ }
+
+
+def test_read_annotations_wrong_format():
+ classes = {'a': 1, 'b': 2, 'c': 4, 'd': 10}
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,1,2,3,a'), classes)
+ except ValueError as e:
+ assert str(e).startswith("line 1: format should be")
+ raise
+
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str(
+ 'a.png,0,1,2,3,a' '\n'
+ 'a.png,1,2,3,a' '\n'
+ ), classes)
+ except ValueError as e:
+ assert str(e).startswith("line 2: format should be")
+ raise
+
+
+def test_read_annotations_wrong_x1():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,a,0,1,2,a'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: malformed x1:")
+ raise
+
+
+def test_read_annotations_wrong_y1():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,0,a,1,2,a'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: malformed y1:")
+ raise
+
+
+def test_read_annotations_wrong_x2():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,0,1,a,2,a'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: malformed x2:")
+ raise
+
+
+def test_read_annotations_wrong_y2():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,0,1,2,a,a'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: malformed y2:")
+ raise
+
+
+def test_read_annotations_wrong_class():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,0,1,2,3,g'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: unknown class name:")
+ raise
+
+
+def test_read_annotations_invalid_bb_x():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,1,2,1,3,g'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: x2 (1) must be higher than x1 (1)")
+ raise
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,9,2,5,3,g'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: x2 (5) must be higher than x1 (9)")
+ raise
+
+
+def test_read_annotations_invalid_bb_y():
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,1,2,3,2,a'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: y2 (2) must be higher than y1 (2)")
+ raise
+ with pytest.raises(ValueError):
+ try:
+ csv_generator._read_annotations(csv_str('a.png,1,8,3,5,a'), {'a': 1})
+ except ValueError as e:
+ assert str(e).startswith("line 1: y2 (5) must be higher than y1 (8)")
+ raise
+
+
+def test_read_annotations_empty_image():
+ # Check that images without annotations are parsed.
+ assert csv_generator._read_annotations(csv_str('a.png,,,,,\nb.png,,,,,'), {'a': 1}) == {'a.png': [], 'b.png': []}
+
+ # Check that lines without annotations don't clear earlier annotations.
+ assert csv_generator._read_annotations(csv_str('a.png,0,1,2,3,a\na.png,,,,,'), {'a': 1}) == {'a.png': [annotation(0, 1, 2, 3, 'a')]}
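For reference, the CSV formats exercised by the tests above look like this (file and class names are made up):

```python
# annotations CSV: one box per line as path,x1,y1,x2,y2,class_name;
# empty fields mark an image without annotations.
example_annotations = (
    'images/img_001.jpg,10,20,110,220,person\n'
    'images/img_001.jpg,300,40,380,120,car\n'
    'images/img_002.jpg,,,,,\n'
)

# classes CSV: one class per line as class_name,id.
example_classes = (
    'person,0\n'
    'car,1\n'
)
```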
diff --git a/engine/object_detection_branch/retina_net/tests/preprocessing/test_generator.py b/engine/object_detection_branch/retina_net/tests/preprocessing/test_generator.py
new file mode 100644
index 0000000..5cd62aa
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/preprocessing/test_generator.py
@@ -0,0 +1,181 @@
+"""
+Copyright 2017-2018 Fizyr (https://fizyr.com)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import numpy as np
+
+import keras.backend
+import pytest
+from keras_retinanet.preprocessing.generator import Generator
+
+
+class SimpleGenerator(Generator):
+ def __init__(self, annotations_group, num_classes=0, image=None):
+ self.annotations_group = annotations_group
+ self.num_classes_ = num_classes
+ self.image = image
+ super(SimpleGenerator, self).__init__(group_method='none', shuffle_groups=False)
+
+ def num_classes(self):
+ return self.num_classes_
+
+ def load_image(self, image_index):
+ return self.image
+
+ def size(self):
+ return len(self.annotations_group)
+
+ def load_annotations(self, image_index):
+ result = self.annotations_group[image_index]
+ return result
+
+
+class TestLoadAnnotationsGroup(object):
+ def test_simple(self):
+ input_annotations_group = [
+ np.array([
+ [ 0, 0, 10, 10],
+ [150, 150, 350, 350]
+ ]),
+ ]
+ expected_annotations_group = input_annotations_group
+
+ simple_generator = SimpleGenerator(input_annotations_group)
+ annotations_group = simple_generator.load_annotations_group(simple_generator.groups[0])
+
+ np.testing.assert_equal(expected_annotations_group, annotations_group)
+
+ def test_multiple(self):
+ input_annotations_group = [
+ np.array([
+ [ 0, 0, 10, 10],
+ [150, 150, 350, 350]
+ ]),
+ np.array([
+ [0, 0, 1, 1]
+ ])
+ ]
+ expected_annotations_group = input_annotations_group
+
+ simple_generator = SimpleGenerator(input_annotations_group)
+ annotations_group_0 = simple_generator.load_annotations_group(simple_generator.groups[0])
+ annotations_group_1 = simple_generator.load_annotations_group(simple_generator.groups[1])
+
+ np.testing.assert_equal([expected_annotations_group[0]], annotations_group_0)
+ np.testing.assert_equal([expected_annotations_group[1]], annotations_group_1)
+
+
+class TestFilterAnnotations(object):
+ def test_simple_filter(self):
+ input_annotations_group = [
+ np.array([
+ [ 0, 0, 10, 10],
+ [150, 150, 50, 50]
+ ]),
+ ]
+
+ input_image = np.zeros((500, 500, 3))
+
+ expected_annotations_group = [
+ np.array([
+ [0, 0, 10, 10],
+ ]),
+ ]
+
+ simple_generator = SimpleGenerator(input_annotations_group)
+ annotations_group = simple_generator.load_annotations_group(simple_generator.groups[0])
+ # expect a UserWarning
+ with pytest.warns(UserWarning):
+ image_group, annotations_group = simple_generator.filter_annotations([input_image], annotations_group, simple_generator.groups[0])
+
+ np.testing.assert_equal(expected_annotations_group, annotations_group)
+
+ def test_multiple_filter(self):
+ input_annotations_group = [
+ np.array([
+ [ 0, 0, 10, 10],
+ [150, 150, 50, 50],
+ [150, 150, 350, 350],
+ [350, 350, 150, 150],
+ [ 1, 1, 2, 2],
+ [ 2, 2, 1, 1]
+ ]),
+ np.array([
+ [0, 0, -1, -1]
+ ]),
+ np.array([
+ [-10, -10, 0, 0],
+ [-10, -10, -100, -100],
+ [ 10, 10, 100, 100]
+ ]),
+ np.array([
+ [ 10, 10, 100, 100],
+ [ 10, 10, 600, 600]
+ ]),
+ ]
+
+ input_image = np.zeros((500, 500, 3))
+
+ expected_annotations_group = [
+ np.array([
+ [ 0, 0, 10, 10],
+ [150, 150, 350, 350],
+ [ 1, 1, 2, 2]
+ ]),
+ np.zeros((0, 4)),
+ np.array([
+ [10, 10, 100, 100]
+ ]),
+ np.array([
+ [ 10, 10, 100, 100]
+ ]),
+ ]
+
+ simple_generator = SimpleGenerator(input_annotations_group)
+ # expect a UserWarning
+ annotations_group_0 = simple_generator.load_annotations_group(simple_generator.groups[0])
+ with pytest.warns(UserWarning):
+ image_group, annotations_group_0 = simple_generator.filter_annotations([input_image], annotations_group_0, simple_generator.groups[0])
+
+ annotations_group_1 = simple_generator.load_annotations_group(simple_generator.groups[1])
+ with pytest.warns(UserWarning):
+ image_group, annotations_group_1 = simple_generator.filter_annotations([input_image], annotations_group_1, simple_generator.groups[1])
+
+ annotations_group_2 = simple_generator.load_annotations_group(simple_generator.groups[2])
+ with pytest.warns(UserWarning):
+ image_group, annotations_group_2 = simple_generator.filter_annotations([input_image], annotations_group_2, simple_generator.groups[2])
+
+ np.testing.assert_equal([expected_annotations_group[0]], annotations_group_0)
+ np.testing.assert_equal([expected_annotations_group[1]], annotations_group_1)
+ np.testing.assert_equal([expected_annotations_group[2]], annotations_group_2)
+
+ def test_complete(self):
+ input_annotations_group = [
+ np.array([
+ [ 0, 0, 50, 50, 0], # one object of class 0
+ [150, 150, 50, 50, 1], # one object of class 1 with an invalid box
+ ], dtype=keras.backend.floatx()),
+ ]
+
+ input_image = np.zeros((500, 500, 3), dtype=np.uint8)
+
+ simple_generator = SimpleGenerator(input_annotations_group, image=input_image, num_classes=2)
+ # expect a UserWarning
+ with pytest.warns(UserWarning):
+ _, [_, labels_batch] = simple_generator.next()
+
+ # test that only object with class 0 is present in labels_batch
+ labels = np.unique(np.argmax(labels_batch == 1, axis=2))
+ assert(len(labels) == 1 and labels[0] == 0), 'Expected only class 0 to be present, but got classes {}'.format(labels)
diff --git a/engine/object_detection_branch/retina_net/tests/test_losses.py b/engine/object_detection_branch/retina_net/tests/test_losses.py
new file mode 100644
index 0000000..a60eb4f
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/test_losses.py
@@ -0,0 +1,32 @@
+import numpy as np
+
+import keras
+import keras_retinanet.losses
+import pytest
+
+
+def test_smooth_l1():
+ regression = np.array([
+ [
+ [0, 0, 0, 0],
+ [0, 0, 0, 0],
+ [0, 0, 0, 0],
+ [0, 0, 0, 0],
+ ]
+ ], dtype=keras.backend.floatx())
+ regression = keras.backend.variable(regression)
+
+ regression_target = np.array([
+ [
+ [0, 0, 0, 1, 1],
+ [0, 0, 1, 0, 1],
+ [0, 0, 0.05, 0, 1],
+ [0, 0, 1, 0, 0],
+ ]
+ ], dtype=keras.backend.floatx())
+ regression_target = keras.backend.variable(regression_target)
+
+ loss = keras_retinanet.losses.smooth_l1()(regression_target, regression)
+ loss = keras.backend.eval(loss)
+
+ assert loss == pytest.approx((((1 - 0.5 / 9) * 2 + (0.5 * 9 * 0.05 ** 2)) / 3))
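The expected value in `test_smooth_l1` can be reproduced by hand, assuming the default sigma of 3 used by `keras_retinanet.losses.smooth_l1()`: absolute errors below 1/sigma^2 fall on the quadratic branch, larger ones on the linear branch, and the sum is normalised by the number of positive anchors (three here):

```python
sigma_squared = 3.0 ** 2

def smooth_l1_value(x):
    # quadratic branch for small errors, linear branch otherwise
    x = abs(x)
    if x < 1.0 / sigma_squared:
        return 0.5 * sigma_squared * x ** 2
    return x - 0.5 / sigma_squared

errors = [1.0, 1.0, 0.05]  # the three anchors with anchor state 1 above
print(sum(smooth_l1_value(e) for e in errors) / 3)  # equals the pytest.approx value
```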
diff --git a/engine/object_detection_branch/retina_net/tests/utils/__init__.py b/engine/object_detection_branch/retina_net/tests/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/retina_net/tests/utils/test_transform.py b/engine/object_detection_branch/retina_net/tests/utils/test_transform.py
new file mode 100644
index 0000000..5b73785
--- /dev/null
+++ b/engine/object_detection_branch/retina_net/tests/utils/test_transform.py
@@ -0,0 +1,151 @@
+import numpy as np
+from math import pi
+
+from keras_retinanet.utils.transform import (
+ colvec,
+ transform_aabb,
+ rotation, random_rotation,
+ translation, random_translation,
+ scaling, random_scaling,
+ shear, random_shear,
+ random_flip,
+ random_transform,
+ random_transform_generator,
+ change_transform_origin,
+)
+from numpy.testing import assert_almost_equal
+
+
+def test_colvec():
+ assert np.array_equal(colvec(0), np.array([[0]]))
+ assert np.array_equal(colvec(1, 2, 3), np.array([[1], [2], [3]]))
+ assert np.array_equal(colvec(-1, -2), np.array([[-1], [-2]]))
+
+
+def test_rotation():
+ assert_almost_equal(colvec( 1, 0, 1), rotation(0.0 * pi).dot(colvec(1, 0, 1)))
+ assert_almost_equal(colvec( 0, 1, 1), rotation(0.5 * pi).dot(colvec(1, 0, 1)))
+ assert_almost_equal(colvec(-1, 0, 1), rotation(1.0 * pi).dot(colvec(1, 0, 1)))
+ assert_almost_equal(colvec( 0, -1, 1), rotation(1.5 * pi).dot(colvec(1, 0, 1)))
+ assert_almost_equal(colvec( 1, 0, 1), rotation(2.0 * pi).dot(colvec(1, 0, 1)))
+
+ assert_almost_equal(colvec( 0, 1, 1), rotation(0.0 * pi).dot(colvec(0, 1, 1)))
+ assert_almost_equal(colvec(-1, 0, 1), rotation(0.5 * pi).dot(colvec(0, 1, 1)))
+ assert_almost_equal(colvec( 0, -1, 1), rotation(1.0 * pi).dot(colvec(0, 1, 1)))
+ assert_almost_equal(colvec( 1, 0, 1), rotation(1.5 * pi).dot(colvec(0, 1, 1)))
+ assert_almost_equal(colvec( 0, 1, 1), rotation(2.0 * pi).dot(colvec(0, 1, 1)))
+
+
+def test_random_rotation():
+ prng = np.random.RandomState(0)
+ for i in range(100):
+ assert_almost_equal(1, np.linalg.det(random_rotation(-i, i, prng)))
+
+
+def test_translation():
+ assert_almost_equal(colvec( 1, 2, 1), translation(colvec( 0, 0)).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec( 4, 6, 1), translation(colvec( 3, 4)).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec(-2, -2, 1), translation(colvec(-3, -4)).dot(colvec(1, 2, 1)))
+
+
+def assert_is_translation(transform, min, max):
+ assert transform.shape == (3, 3)
+ assert np.array_equal(transform[:, 0:2], np.eye(3, 2))
+ assert transform[2, 2] == 1
+ assert np.greater_equal(transform[0:2, 2], min).all()
+ assert np.less( transform[0:2, 2], max).all()
+
+
+def test_random_translation():
+ prng = np.random.RandomState(0)
+ min = (-10, -20)
+ max = (20, 10)
+ for i in range(100):
+ assert_is_translation(random_translation(min, max, prng), min, max)
+
+
+def test_shear():
+ assert_almost_equal(colvec( 1, 2, 1), shear(0.0 * pi).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec(-1, 0, 1), shear(0.5 * pi).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec( 1, -2, 1), shear(1.0 * pi).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec( 3, 0, 1), shear(1.5 * pi).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec( 1, 2, 1), shear(2.0 * pi).dot(colvec(1, 2, 1)))
+
+
+def assert_is_shear(transform):
+ assert transform.shape == (3, 3)
+ assert np.array_equal(transform[:, 0], [1, 0, 0])
+ assert np.array_equal(transform[:, 2], [0, 0, 1])
+ assert transform[2, 1] == 0
+ # sin^2 + cos^2 == 1
+ assert_almost_equal(1, transform[0, 1] ** 2 + transform[1, 1] ** 2)
+
+
+def test_random_shear():
+ prng = np.random.RandomState(0)
+ for i in range(100):
+ assert_is_shear(random_shear(-pi, pi, prng))
+
+
+def test_scaling():
+ assert_almost_equal(colvec(1.0, 2, 1), scaling(colvec(1.0, 1.0)).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec(0.0, 2, 1), scaling(colvec(0.0, 1.0)).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec(1.0, 0, 1), scaling(colvec(1.0, 0.0)).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec(0.5, 4, 1), scaling(colvec(0.5, 2.0)).dot(colvec(1, 2, 1)))
+
+
+def assert_is_scaling(transform, min, max):
+ assert transform.shape == (3, 3)
+ assert np.array_equal(transform[2, :], [0, 0, 1])
+ assert np.array_equal(transform[:, 2], [0, 0, 1])
+ assert transform[1, 0] == 0
+ assert transform[0, 1] == 0
+ assert np.greater_equal(np.diagonal(transform)[:2], min).all()
+ assert np.less( np.diagonal(transform)[:2], max).all()
+
+
+def test_random_scaling():
+ prng = np.random.RandomState(0)
+ min = (0.1, 0.2)
+ max = (20, 10)
+ for i in range(100):
+ assert_is_scaling(random_scaling(min, max, prng), min, max)
+
+
+def assert_is_flip(transform):
+ assert transform.shape == (3, 3)
+ assert np.array_equal(transform[2, :], [0, 0, 1])
+ assert np.array_equal(transform[:, 2], [0, 0, 1])
+ assert transform[1, 0] == 0
+ assert transform[0, 1] == 0
+ assert abs(transform[0, 0]) == 1
+ assert abs(transform[1, 1]) == 1
+
+
+def test_random_flip():
+ prng = np.random.RandomState(0)
+ for i in range(100):
+ assert_is_flip(random_flip(0.5, 0.5, prng))
+
+
+def test_random_transform():
+ prng = np.random.RandomState(0)
+ for i in range(100):
+ transform = random_transform(prng=prng)
+ assert np.array_equal(transform, np.identity(3))
+
+ for i, transform in zip(range(100), random_transform_generator(prng=np.random.RandomState())):
+ assert np.array_equal(transform, np.identity(3))
+
+
+def test_transform_aabb():
+ assert np.array_equal([1, 2, 3, 4], transform_aabb(np.identity(3), [1, 2, 3, 4]))
+ assert_almost_equal([-3, -4, -1, -2], transform_aabb(rotation(pi), [1, 2, 3, 4]))
+ assert_almost_equal([ 2, 4, 4, 6], transform_aabb(translation([1, 2]), [1, 2, 3, 4]))
+
+
+def test_change_transform_origin():
+ assert np.array_equal(change_transform_origin(translation([3, 4]), [1, 2]), translation([3, 4]))
+ assert_almost_equal(colvec(1, 2, 1), change_transform_origin(rotation(pi), [1, 2]).dot(colvec(1, 2, 1)))
+ assert_almost_equal(colvec(0, 0, 1), change_transform_origin(rotation(pi), [1, 2]).dot(colvec(2, 4, 1)))
+ assert_almost_equal(colvec(0, 0, 1), change_transform_origin(scaling([0.5, 0.5]), [-2, -4]).dot(colvec(2, 4, 1)))
diff --git a/engine/object_detection_branch/single_shot_detector/.gitattributes b/engine/object_detection_branch/single_shot_detector/.gitattributes
new file mode 100755
index 0000000..f4c7e5f
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/.gitattributes
@@ -0,0 +1 @@
+*.ipynb linguist-language=Python
diff --git a/engine/object_detection_branch/single_shot_detector/.github/stale.yml b/engine/object_detection_branch/single_shot_detector/.github/stale.yml
new file mode 100644
index 0000000..73cb6b9
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/.github/stale.yml
@@ -0,0 +1,24 @@
+# Configuration for probot-stale - https://github.com/probot/stale
+
+# Number of days of inactivity before an Issue or Pull Request becomes stale
+daysUntilStale: 7
+# Number of days of inactivity before a stale Issue or Pull Request is closed
+daysUntilClose: 7
+# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
+exemptLabels:
+ - pinned
+ - security
+ - "[Status] Maybe Later"
+# Label to use when marking as stale
+staleLabel: stale
+# Comment to post when marking as stale. Set to `false` to disable
+markComment: >
+ This issue has been automatically marked as stale because it has not had
+ recent activity. It will be closed if no further activity occurs. Thank you
+ for your contributions.
+# Comment to post when removing the stale label. Set to `false` to disable
+unmarkComment: false
+# Comment to post when closing a stale Issue or Pull Request. Set to `false` to disable
+closeComment: false
+# Limit to only `issues` or `pulls`
+# only: issues
diff --git a/engine/object_detection_branch/single_shot_detector/.gitignore b/engine/object_detection_branch/single_shot_detector/.gitignore
new file mode 100755
index 0000000..9531469
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/.gitignore
@@ -0,0 +1,98 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+.ipynb_checkpoints/
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+# Ignore any files and directories that begin with the word "local"
+local*
diff --git a/engine/object_detection_branch/single_shot_detector/CONTRIBUTING.md b/engine/object_detection_branch/single_shot_detector/CONTRIBUTING.md
new file mode 100755
index 0000000..faec61b
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/CONTRIBUTING.md
@@ -0,0 +1,22 @@
+# Contributing Guidelines
+---
+
+Contributions to this repository are welcome, but before you create a pull request, consider the following guidelines:
+
+1. The To-do list in the README of this repository defines the main topics for which contributions are welcome. If you want to contribute, ideally contribute to one of the topics listed there.
+2. If you'd like to contribute features that are not mentioned on the to-do list in the README, make sure to explain why your proposed change adds value, i.e. what relevant use case it solves. The benefit of any new feature will be compared against the cost of maintaining it and your contribution will be accepted or rejected based on this trade-off.
+3. One pull request should be about one specific feature or improvement, i.e. it should not contain multiple unrelated changes. If you want to contribute multiple features and/or improvements, create a separate pull request for every individual feature or improvement.
+4. When you create a pull request, make sure to explain properly
+    * why your proposed change adds value, i.e. what problem or use case it solves,
+ * all the API changes it will introduce, if any,
+ * all behavioral changes in any existing parts of the project it will introduce, if any.
+5. This should go without saying, but you are responsible for updating any parts of the code or the tutorial notebooks that are affected by your introduced changes.
+6. Any submitted code must conform to the coding standards and style of this repository. There is no formal guide for coding standards and style, but here are a few things to note:
+ * Any new modules, classes or functions must provide proper docstrings unless they are trivial. These docstrings must have sections for Arguments, Returns, and Raises (if applicable). For every argument of a function, the docstring must explain precisely what the argument does, what data type it expects, whether or not it is optional, and any requirements for the range of values it expects. The same goes for the returns. Use existing docstrings as templates.
+ * Naming:
+ * `ClassNames` consist of capitalized words without underscores.
+ * `module_names.py` consist of lower case words connected with underscores.
+ * `function_names` consist of lower case words connected with underscores.
+ * `variable_names` consist of lower case words connected with underscores.
+    * All module, class, function, and variable names must be descriptive in order to meet the goal that all code should always be as self-explanatory as possible. A longer and descriptive name is always preferable to a shorter and non-descriptive name. Abbreviations are generally to be avoided unless the full words would really make the name too long.
+ * More in-line comments are better than fewer in-line comments and all comments should be precise and succinct.
diff --git a/engine/object_detection_branch/single_shot_detector/ISSUE_TEMPLATE.md b/engine/object_detection_branch/single_shot_detector/ISSUE_TEMPLATE.md
new file mode 100755
index 0000000..0da1965
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/ISSUE_TEMPLATE.md
@@ -0,0 +1,29 @@
+### If you open a GitHub issue, here is the policy:
+
+Your issue must be about one of the following:
+
+1. a bug,
+2. a feature request,
+3. a documentation issue, or
+4. a question that is **specific to this SSD implementation**.
+
+You will only get help if you adhere to the following guidelines:
+
+* Before you open an issue, search the open **and closed** issues first. Your problem/question might already have been solved/answered before.
+* If you're getting unexpected behavior from code I wrote, open an issue and I'll try to help. If you're getting unexpected behavior from code **you** wrote, you'll have to fix it yourself. E.g. if you made a ton of changes to the code or the tutorials and now it doesn't work anymore, that's your own problem. I don't want to spend my time debugging your code.
+* Make sure you're using the latest master. If you're 30 commits behind and have a problem, the only answer you'll likely get is to pull the latest master and try again.
+* Read the documentation. All of it. If the answer to your problem/question can be found in the documentation, you might not get an answer, because, seriously, you could really have figured this out yourself.
+* If you're asking a question, it must be specific to this SSD implementation. General deep learning or object detection questions will likely get closed without an answer. E.g. a question like "How do I get the mAP of an SSD for my own dataset?" has nothing to do with this particular SSD implementation, because computing the mAP works the same way for any object detection model. You should ask such a question in an appropriate forum or on the [Data Science section of StackOverflow](https://datascience.stackexchange.com/) instead.
+* If you get an error:
+ * Provide the full stack trace of the error you're getting, not just the error message itself.
+ * Make sure any code you post is properly formatted as such.
+ * Provide any useful information about your environment, e.g.:
+ * Operating System
+ * Which commit of this repository you're on
+ * Keras version
+ * TensorFlow version
+ * Provide a minimal reproducible example, i.e. post code and explain clearly how you ended up with this error.
+ * Provide any useful information about your specific use case and parameters:
+ * What model are you trying to use/train?
+ * Describe the dataset you're using.
+ * List the values of any parameters you changed that might be relevant.
diff --git a/engine/object_detection_branch/single_shot_detector/LICENSE.txt b/engine/object_detection_branch/single_shot_detector/LICENSE.txt
new file mode 100644
index 0000000..0e30368
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/LICENSE.txt
@@ -0,0 +1,176 @@
+Copyright 2018 Pierluigi Ferrari.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
diff --git a/engine/object_detection_branch/single_shot_detector/README.md b/engine/object_detection_branch/single_shot_detector/README.md
new file mode 100755
index 0000000..e04f98d
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/README.md
@@ -0,0 +1,266 @@
+## SSD: Single-Shot MultiBox Detector implementation in Keras
+---
+### Contents
+
+1. [Overview](#overview)
+2. [Performance](#performance)
+3. [Examples](#examples)
+4. [Dependencies](#dependencies)
+5. [How to use it](#how-to-use-it)
+6. [Download the convolutionalized VGG-16 weights](#download-the-convolutionalized-vgg-16-weights)
+7. [Download the original trained model weights](#download-the-original-trained-model-weights)
+8. [How to fine-tune one of the trained models on your own dataset](#how-to-fine-tune-one-of-the-trained-models-on-your-own-dataset)
+9. [ToDo](#todo)
+10. [Important notes](#important-notes)
+11. [Terminology](#terminology)
+
+### Overview
+
+This is a Keras port of the SSD model architecture introduced by Wei Liu et al. in the paper [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325).
+
+Ports of the trained weights of all the original models are provided below. This implementation is accurate, meaning that both the ported weights and models trained from scratch produce the same mAP values as the respective models of the original Caffe implementation (see performance section below).
+
+The main goal of this project is to create an SSD implementation that is well documented for those who are interested in a low-level understanding of the model. The provided tutorials, documentation and detailed comments hopefully make it a bit easier to dig into the code and adapt or build upon the model than with most other implementations out there (Keras or otherwise) that provide little to no documentation and comments.
+
+The repository currently provides the following network architectures:
+* SSD300: [`keras_ssd300.py`](models/keras_ssd300.py)
+* SSD512: [`keras_ssd512.py`](models/keras_ssd512.py)
+* SSD7: [`keras_ssd7.py`](models/keras_ssd7.py) - a smaller 7-layer version that can be trained from scratch relatively quickly even on a mid-tier GPU, yet is capable enough for less complex object detection tasks and testing. You're obviously not going to get state-of-the-art results with that one, but it's fast.
+
+If you would like to use one of the provided trained models for transfer learning (i.e. fine-tune one of the trained models on your own dataset), there is a [Jupyter notebook tutorial](weight_sampling_tutorial.ipynb) that helps you sub-sample the trained weights so that they are compatible with your dataset, see further below.
+
+If you would like to build an SSD with your own base network architecture, you can use [`keras_ssd7.py`](models/keras_ssd7.py) as a template, it provides documentation and comments to help you.
+
+### Performance
+
+Here are the mAP evaluation results of the ported weights and below that the evaluation results of a model trained from scratch using this implementation. All models were evaluated using the official Pascal VOC test server (for 2012 `test`) or the official Pascal VOC Matlab evaluation script (for 2007 `test`). In all cases the results match (or slightly surpass) those of the original Caffe models. Download links to all ported weights are available further below.
+
+**Mean Average Precision**
+
+| model | 07+12<br>(VOC2007 `test`, IoU 0.5) | 07+12+COCO<br>(VOC2007 `test`, IoU 0.5) | 07++12+COCO<br>(VOC2012 `test`, IoU 0.5) |
+|:---:|:---:|:---:|:---:|
+| SSD300 | 77.5 | 81.2 | 79.4 |
+| SSD512 | 79.8 | 83.2 | 82.3 |
+
+Training an SSD300 from scratch to convergence on Pascal VOC 2007 `trainval` and 2012 `trainval` produces the same mAP on Pascal VOC 2007 `test` as the original Caffe SSD300 "07+12" model. You can find a summary of the training [here](training_summaries/ssd300_pascal_07+12_training_summary.md).
+
+**Mean Average Precision**
+
+| model | Original Caffe Model | Ported Weights | Trained from Scratch |
+|:---:|:---:|:---:|:---:|
+| SSD300 "07+12" | 0.772 | 0.775 | 0.771 |
+
+The models achieve the following average number of frames per second (FPS) on Pascal VOC on an NVIDIA GeForce GTX 1070 mobile (i.e. the laptop version) and cuDNN v6. There are two things to note here. First, note that the benchmark prediction speeds of the original Caffe implementation were achieved using a TitanX GPU and cuDNN v4. Second, the paper says they measured the prediction speed at batch size 8, which I think isn't a meaningful way of measuring the speed. The whole point of measuring the speed of a detection model is to know how many individual sequential images the model can process per second, therefore measuring the prediction speed on batches of images and then deducing the time spent on each individual image in the batch defeats the purpose. For the sake of comparability, below you find the prediction speed for the original Caffe SSD implementation and the prediction speed for this implementation under the same conditions, i.e. at batch size 8. In addition you find the prediction speed for this implementation at batch size 1, which in my opinion is the more meaningful number.
+
+**Frames per Second**
+
+| model | Original Caffe Implementation<br>(batch size 8) | This Implementation<br>(batch size 8) | This Implementation<br>(batch size 1) |
+|:---:|:---:|:---:|:---:|
+| SSD300 | 46 | 49 | 39 |
+| SSD512 | 19 | 25 | 20 |
+| SSD7 | n/a | 216 | 127 |
+
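+If you want to reproduce the batch-size-1 numbers above for your own setup, a minimal timing sketch along the following lines should do (the `model` object, the 300x300 input size and the number of timed runs are assumptions, not code from this repository):
+
+```python
+import time
+import numpy as np
+
+def measure_fps(model, img_height=300, img_width=300, n_images=100):
+    # Warm-up call so one-time graph construction and memory allocation are not timed.
+    dummy = np.random.random((1, img_height, img_width, 3)).astype(np.float32)
+    model.predict(dummy)
+
+    start = time.time()
+    for _ in range(n_images):
+        model.predict(dummy)  # one image at a time, i.e. batch size 1
+    elapsed = time.time() - start
+    return n_images / elapsed  # average frames per second
+```
+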
+### Examples
+
+Below are some prediction examples of the fully trained original SSD300 "07+12" model (i.e. trained on Pascal VOC2007 `trainval` and VOC2012 `trainval`). The predictions were made on Pascal VOC2007 `test`.
+
+
+Here are some prediction examples of an SSD7 (i.e. the small 7-layer version) partially trained on two road traffic datasets released by [Udacity](https://github.com/udacity/self-driving-car/tree/master/annotations) with roughly 20,000 images in total and 5 object categories (more info in [`ssd7_training.ipynb`](ssd7_training.ipynb)). The predictions you see below were made after 10,000 training steps at batch size 32. Admittedly, cars are comparatively easy objects to detect and I picked a few of the better examples, but it is nonetheless remarkable what such a small model can do after only 10,000 training iterations.
+
+
+### Dependencies
+
+* Python 3.x
+* Numpy
+* TensorFlow 1.x
+* Keras 2.x
+* OpenCV
+* Beautiful Soup 4.x
+
+The Theano and CNTK backends are currently not supported.
+
+Python 2 compatibility: This implementation seems to work with Python 2.7, but I don't provide any support for it. It's 2018 and nobody should be using Python 2 anymore.
+
+### How to use it
+
+This repository provides Jupyter notebook tutorials that explain training, inference and evaluation, and there are a bunch of explanations in the subsequent sections that complement the notebooks.
+
+How to use a trained model for inference:
+* [`ssd300_inference.ipynb`](ssd300_inference.ipynb)
+* [`ssd512_inference.ipynb`](ssd512_inference.ipynb)
+
+How to train a model:
+* [`ssd300_training.ipynb`](ssd300_training.ipynb)
+* [`ssd7_training.ipynb`](ssd7_training.ipynb)
+
+How to use one of the provided trained models for transfer learning on your own dataset:
+* [Read below](#how-to-fine-tune-one-of-the-trained-models-on-your-own-dataset)
+
+How to evaluate a trained model:
+* In general: [`ssd300_evaluation.ipynb`](ssd300_evaluation.ipynb)
+* On MS COCO: [`ssd300_evaluation_COCO.ipynb`](ssd300_evaluation_COCO.ipynb)
+
+How to use the data generator:
+* The data generator used here has its own repository with a detailed tutorial [here](https://github.com/pierluigiferrari/data_generator_object_detection_2d)
+
+#### Training details
+
+The general training setup is laid out and explained in [`ssd7_training.ipynb`](ssd7_training.ipynb) and in [`ssd300_training.ipynb`](ssd300_training.ipynb). The setup and explanations are similar in both notebooks for the most part, so it doesn't matter which one you look at to understand the general training setup, but the parameters in [`ssd300_training.ipynb`](ssd300_training.ipynb) are preset to copy the setup of the original Caffe implementation for training on Pascal VOC, while the parameters in [`ssd7_training.ipynb`](ssd7_training.ipynb) are preset to train on the [Udacity traffic datasets](https://github.com/udacity/self-driving-car/tree/master/annotations).
+
+To train the original SSD300 model on Pascal VOC:
+
+1. Download the datasets:
+  ```bash
+ wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+ wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+ wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+ ```
+2. Download the weights for the convolutionalized VGG-16 or for one of the trained original models provided below.
+3. Set the file paths for the datasets and model weights accordingly in [`ssd300_training.ipynb`](ssd300_training.ipynb) and execute the cells.
+
+The procedure for training SSD512 is the same, of course. It is imperative that you load the pre-trained VGG-16 weights when attempting to train an SSD300 or SSD512 from scratch; otherwise the training will probably fail. Here is a summary of a full training of the SSD300 "07+12" model for comparison with your own training:
+
+* [SSD300 Pascal VOC "07+12" training summary](training_summaries/ssd300_pascal_07+12_training_summary.md)
+
+#### Encoding and decoding boxes
+
+The [`ssd_encoder_decoder`](ssd_encoder_decoder) sub-package contains all functions and classes related to encoding and decoding boxes. Encoding boxes means converting ground truth labels into the target format that the loss function needs during training. It is this encoding process in which the matching of ground truth boxes to anchor boxes (the paper calls them default boxes and in the original C++ code they are called priors - all the same thing) happens. Decoding boxes means converting raw model output back to the input label format, which entails various conversion and filtering processes such as non-maximum suppression (NMS).
+
+In order to train the model, you need to create an instance of `SSDInputEncoder` that needs to be passed to the data generator. The data generator does the rest, so you don't usually need to call any of `SSDInputEncoder`'s methods manually.
+
+Models can be created in 'training' or 'inference' mode. In 'training' mode, the model outputs the raw prediction tensor that still needs to be post-processed with coordinate conversion, confidence thresholding, non-maximum suppression, etc. The functions `decode_detections()` and `decode_detections_fast()` are responsible for that. The former follows the original Caffe implementation, which entails performing NMS per object class, while the latter performs NMS globally across all object classes and is thus more efficient, but also behaves slightly differently. Read the documentation for details about both functions. If a model is created in 'inference' mode, its last layer is the `DecodeDetections` layer, which performs all the post-processing that `decode_detections()` does, but in TensorFlow. That means the output of the model is already the post-processed output. In order to be trainable, a model must be created in 'training' mode. The trained weights can then later be loaded into a model that was created in 'inference' mode.
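+
+For intuition, the non-maximum suppression step that happens during decoding boils down to the following greedy procedure (a plain-NumPy sketch for illustration only, not the code used in `decode_detections()`):
+
+```python
+import numpy as np
+
+def greedy_nms(boxes, scores, iou_threshold=0.45):
+    '''Illustrative greedy NMS. `boxes` is an (n,4) array in (xmin, ymin, xmax, ymax) format.'''
+    order = np.argsort(scores)[::-1]  # indices of the boxes, highest score first
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)  # the highest-scoring remaining box survives
+        rest = order[1:]
+        # IoU of the kept box with all other remaining boxes.
+        xmin = np.maximum(boxes[i, 0], boxes[rest, 0])
+        ymin = np.maximum(boxes[i, 1], boxes[rest, 1])
+        xmax = np.minimum(boxes[i, 2], boxes[rest, 2])
+        ymax = np.minimum(boxes[i, 3], boxes[rest, 3])
+        inter = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
+        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
+        areas = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
+        iou = inter / (area_i + areas - inter)
+        # Everything that overlaps the kept box too strongly is suppressed.
+        order = rest[iou <= iou_threshold]
+    return keep
+```
+
+`decode_detections()` runs this kind of suppression per object class, as in the original Caffe implementation, whereas `decode_detections_fast()` runs it once across all classes.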
+
+A note on the anchor box offset coordinates used internally by the model: This may or may not be obvious to you, but it is important to understand that it is not possible for the model to predict absolute coordinates for the predicted bounding boxes. In order to be able to predict absolute box coordinates, the convolutional layers responsible for localization would need to produce different output values for the same object instance at different locations within the input image. This isn't possible of course: For a given input to the filter of a convolutional layer, the filter will produce the same output regardless of the spatial position within the image because of the shared weights. This is the reason why the model predicts offsets to anchor boxes instead of absolute coordinates, and why during training, absolute ground truth coordinates are converted to anchor box offsets in the encoding process. The fact that the model predicts offsets to anchor box coordinates is in turn the reason why the model contains anchor box layers that do nothing but output the anchor box coordinates so that the model's output tensor can include those. If the model's output tensor did not contain the anchor box coordinates, the information to convert the predicted offsets back to absolute coordinates would be missing in the model output.
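+
+To make the offset idea concrete, here is a small round-trip example using the common SSD 'centroids' encoding with variance scaling factors (a simplified sketch; the actual encoding and decoding code lives in the `ssd_encoder_decoder` sub-package):
+
+```python
+import numpy as np
+
+# One anchor box and one matched ground truth box, both in (cx, cy, w, h) format.
+anchor = np.array([150.0, 100.0, 60.0, 40.0])
+gt_box = np.array([160.0, 105.0, 80.0, 30.0])
+variances = np.array([0.1, 0.1, 0.2, 0.2])  # the scaling factors SSD typically uses
+
+# Encoding: the offsets the network is trained to predict for this anchor.
+offsets = np.array([(gt_box[0] - anchor[0]) / anchor[2] / variances[0],  # cx offset
+                    (gt_box[1] - anchor[1]) / anchor[3] / variances[1],  # cy offset
+                    np.log(gt_box[2] / anchor[2]) / variances[2],        # log width ratio
+                    np.log(gt_box[3] / anchor[3]) / variances[3]])       # log height ratio
+
+# Decoding: turning predicted offsets back into absolute coordinates. This is only
+# possible because the anchor coordinates are part of the model output.
+decoded = np.array([offsets[0] * variances[0] * anchor[2] + anchor[0],
+                    offsets[1] * variances[1] * anchor[3] + anchor[1],
+                    np.exp(offsets[2] * variances[2]) * anchor[2],
+                    np.exp(offsets[3] * variances[3]) * anchor[3]])
+
+assert np.allclose(decoded, gt_box)  # the round trip recovers the ground truth box
+```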
+
+#### Using a different base network architecture
+
+If you want to build a different base network architecture, you could use [`keras_ssd7.py`](models/keras_ssd7.py) as a template. It provides documentation and comments to help you turn it into a different base network. Put together the base network you want and add a predictor layer on top of each network layer from which you would like to make predictions. Create two predictor heads for each, one for localization, one for classification. Create an anchor box layer for each predictor layer and set the respective localization head's output as the input for the anchor box layer. The structure of all tensor reshaping and concatenation operations remains the same, you just have to make sure to include all of your predictor and anchor box layers of course.
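+
+As a rough illustration of that recipe (a sketch only: `feature_map`, `n_boxes` and `n_classes` are placeholders, and the anchor box layer is only indicated in a comment), the predictor pair on top of one feature map might look like this:
+
+```python
+from keras.layers import Conv2D, Reshape
+
+n_boxes   = 4    # anchor boxes per spatial cell of this feature map
+n_classes = 21   # e.g. 20 Pascal VOC classes plus one background class
+
+def add_predictor_heads(feature_map):
+    # Classification head: one confidence value per class per anchor box.
+    class_conf = Conv2D(n_boxes * n_classes, (3, 3), padding='same')(feature_map)
+    class_conf = Reshape((-1, n_classes))(class_conf)
+
+    # Localization head: four box offsets per anchor box.
+    box_offsets = Conv2D(n_boxes * 4, (3, 3), padding='same')(feature_map)
+    box_offsets = Reshape((-1, 4))(box_offsets)
+
+    # In this repository the localization head additionally feeds an anchor box
+    # layer that outputs the anchor coordinates, so that they can be concatenated
+    # into the model output (see keras_ssd7.py for the real thing).
+    return class_conf, box_offsets
+```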
+
+### Download the convolutionalized VGG-16 weights
+
+In order to train an SSD300 or SSD512 from scratch, download the weights of the fully convolutionalized VGG-16 model trained to convergence on ImageNet classification here:
+
+[`VGG_ILSVRC_16_layers_fc_reduced.h5`](https://drive.google.com/open?id=1sBmajn6vOE7qJ8GnxUJt4fGPuffVUZox).
+
+As with all other weights files below, this is a direct port of the corresponding `.caffemodel` file that is provided in the repository of the original Caffe implementation.
+
+### Download the original trained model weights
+
+Here are the ported weights for all the original trained models. The filenames correspond to their respective `.caffemodel` counterparts. The asterisks and footnotes refer to those in the README of the [original Caffe implementation](https://github.com/weiliu89/caffe/tree/ssd#models).
+
+1. PASCAL VOC models:
+
+ * 07+12: [SSD300*](https://drive.google.com/open?id=121-kCXaOHOkJE_Kf5lKcJvC_5q1fYb_q), [SSD512*](https://drive.google.com/open?id=19NIa0baRCFYT3iRxQkOKCD7CpN6BFO8p)
+ * 07++12: [SSD300*](https://drive.google.com/open?id=1M99knPZ4DpY9tI60iZqxXsAxX2bYWDvZ), [SSD512*](https://drive.google.com/open?id=18nFnqv9fG5Rh_fx6vUtOoQHOLySt4fEx)
+ * COCO[1]: [SSD300*](https://drive.google.com/open?id=17G1J4zEpFwiOzgBmq886ci4P3YaIz8bY), [SSD512*](https://drive.google.com/open?id=1wGc368WyXSHZOv4iow2tri9LnB0vm9X-)
+ * 07+12+COCO: [SSD300*](https://drive.google.com/open?id=1vtNI6kSnv7fkozl7WxyhGyReB6JvDM41), [SSD512*](https://drive.google.com/open?id=14mELuzm0OvXnwjb0mzAiG-Ake9_NP_LQ)
+ * 07++12+COCO: [SSD300*](https://drive.google.com/open?id=1fyDDUcIOSjeiP08vl1WCndcFdtboFXua), [SSD512*](https://drive.google.com/open?id=1a-64b6y6xsQr5puUsHX_wxI1orQDercM)
+
+
+2. COCO models:
+
+ * trainval35k: [SSD300*](https://drive.google.com/open?id=1vmEF7FUsWfHquXyCqO17UaXOPpRbwsdj), [SSD512*](https://drive.google.com/open?id=1IJWZKmjkcFMlvaz2gYukzFx4d6mH3py5)
+
+
+3. ILSVRC models:
+
+ * trainval1: [SSD300*](https://drive.google.com/open?id=1VWkj1oQS2RUhyJXckx3OaDYs5fx2mMCq), [SSD500](https://drive.google.com/open?id=1LcBPsd9CJbuBw4KiSuE1o1fMA-Pz2Zvw)
+
+### How to fine-tune one of the trained models on your own dataset
+
+If you want to fine-tune one of the provided trained models on your own dataset, chances are your dataset doesn't have the same number of classes as the trained model. The following tutorial explains how to deal with this problem:
+
+[`weight_sampling_tutorial.ipynb`](weight_sampling_tutorial.ipynb)
+
+### ToDo
+
+The following things are on the to-do list, ranked by priority. Contributions are welcome, but please read the [contributing guidelines](CONTRIBUTING.md).
+
+1. Add model definitions and trained weights for SSDs based on other base networks such as MobileNet, InceptionResNetV2, or DenseNet.
+2. Add support for the Theano and CNTK backends. Requires porting the custom layers and the loss function from TensorFlow to the abstract Keras backend.
+
+Currently in the works:
+
+* A new [Focal Loss](https://arxiv.org/abs/1708.02002) loss function.
+
+### Important notes
+
+* All trained models that were trained on MS COCO use the smaller anchor box scaling factors provided in all of the Jupyter notebooks. In particular, note that the '07+12+COCO' and '07++12+COCO' models use the smaller scaling factors.
+
+### Terminology
+
+* "Anchor boxes": The paper calls them "default boxes", in the original C++ code they are called "prior boxes" or "priors", and the Faster R-CNN paper calls them "anchor boxes". All terms mean the same thing, but I slightly prefer the name "anchor boxes" because I find it to be the most descriptive of these names. I call them "prior boxes" or "priors" in `keras_ssd300.py` and `keras_ssd512.py` to stay consistent with the original Caffe implementation, but everywhere else I use the name "anchor boxes" or "anchors".
+* "Labels": For the purpose of this project, datasets consist of "images" and "labels". Everything that belongs to the annotations of a given image is the "labels" of that image: Not just object category labels, but also bounding box coordinates. "Labels" is just shorter than "annotations". I also use the terms "labels" and "targets" more or less interchangeably throughout the documentation, although "targets" means labels specifically in the context of training.
+* "Predictor layer": The "predictor layers" or "predictors" are all the last convolution layers of the network, i.e. all convolution layers that do not feed into any subsequent convolution layers.
diff --git a/engine/object_detection_branch/single_shot_detector/__init__.py b/engine/object_detection_branch/single_shot_detector/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/bounding_box_utils/__init__.py b/engine/object_detection_branch/single_shot_detector/bounding_box_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/bounding_box_utils/bounding_box_utils.py b/engine/object_detection_branch/single_shot_detector/bounding_box_utils/bounding_box_utils.py
new file mode 100644
index 0000000..36ce3dc
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/bounding_box_utils/bounding_box_utils.py
@@ -0,0 +1,383 @@
+'''
+Includes:
+* Function to compute the IoU similarity for axis-aligned, rectangular, 2D bounding boxes
+* Function for coordinate conversion for axis-aligned, rectangular, 2D bounding boxes
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+def convert_coordinates(tensor, start_index, conversion, border_pixels='half'):
+ '''
+ Convert coordinates for axis-aligned 2D boxes between two coordinate formats.
+
+ Creates a copy of `tensor`, i.e. does not operate in place. Currently there are
+ three supported coordinate formats that can be converted from and to each other:
+ 1) (xmin, xmax, ymin, ymax) - the 'minmax' format
+ 2) (xmin, ymin, xmax, ymax) - the 'corners' format
+        3) (cx, cy, w, h) - the 'centroids' format
+
+ Arguments:
+ tensor (array): A Numpy nD array containing the four consecutive coordinates
+ to be converted somewhere in the last axis.
+ start_index (int): The index of the first coordinate in the last axis of `tensor`.
+ conversion (str, optional): The conversion direction. Can be 'minmax2centroids',
+ 'centroids2minmax', 'corners2centroids', 'centroids2corners', 'minmax2corners',
+ or 'corners2minmax'.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+            to the boxes, but not the other.
+
+ Returns:
+ A Numpy nD array, a copy of the input tensor with the converted coordinates
+ in place of the original coordinates and the unaltered elements of the original
+ tensor elsewhere.
+ '''
+ if border_pixels == 'half':
+ d = 0
+ elif border_pixels == 'include':
+ d = 1
+ elif border_pixels == 'exclude':
+ d = -1
+
+ ind = start_index
+ tensor1 = np.copy(tensor).astype(np.float)
+ if conversion == 'minmax2centroids':
+ tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+1]) / 2.0 # Set cx
+ tensor1[..., ind+1] = (tensor[..., ind+2] + tensor[..., ind+3]) / 2.0 # Set cy
+ tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind] + d # Set w
+ tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+2] + d # Set h
+ elif conversion == 'centroids2minmax':
+ tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin
+ tensor1[..., ind+1] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax
+ tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin
+ tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax
+ elif conversion == 'corners2centroids':
+ tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+2]) / 2.0 # Set cx
+ tensor1[..., ind+1] = (tensor[..., ind+1] + tensor[..., ind+3]) / 2.0 # Set cy
+ tensor1[..., ind+2] = tensor[..., ind+2] - tensor[..., ind] + d # Set w
+ tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+1] + d # Set h
+ elif conversion == 'centroids2corners':
+ tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin
+ tensor1[..., ind+1] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin
+ tensor1[..., ind+2] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax
+ tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax
+ elif (conversion == 'minmax2corners') or (conversion == 'corners2minmax'):
+ tensor1[..., ind+1] = tensor[..., ind+2]
+ tensor1[..., ind+2] = tensor[..., ind+1]
+ else:
+ raise ValueError("Unexpected conversion value. Supported values are 'minmax2centroids', 'centroids2minmax', 'corners2centroids', 'centroids2corners', 'minmax2corners', and 'corners2minmax'.")
+
+ return tensor1
+
+def convert_coordinates2(tensor, start_index, conversion):
+ '''
+ A matrix multiplication implementation of `convert_coordinates()`.
+ Supports only conversion between the 'centroids' and 'minmax' formats.
+
+ This function is marginally slower on average than `convert_coordinates()`,
+ probably because it involves more (unnecessary) arithmetic operations (unnecessary
+ because the two matrices are sparse).
+
+ For details please refer to the documentation of `convert_coordinates()`.
+ '''
+ ind = start_index
+ tensor1 = np.copy(tensor).astype(np.float)
+ if conversion == 'minmax2centroids':
+ M = np.array([[0.5, 0. , -1., 0.],
+ [0.5, 0. , 1., 0.],
+ [0. , 0.5, 0., -1.],
+ [0. , 0.5, 0., 1.]])
+ tensor1[..., ind:ind+4] = np.dot(tensor1[..., ind:ind+4], M)
+ elif conversion == 'centroids2minmax':
+ M = np.array([[ 1. , 1. , 0. , 0. ],
+ [ 0. , 0. , 1. , 1. ],
+ [-0.5, 0.5, 0. , 0. ],
+ [ 0. , 0. , -0.5, 0.5]]) # The multiplicative inverse of the matrix above
+ tensor1[..., ind:ind+4] = np.dot(tensor1[..., ind:ind+4], M)
+ else:
+ raise ValueError("Unexpected conversion value. Supported values are 'minmax2centroids' and 'centroids2minmax'.")
+
+ return tensor1
+
+def intersection_area(boxes1, boxes2, coords='centroids', mode='outer_product', border_pixels='half'):
+ '''
+ Computes the intersection areas of two sets of axis-aligned 2D rectangular boxes.
+
+ Let `boxes1` and `boxes2` contain `m` and `n` boxes, respectively.
+
+ In 'outer_product' mode, returns an `(m,n)` matrix with the intersection areas for all possible
+ combinations of the boxes in `boxes1` and `boxes2`.
+
+ In 'element-wise' mode, `m` and `n` must be broadcast-compatible. Refer to the explanation
+ of the `mode` argument for details.
+
+ Arguments:
+ boxes1 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the
+ format specified by `coords` or a 2D Numpy array of shape `(m, 4)` containing the coordinates for `m` boxes.
+ If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes2`.
+ boxes2 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the
+ format specified by `coords` or a 2D Numpy array of shape `(n, 4)` containing the coordinates for `n` boxes.
+ If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes1`.
+ coords (str, optional): The coordinate format in the input arrays. Can be either 'centroids' for the format
+ `(cx, cy, w, h)`, 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format
+ `(xmin, ymin, xmax, ymax)`.
+ mode (str, optional): Can be one of 'outer_product' and 'element-wise'. In 'outer_product' mode, returns an
+ `(m,n)` matrix with the intersection areas for all possible combinations of the `m` boxes in `boxes1` with the
+ `n` boxes in `boxes2`. In 'element-wise' mode, returns a 1D array and the shapes of `boxes1` and `boxes2`
+            must be broadcast-compatible. If both `boxes1` and `boxes2` have `m` boxes, then this returns an array of
+ length `m` where the i-th position contains the intersection area of `boxes1[i]` with `boxes2[i]`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+            to the boxes, but not the other.
+
+ Returns:
+ A 1D or 2D Numpy array (refer to the `mode` argument for details) of dtype float containing values with
+ the intersection areas of the boxes in `boxes1` and `boxes2`.
+ '''
+
+ # Make sure the boxes have the right shapes.
+ if boxes1.ndim > 2: raise ValueError("boxes1 must have rank either 1 or 2, but has rank {}.".format(boxes1.ndim))
+ if boxes2.ndim > 2: raise ValueError("boxes2 must have rank either 1 or 2, but has rank {}.".format(boxes2.ndim))
+
+ if boxes1.ndim == 1: boxes1 = np.expand_dims(boxes1, axis=0)
+ if boxes2.ndim == 1: boxes2 = np.expand_dims(boxes2, axis=0)
+
+ if not (boxes1.shape[1] == boxes2.shape[1] == 4): raise ValueError("All boxes must consist of 4 coordinates, but the boxes in `boxes1` and `boxes2` have {} and {} coordinates, respectively.".format(boxes1.shape[1], boxes2.shape[1]))
+    if not mode in {'outer_product', 'element-wise'}: raise ValueError("`mode` must be one of 'outer_product' and 'element-wise', but got '{}'.".format(mode))
+
+ # Convert the coordinates if necessary.
+ if coords == 'centroids':
+ boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2corners')
+ boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2corners')
+ coords = 'corners'
+ elif not (coords in {'minmax', 'corners'}):
+ raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
+
+ m = boxes1.shape[0] # The number of boxes in `boxes1`
+ n = boxes2.shape[0] # The number of boxes in `boxes2`
+
+ # Set the correct coordinate indices for the respective formats.
+ if coords == 'corners':
+ xmin = 0
+ ymin = 1
+ xmax = 2
+ ymax = 3
+ elif coords == 'minmax':
+ xmin = 0
+ xmax = 1
+ ymin = 2
+ ymax = 3
+
+ if border_pixels == 'half':
+ d = 0
+ elif border_pixels == 'include':
+ d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`.
+ elif border_pixels == 'exclude':
+ d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`.
+
+ # Compute the intersection areas.
+
+ if mode == 'outer_product':
+
+ # For all possible box combinations, get the greater xmin and ymin values.
+ # This is a tensor of shape (m,n,2).
+ min_xy = np.maximum(np.tile(np.expand_dims(boxes1[:,[xmin,ymin]], axis=1), reps=(1, n, 1)),
+ np.tile(np.expand_dims(boxes2[:,[xmin,ymin]], axis=0), reps=(m, 1, 1)))
+
+ # For all possible box combinations, get the smaller xmax and ymax values.
+ # This is a tensor of shape (m,n,2).
+ max_xy = np.minimum(np.tile(np.expand_dims(boxes1[:,[xmax,ymax]], axis=1), reps=(1, n, 1)),
+ np.tile(np.expand_dims(boxes2[:,[xmax,ymax]], axis=0), reps=(m, 1, 1)))
+
+ # Compute the side lengths of the intersection rectangles.
+ side_lengths = np.maximum(0, max_xy - min_xy + d)
+
+ return side_lengths[:,:,0] * side_lengths[:,:,1]
+
+ elif mode == 'element-wise':
+
+ min_xy = np.maximum(boxes1[:,[xmin,ymin]], boxes2[:,[xmin,ymin]])
+ max_xy = np.minimum(boxes1[:,[xmax,ymax]], boxes2[:,[xmax,ymax]])
+
+ # Compute the side lengths of the intersection rectangles.
+ side_lengths = np.maximum(0, max_xy - min_xy + d)
+
+ return side_lengths[:,0] * side_lengths[:,1]
+
+def intersection_area_(boxes1, boxes2, coords='corners', mode='outer_product', border_pixels='half'):
+ '''
+ The same as 'intersection_area()' but for internal use, i.e. without all the safety checks.
+ '''
+
+ m = boxes1.shape[0] # The number of boxes in `boxes1`
+ n = boxes2.shape[0] # The number of boxes in `boxes2`
+
+ # Set the correct coordinate indices for the respective formats.
+ if coords == 'corners':
+ xmin = 0
+ ymin = 1
+ xmax = 2
+ ymax = 3
+ elif coords == 'minmax':
+ xmin = 0
+ xmax = 1
+ ymin = 2
+ ymax = 3
+
+ if border_pixels == 'half':
+ d = 0
+ elif border_pixels == 'include':
+ d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`.
+ elif border_pixels == 'exclude':
+ d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`.
+
+ # Compute the intersection areas.
+
+ if mode == 'outer_product':
+
+ # For all possible box combinations, get the greater xmin and ymin values.
+ # This is a tensor of shape (m,n,2).
+ min_xy = np.maximum(np.tile(np.expand_dims(boxes1[:,[xmin,ymin]], axis=1), reps=(1, n, 1)),
+ np.tile(np.expand_dims(boxes2[:,[xmin,ymin]], axis=0), reps=(m, 1, 1)))
+
+ # For all possible box combinations, get the smaller xmax and ymax values.
+ # This is a tensor of shape (m,n,2).
+ max_xy = np.minimum(np.tile(np.expand_dims(boxes1[:,[xmax,ymax]], axis=1), reps=(1, n, 1)),
+ np.tile(np.expand_dims(boxes2[:,[xmax,ymax]], axis=0), reps=(m, 1, 1)))
+
+ # Compute the side lengths of the intersection rectangles.
+ side_lengths = np.maximum(0, max_xy - min_xy + d)
+
+ return side_lengths[:,:,0] * side_lengths[:,:,1]
+
+ elif mode == 'element-wise':
+
+ min_xy = np.maximum(boxes1[:,[xmin,ymin]], boxes2[:,[xmin,ymin]])
+ max_xy = np.minimum(boxes1[:,[xmax,ymax]], boxes2[:,[xmax,ymax]])
+
+ # Compute the side lengths of the intersection rectangles.
+ side_lengths = np.maximum(0, max_xy - min_xy + d)
+
+ return side_lengths[:,0] * side_lengths[:,1]
+
+
+def iou(boxes1, boxes2, coords='centroids', mode='outer_product', border_pixels='half'):
+ '''
+ Computes the intersection-over-union similarity (also known as Jaccard similarity)
+ of two sets of axis-aligned 2D rectangular boxes.
+
+ Let `boxes1` and `boxes2` contain `m` and `n` boxes, respectively.
+
+ In 'outer_product' mode, returns an `(m,n)` matrix with the IoUs for all possible
+ combinations of the boxes in `boxes1` and `boxes2`.
+
+ In 'element-wise' mode, `m` and `n` must be broadcast-compatible. Refer to the explanation
+ of the `mode` argument for details.
+
+ Arguments:
+ boxes1 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the
+ format specified by `coords` or a 2D Numpy array of shape `(m, 4)` containing the coordinates for `m` boxes.
+ If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes2`.
+ boxes2 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the
+ format specified by `coords` or a 2D Numpy array of shape `(n, 4)` containing the coordinates for `n` boxes.
+ If `mode` is set to 'element_wise', the shape must be broadcast-compatible with `boxes1`.
+ coords (str, optional): The coordinate format in the input arrays. Can be either 'centroids' for the format
+ `(cx, cy, w, h)`, 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format
+ `(xmin, ymin, xmax, ymax)`.
+ mode (str, optional): Can be one of 'outer_product' and 'element-wise'. In 'outer_product' mode, returns an
+ `(m,n)` matrix with the IoU overlaps for all possible combinations of the `m` boxes in `boxes1` with the
+ `n` boxes in `boxes2`. In 'element-wise' mode, returns a 1D array and the shapes of `boxes1` and `boxes2`
+            must be broadcast-compatible. If both `boxes1` and `boxes2` have `m` boxes, then this returns an array of
+ length `m` where the i-th position contains the IoU overlap of `boxes1[i]` with `boxes2[i]`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+            to the boxes, but not the other.
+
+ Returns:
+ A 1D or 2D Numpy array (refer to the `mode` argument for details) of dtype float containing values in [0,1],
+ the Jaccard similarity of the boxes in `boxes1` and `boxes2`. 0 means there is no overlap between two given
+ boxes, 1 means their coordinates are identical.
+ '''
+
+ # Make sure the boxes have the right shapes.
+ if boxes1.ndim > 2: raise ValueError("boxes1 must have rank either 1 or 2, but has rank {}.".format(boxes1.ndim))
+ if boxes2.ndim > 2: raise ValueError("boxes2 must have rank either 1 or 2, but has rank {}.".format(boxes2.ndim))
+
+ if boxes1.ndim == 1: boxes1 = np.expand_dims(boxes1, axis=0)
+ if boxes2.ndim == 1: boxes2 = np.expand_dims(boxes2, axis=0)
+
+ if not (boxes1.shape[1] == boxes2.shape[1] == 4): raise ValueError("All boxes must consist of 4 coordinates, but the boxes in `boxes1` and `boxes2` have {} and {} coordinates, respectively.".format(boxes1.shape[1], boxes2.shape[1]))
+ if not mode in {'outer_product', 'element-wise'}: raise ValueError("`mode` must be one of 'outer_product' and 'element-wise', but got '{}'.".format(mode))
+
+ # Convert the coordinates if necessary.
+ if coords == 'centroids':
+ boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2corners')
+ boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2corners')
+ coords = 'corners'
+ elif not (coords in {'minmax', 'corners'}):
+ raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
+
+ # Compute the IoU.
+
+    # Compute the intersection areas.
+
+ intersection_areas = intersection_area_(boxes1, boxes2, coords=coords, mode=mode)
+
+ m = boxes1.shape[0] # The number of boxes in `boxes1`
+ n = boxes2.shape[0] # The number of boxes in `boxes2`
+
+ # Compute the union areas.
+
+ # Set the correct coordinate indices for the respective formats.
+ if coords == 'corners':
+ xmin = 0
+ ymin = 1
+ xmax = 2
+ ymax = 3
+ elif coords == 'minmax':
+ xmin = 0
+ xmax = 1
+ ymin = 2
+ ymax = 3
+
+ if border_pixels == 'half':
+ d = 0
+ elif border_pixels == 'include':
+ d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`.
+ elif border_pixels == 'exclude':
+ d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`.
+
+ if mode == 'outer_product':
+
+ boxes1_areas = np.tile(np.expand_dims((boxes1[:,xmax] - boxes1[:,xmin] + d) * (boxes1[:,ymax] - boxes1[:,ymin] + d), axis=1), reps=(1,n))
+ boxes2_areas = np.tile(np.expand_dims((boxes2[:,xmax] - boxes2[:,xmin] + d) * (boxes2[:,ymax] - boxes2[:,ymin] + d), axis=0), reps=(m,1))
+
+ elif mode == 'element-wise':
+
+ boxes1_areas = (boxes1[:,xmax] - boxes1[:,xmin] + d) * (boxes1[:,ymax] - boxes1[:,ymin] + d)
+ boxes2_areas = (boxes2[:,xmax] - boxes2[:,xmin] + d) * (boxes2[:,ymax] - boxes2[:,ymin] + d)
+
+ union_areas = boxes1_areas + boxes2_areas - intersection_areas
+
+ return intersection_areas / union_areas
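+
+
+# Illustrative usage of the utilities above; the boxes are arbitrary example values
+# and this block is not part of the library API.
+if __name__ == '__main__':
+    box_centroids = np.array([50.0, 50.0, 20.0, 10.0])  # (cx, cy, w, h)
+    box_corners = convert_coordinates(box_centroids, start_index=0, conversion='centroids2corners')
+    print(box_corners)  # [40. 45. 60. 55.] -> (xmin, ymin, xmax, ymax)
+
+    # The IoU of a box with itself is 1, the IoU of two disjoint boxes is 0.
+    print(iou(box_centroids, box_centroids, coords='centroids'))                          # [[1.]]
+    print(iou(box_centroids, np.array([200.0, 200.0, 20.0, 10.0]), coords='centroids'))   # [[0.]]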
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/__init__.py b/engine/object_detection_branch/single_shot_detector/data_generator/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_constant_input_size.py b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_constant_input_size.py
new file mode 100644
index 0000000..2c18a98
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_constant_input_size.py
@@ -0,0 +1,183 @@
+'''
+The data augmentation operations of the original SSD implementation.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation
+from data_generator.object_detection_2d_geometric_ops import RandomFlip, RandomTranslate, RandomScale
+from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator
+
+class DataAugmentationConstantInputSize:
+ '''
+ Applies a chain of photometric and geometric image transformations. For documentation, please refer
+ to the documentation of the individual transformations involved.
+
+ Important: This augmentation chain is suitable for constant-size images only.
+ '''
+
+ def __init__(self,
+ random_brightness=(-48, 48, 0.5),
+ random_contrast=(0.5, 1.8, 0.5),
+ random_saturation=(0.5, 1.8, 0.5),
+ random_hue=(18, 0.5),
+ random_flip=0.5,
+ random_translate=((0.03,0.5), (0.03,0.5), 0.5),
+ random_scale=(0.5, 2.0, 0.5),
+ n_trials_max=3,
+ clip_boxes=True,
+ overlap_criterion='area',
+ bounds_box_filter=(0.3, 1.0),
+ bounds_validator=(0.5, 1.0),
+ n_boxes_min=1,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+
+ if (random_scale[0] >= 1) or (random_scale[1] <= 1):
+ raise ValueError("This sequence of transformations only makes sense if the minimum scaling factor is <1 and the maximum scaling factor is >1.")
+
+ self.n_trials_max = n_trials_max
+ self.clip_boxes = clip_boxes
+ self.overlap_criterion = overlap_criterion
+ self.bounds_box_filter = bounds_box_filter
+ self.bounds_validator = bounds_validator
+ self.n_boxes_min = n_boxes_min
+ self.background = background
+ self.labels_format = labels_format
+
+ # Determines which boxes are kept in an image after the transformations have been applied.
+ self.box_filter = BoxFilter(check_overlap=True,
+ check_min_area=True,
+ check_degenerate=True,
+ overlap_criterion=self.overlap_criterion,
+ overlap_bounds=self.bounds_box_filter,
+ min_area=16,
+ labels_format=self.labels_format)
+
+ # Determines whether the result of the transformations is a valid training image.
+ self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion,
+ bounds=self.bounds_validator,
+ n_boxes_min=self.n_boxes_min,
+ labels_format=self.labels_format)
+
+ # Utility distortions
+ self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV')
+ self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB')
+ self.convert_to_float32 = ConvertDataType(to='float32')
+ self.convert_to_uint8 = ConvertDataType(to='uint8')
+ self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels.
+
+ # Photometric transformations
+ self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2])
+ self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2])
+ self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2])
+ self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1])
+
+ # Geometric transformations
+ self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format)
+ self.random_translate = RandomTranslate(dy_minmax=random_translate[0],
+ dx_minmax=random_translate[1],
+ prob=random_translate[2],
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ image_validator=self.image_validator,
+ n_trials_max=self.n_trials_max,
+ background=self.background,
+ labels_format=self.labels_format)
+ self.random_zoom_in = RandomScale(min_factor=1.0,
+ max_factor=random_scale[1],
+ prob=random_scale[2],
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ image_validator=self.image_validator,
+ n_trials_max=self.n_trials_max,
+ background=self.background,
+ labels_format=self.labels_format)
+ self.random_zoom_out = RandomScale(min_factor=random_scale[0],
+ max_factor=1.0,
+ prob=random_scale[2],
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ image_validator=self.image_validator,
+ n_trials_max=self.n_trials_max,
+ background=self.background,
+ labels_format=self.labels_format)
+
+ # If we zoom in, do translation before scaling.
+ self.sequence1 = [self.convert_to_3_channels,
+ self.convert_to_float32,
+ self.random_brightness,
+ self.random_contrast,
+ self.convert_to_uint8,
+ self.convert_RGB_to_HSV,
+ self.convert_to_float32,
+ self.random_saturation,
+ self.random_hue,
+ self.convert_to_uint8,
+ self.convert_HSV_to_RGB,
+ self.random_translate,
+ self.random_zoom_in,
+ self.random_flip]
+
+ # If we zoom out, do scaling before translation.
+ self.sequence2 = [self.convert_to_3_channels,
+ self.convert_to_float32,
+ self.random_brightness,
+ self.convert_to_uint8,
+ self.convert_RGB_to_HSV,
+ self.convert_to_float32,
+ self.random_saturation,
+ self.random_hue,
+ self.convert_to_uint8,
+ self.convert_HSV_to_RGB,
+ self.convert_to_float32,
+ self.random_contrast,
+ self.convert_to_uint8,
+ self.random_zoom_out,
+ self.random_translate,
+ self.random_flip]
+
+ def __call__(self, image, labels=None):
+
+ self.random_translate.labels_format = self.labels_format
+ self.random_zoom_in.labels_format = self.labels_format
+ self.random_zoom_out.labels_format = self.labels_format
+ self.random_flip.labels_format = self.labels_format
+
+ # Choose sequence 1 with probability 0.5.
+ if np.random.choice(2):
+
+ if not (labels is None):
+ for transform in self.sequence1:
+ image, labels = transform(image, labels)
+ return image, labels
+ else:
+ for transform in self.sequence1:
+ image = transform(image)
+ return image
+ # Choose sequence 2 with probability 0.5.
+ else:
+
+ if not (labels is None):
+ for transform in self.sequence2:
+ image, labels = transform(image, labels)
+ return image, labels
+ else:
+ for transform in self.sequence2:
+ image = transform(image)
+ return image
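+
+# Illustrative usage sketch: a minimal example of running this chain on a single
+# image together with its ground truth boxes. The image path and box values below
+# are hypothetical.
+#
+#   import numpy as np
+#   from PIL import Image
+#
+#   augmenter = DataAugmentationConstantInputSize()
+#   image = np.array(Image.open('some_image.jpg'))       # a constant-size RGB image
+#   labels = np.array([[1, 20, 30, 120, 180]])           # (class_id, xmin, ymin, xmax, ymax)
+#   aug_image, aug_labels = augmenter(image, labels)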
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_original_ssd.py b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_original_ssd.py
new file mode 100644
index 0000000..af8d498
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_original_ssd.py
@@ -0,0 +1,280 @@
+'''
+The data augmentation operations of the original SSD implementation.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+import cv2
+import inspect
+
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation, RandomChannelSwap
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch, RandomPatchInf
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_geometric_ops import ResizeRandomInterp, RandomFlip
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator
+
+class SSDRandomCrop:
+ '''
+ Performs the same random crops as defined by the `batch_sampler` instructions
+ of the original Caffe implementation of SSD. A description of this random cropping
+ strategy can also be found in the data augmentation section of the paper:
+ https://arxiv.org/abs/1512.02325
+ '''
+
+ def __init__(self, labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ self.labels_format = labels_format
+
+ # This randomly samples one of the lower IoU bounds defined
+ # by the `sample_space` every time it is called.
+ self.bound_generator = BoundGenerator(sample_space=((None, None),
+ (0.1, None),
+ (0.3, None),
+ (0.5, None),
+ (0.7, None),
+ (0.9, None)),
+ weights=None)
+
+ # Produces coordinates for candidate patches such that the height
+ # and width of the patches are between 0.3 and 1.0 of the height
+ # and width of the respective image and the aspect ratio of the
+ # patches is between 0.5 and 2.0.
+ self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w',
+ min_scale=0.3,
+ max_scale=1.0,
+ scale_uniformly=False,
+ min_aspect_ratio = 0.5,
+ max_aspect_ratio = 2.0)
+
+ # Filters out boxes whose center point does not lie within the
+ # chosen patches.
+ self.box_filter = BoxFilter(check_overlap=True,
+ check_min_area=False,
+ check_degenerate=False,
+ overlap_criterion='center_point',
+ labels_format=self.labels_format)
+
+ # Determines whether a given patch is considered a valid patch.
+ # Defines a patch to be valid if at least one ground truth bounding box
+ # (n_boxes_min == 1) has an IoU overlap with the patch that
+ # meets the requirements defined by `bound_generator`.
+ self.image_validator = ImageValidator(overlap_criterion='iou',
+ n_boxes_min=1,
+ labels_format=self.labels_format,
+ border_pixels='half')
+
+ # Performs crops according to the parameters set in the objects above.
+ # Runs until either a valid patch is found or the original input image
+ # is returned unaltered. Runs a maximum of 50 trials to find a valid
+ # patch for each new sampled IoU threshold. Every 50 trials, the original
+ # image is returned as is with probability (1 - prob) = 0.143.
+ self.random_crop = RandomPatchInf(patch_coord_generator=self.patch_coord_generator,
+ box_filter=self.box_filter,
+ image_validator=self.image_validator,
+ bound_generator=self.bound_generator,
+ n_trials_max=50,
+ clip_boxes=True,
+ prob=0.857,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+ self.random_crop.labels_format = self.labels_format
+ return self.random_crop(image, labels, return_inverter)
+
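+# Illustrative usage sketch: SSDRandomCrop keeps sampling candidate patches until
+# one satisfies the currently sampled IoU bound; per round of 50 trials, the
+# unaltered input is returned instead with probability 1 - 0.857 = 0.143.
+# The `image` and `labels` inputs below are hypothetical.
+#
+#   random_crop = SSDRandomCrop()
+#   cropped_image, cropped_labels = random_crop(image, labels)
+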
+class SSDExpand:
+ '''
+ Performs the random image expansion as defined by the `train_transform_param` instructions
+ of the original Caffe implementation of SSD. A description of this expansion strategy
+ can also be found in section 3.6 ("Data Augmentation for Small Object Accuracy") of the paper:
+ https://arxiv.org/abs/1512.02325
+ '''
+
+ def __init__(self, background=(123, 117, 104), labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the
+ background pixels of the translated images.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ self.labels_format = labels_format
+
+ # Generate coordinates for patches that are between 1.0 and 4.0 times
+ # the size of the input image in both spatial dimensions.
+ self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w',
+ min_scale=1.0,
+ max_scale=4.0,
+ scale_uniformly=True)
+
+ # With probability 0.5, place the input image randomly on a canvas filled with
+ # mean color values according to the parameters set above. With probability 0.5,
+ # return the input image unaltered.
+ self.expand = RandomPatch(patch_coord_generator=self.patch_coord_generator,
+ box_filter=None,
+ image_validator=None,
+ n_trials_max=1,
+ clip_boxes=False,
+ prob=0.5,
+ background=background,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+ self.expand.labels_format = self.labels_format
+ return self.expand(image, labels, return_inverter)
+
+class SSDPhotometricDistortions:
+ '''
+ Performs the photometric distortions defined by the `train_transform_param` instructions
+ of the original Caffe implementation of SSD.
+ '''
+
+ def __init__(self):
+
+ self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV')
+ self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB')
+ self.convert_to_float32 = ConvertDataType(to='float32')
+ self.convert_to_uint8 = ConvertDataType(to='uint8')
+ self.convert_to_3_channels = ConvertTo3Channels()
+ self.random_brightness = RandomBrightness(lower=-32, upper=32, prob=0.5)
+ self.random_contrast = RandomContrast(lower=0.5, upper=1.5, prob=0.5)
+ self.random_saturation = RandomSaturation(lower=0.5, upper=1.5, prob=0.5)
+ self.random_hue = RandomHue(max_delta=18, prob=0.5)
+ self.random_channel_swap = RandomChannelSwap(prob=0.0)
+
+ self.sequence1 = [self.convert_to_3_channels,
+ self.convert_to_float32,
+ self.random_brightness,
+ self.random_contrast,
+ self.convert_to_uint8,
+ self.convert_RGB_to_HSV,
+ self.convert_to_float32,
+ self.random_saturation,
+ self.random_hue,
+ self.convert_to_uint8,
+ self.convert_HSV_to_RGB,
+ self.random_channel_swap]
+
+ self.sequence2 = [self.convert_to_3_channels,
+ self.convert_to_float32,
+ self.random_brightness,
+ self.convert_to_uint8,
+ self.convert_RGB_to_HSV,
+ self.convert_to_float32,
+ self.random_saturation,
+ self.random_hue,
+ self.convert_to_uint8,
+ self.convert_HSV_to_RGB,
+ self.convert_to_float32,
+ self.random_contrast,
+ self.convert_to_uint8,
+ self.random_channel_swap]
+
+ def __call__(self, image, labels):
+
+ # Choose sequence 1 with probability 0.5.
+ if np.random.choice(2):
+
+ for transform in self.sequence1:
+ image, labels = transform(image, labels)
+ return image, labels
+ # Choose sequence 2 with probability 0.5.
+ else:
+
+ for transform in self.sequence2:
+ image, labels = transform(image, labels)
+ return image, labels
+
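+# Illustrative usage sketch: the photometric distortions only alter the image and
+# pass the labels through unchanged. The `image` and `labels` inputs below are
+# hypothetical.
+#
+#   distort = SSDPhotometricDistortions()
+#   distorted_image, labels = distort(image, labels)
+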
+class SSDDataAugmentation:
+ '''
+ Reproduces the data augmentation pipeline used in the training of the original
+ Caffe implementation of SSD.
+ '''
+
+ def __init__(self,
+ img_height=300,
+ img_width=300,
+ background=(123, 117, 104),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+            img_height (int): The desired height of the output images in pixels.
+            img_width (int): The desired width of the output images in pixels.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the
+ background pixels of the translated images.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ self.labels_format = labels_format
+
+ self.photometric_distortions = SSDPhotometricDistortions()
+ self.expand = SSDExpand(background=background, labels_format=self.labels_format)
+ self.random_crop = SSDRandomCrop(labels_format=self.labels_format)
+ self.random_flip = RandomFlip(dim='horizontal', prob=0.5, labels_format=self.labels_format)
+
+ # This box filter makes sure that the resized images don't contain any degenerate boxes.
+        # Resizing the images could lead the boxes to become smaller. For boxes that are already
+ # pretty small, that might result in boxes with height and/or width zero, which we obviously
+ # cannot allow.
+ self.box_filter = BoxFilter(check_overlap=False,
+ check_min_area=False,
+ check_degenerate=True,
+ labels_format=self.labels_format)
+
+ self.resize = ResizeRandomInterp(height=img_height,
+ width=img_width,
+ interpolation_modes=[cv2.INTER_NEAREST,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4],
+ box_filter=self.box_filter,
+ labels_format=self.labels_format)
+
+ self.sequence = [self.photometric_distortions,
+ self.expand,
+ self.random_crop,
+ self.random_flip,
+ self.resize]
+
+ def __call__(self, image, labels, return_inverter=False):
+ self.expand.labels_format = self.labels_format
+ self.random_crop.labels_format = self.labels_format
+ self.random_flip.labels_format = self.labels_format
+ self.resize.labels_format = self.labels_format
+
+ inverters = []
+
+ for transform in self.sequence:
+ if return_inverter and ('return_inverter' in inspect.signature(transform).parameters):
+ image, labels, inverter = transform(image, labels, return_inverter=True)
+ inverters.append(inverter)
+ else:
+ image, labels = transform(image, labels)
+
+ if return_inverter:
+ return image, labels, inverters[::-1]
+ else:
+ return image, labels
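+
+# Illustrative usage sketch: SSDDataAugmentation is the full training-time pipeline;
+# with `return_inverter=True` it additionally returns the inverter functions that map
+# predictions on the augmented image back to the original image coordinates.
+# The `image` and `labels` inputs below are hypothetical.
+#
+#   ssd_augmentation = SSDDataAugmentation(img_height=300, img_width=300)
+#   aug_image, aug_labels = ssd_augmentation(image, labels)
+#   aug_image, aug_labels, inverters = ssd_augmentation(image, labels, return_inverter=True)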
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_satellite.py b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_satellite.py
new file mode 100644
index 0000000..c2e2cb9
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_satellite.py
@@ -0,0 +1,157 @@
+'''
+A data augmentation pipeline for datasets in bird's eye view, i.e. where there is
+no "up" or "down" in the images.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip, RandomRotate
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator
+
+class DataAugmentationSatellite:
+ '''
+ A data augmentation pipeline for datasets in bird's eye view, i.e. where there is
+ no "up" or "down" in the images.
+
+ Applies a chain of photometric and geometric image transformations. For documentation, please refer
+ to the documentation of the individual transformations involved.
+ '''
+
+ def __init__(self,
+ resize_height,
+ resize_width,
+ random_brightness=(-48, 48, 0.5),
+ random_contrast=(0.5, 1.8, 0.5),
+ random_saturation=(0.5, 1.8, 0.5),
+ random_hue=(18, 0.5),
+ random_flip=0.5,
+ random_rotate=([90, 180, 270], 0.5),
+ min_scale=0.3,
+ max_scale=2.0,
+ min_aspect_ratio = 0.8,
+ max_aspect_ratio = 1.25,
+ n_trials_max=3,
+ clip_boxes=True,
+ overlap_criterion='area',
+ bounds_box_filter=(0.3, 1.0),
+ bounds_validator=(0.5, 1.0),
+ n_boxes_min=1,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+
+ self.n_trials_max = n_trials_max
+ self.clip_boxes = clip_boxes
+ self.overlap_criterion = overlap_criterion
+ self.bounds_box_filter = bounds_box_filter
+ self.bounds_validator = bounds_validator
+ self.n_boxes_min = n_boxes_min
+ self.background = background
+ self.labels_format = labels_format
+
+ # Determines which boxes are kept in an image after the transformations have been applied.
+ self.box_filter_patch = BoxFilter(check_overlap=True,
+ check_min_area=False,
+ check_degenerate=False,
+ overlap_criterion=self.overlap_criterion,
+ overlap_bounds=self.bounds_box_filter,
+ labels_format=self.labels_format)
+
+ self.box_filter_resize = BoxFilter(check_overlap=False,
+ check_min_area=True,
+ check_degenerate=True,
+ min_area=16,
+ labels_format=self.labels_format)
+
+ # Determines whether the result of the transformations is a valid training image.
+ self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion,
+ bounds=self.bounds_validator,
+ n_boxes_min=self.n_boxes_min,
+ labels_format=self.labels_format)
+
+ # Utility transformations
+ self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels.
+ self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV')
+ self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB')
+ self.convert_to_float32 = ConvertDataType(to='float32')
+ self.convert_to_uint8 = ConvertDataType(to='uint8')
+ self.resize = Resize(height=resize_height,
+ width=resize_width,
+ box_filter=self.box_filter_resize,
+ labels_format=self.labels_format)
+
+ # Photometric transformations
+ self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2])
+ self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2])
+ self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2])
+ self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1])
+
+ # Geometric transformations
+ self.random_horizontal_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format)
+ self.random_vertical_flip = RandomFlip(dim='vertical', prob=random_flip, labels_format=self.labels_format)
+ self.random_rotate = RandomRotate(angles=random_rotate[0], prob=random_rotate[1], labels_format=self.labels_format)
+ self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar',
+ min_scale=min_scale,
+ max_scale=max_scale,
+ scale_uniformly=False,
+ min_aspect_ratio = min_aspect_ratio,
+ max_aspect_ratio = max_aspect_ratio)
+ self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator,
+ box_filter=self.box_filter_patch,
+ image_validator=self.image_validator,
+ n_trials_max=self.n_trials_max,
+ clip_boxes=self.clip_boxes,
+ prob=1.0,
+ can_fail=False,
+ labels_format=self.labels_format)
+
+ # Define the processing chain.
+ self.transformations = [self.convert_to_3_channels,
+ self.convert_to_float32,
+ self.random_brightness,
+ self.random_contrast,
+ self.convert_to_uint8,
+ self.convert_RGB_to_HSV,
+ self.convert_to_float32,
+ self.random_saturation,
+ self.random_hue,
+ self.convert_to_uint8,
+ self.convert_HSV_to_RGB,
+ self.random_horizontal_flip,
+ self.random_vertical_flip,
+ self.random_rotate,
+ self.random_patch,
+ self.resize]
+
+ def __call__(self, image, labels=None):
+
+ self.random_patch.labels_format = self.labels_format
+ self.random_horizontal_flip.labels_format = self.labels_format
+ self.random_vertical_flip.labels_format = self.labels_format
+ self.random_rotate.labels_format = self.labels_format
+ self.resize.labels_format = self.labels_format
+
+ if not (labels is None):
+ for transform in self.transformations:
+ image, labels = transform(image, labels)
+ return image, labels
+ else:
+            for transform in self.transformations:
+ image = transform(image)
+ return image
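+
+# Illustrative usage sketch: unlike the constant-input-size chain, this pipeline also
+# resizes its output, so the target dimensions must be passed explicitly. The values
+# and inputs below are hypothetical.
+#
+#   augmenter = DataAugmentationSatellite(resize_height=512, resize_width=512)
+#   aug_image, aug_labels = augmenter(image, labels)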
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_variable_input_size.py b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_variable_input_size.py
new file mode 100644
index 0000000..7d9f2b4
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/data_augmentation_chain_variable_input_size.py
@@ -0,0 +1,152 @@
+'''
+A data augmentation pipeline suitable for variable-size images that produces effects
+that are similar (but not identical) to those of the original SSD data augmentation
+pipeline while being faster.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator
+
+class DataAugmentationVariableInputSize:
+ '''
+ A data augmentation pipeline suitable for variable-size images that produces effects
+ that are similar (but not identical!) to those of the original SSD data augmentation
+ pipeline while being faster.
+
+ Applies a chain of photometric and geometric image transformations. For documentation, please refer
+ to the documentation of the individual transformations involved.
+ '''
+
+ def __init__(self,
+ resize_height,
+ resize_width,
+ random_brightness=(-48, 48, 0.5),
+ random_contrast=(0.5, 1.8, 0.5),
+ random_saturation=(0.5, 1.8, 0.5),
+ random_hue=(18, 0.5),
+ random_flip=0.5,
+ min_scale=0.3,
+ max_scale=2.0,
+ min_aspect_ratio = 0.5,
+ max_aspect_ratio = 2.0,
+ n_trials_max=3,
+ clip_boxes=True,
+ overlap_criterion='area',
+ bounds_box_filter=(0.3, 1.0),
+ bounds_validator=(0.5, 1.0),
+ n_boxes_min=1,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+
+ self.n_trials_max = n_trials_max
+ self.clip_boxes = clip_boxes
+ self.overlap_criterion = overlap_criterion
+ self.bounds_box_filter = bounds_box_filter
+ self.bounds_validator = bounds_validator
+ self.n_boxes_min = n_boxes_min
+ self.background = background
+ self.labels_format = labels_format
+
+ # Determines which boxes are kept in an image after the transformations have been applied.
+ self.box_filter_patch = BoxFilter(check_overlap=True,
+ check_min_area=False,
+ check_degenerate=False,
+ overlap_criterion=self.overlap_criterion,
+ overlap_bounds=self.bounds_box_filter,
+ labels_format=self.labels_format)
+
+ self.box_filter_resize = BoxFilter(check_overlap=False,
+ check_min_area=True,
+ check_degenerate=True,
+ min_area=16,
+ labels_format=self.labels_format)
+
+ # Determines whether the result of the transformations is a valid training image.
+ self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion,
+ bounds=self.bounds_validator,
+ n_boxes_min=self.n_boxes_min,
+ labels_format=self.labels_format)
+
+ # Utility transformations
+ self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels.
+ self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV')
+ self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB')
+ self.convert_to_float32 = ConvertDataType(to='float32')
+ self.convert_to_uint8 = ConvertDataType(to='uint8')
+ self.resize = Resize(height=resize_height,
+ width=resize_width,
+ box_filter=self.box_filter_resize,
+ labels_format=self.labels_format)
+
+ # Photometric transformations
+ self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2])
+ self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2])
+ self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2])
+ self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1])
+
+ # Geometric transformations
+ self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format)
+ self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar',
+ min_scale=min_scale,
+ max_scale=max_scale,
+ scale_uniformly=False,
+ min_aspect_ratio = min_aspect_ratio,
+ max_aspect_ratio = max_aspect_ratio)
+ self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator,
+ box_filter=self.box_filter_patch,
+ image_validator=self.image_validator,
+ n_trials_max=self.n_trials_max,
+ clip_boxes=self.clip_boxes,
+ prob=1.0,
+ can_fail=False,
+ labels_format=self.labels_format)
+
+ # Define the processing chain
+ self.transformations = [self.convert_to_3_channels,
+ self.convert_to_float32,
+ self.random_brightness,
+ self.random_contrast,
+ self.convert_to_uint8,
+ self.convert_RGB_to_HSV,
+ self.convert_to_float32,
+ self.random_saturation,
+ self.random_hue,
+ self.convert_to_uint8,
+ self.convert_HSV_to_RGB,
+ self.random_patch,
+ self.random_flip,
+ self.resize]
+
+ def __call__(self, image, labels=None):
+
+ self.random_patch.labels_format = self.labels_format
+ self.random_flip.labels_format = self.labels_format
+ self.resize.labels_format = self.labels_format
+
+ if not (labels is None):
+ for transform in self.transformations:
+ image, labels = transform(image, labels)
+ return image, labels
+ else:
+            for transform in self.transformations:
+ image = transform(image)
+ return image
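+
+# Illustrative usage sketch: calling the chain without labels transforms the image
+# only (the labels-free branch of `__call__` above). The values and inputs below are
+# hypothetical.
+#
+#   augmenter = DataAugmentationVariableInputSize(resize_height=300, resize_width=300)
+#   aug_image = augmenter(image)                        # image-only path
+#   aug_image, aug_labels = augmenter(image, labels)    # image + ground truth boxes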
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_data_generator.py b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_data_generator.py
new file mode 100644
index 0000000..819edbe
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_data_generator.py
@@ -0,0 +1,1223 @@
+'''
+A data generator for 2D object detection.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+
+import csv
+import inspect
+import numpy as np
+import os
+import sys
+import warnings
+from collections import defaultdict
+from copy import deepcopy
+
+import cv2
+import sklearn.utils
+from PIL import Image
+from tqdm import tqdm, trange  # Used for the progress bars below when `verbose=True`.
+
+try:
+ import h5py
+except ImportError:
+ warnings.warn("'h5py' module is missing. The fast HDF5 dataset option will be unavailable.")
+try:
+ import json
+except ImportError:
+ warnings.warn("'json' module is missing. The JSON-parser will be unavailable.")
+try:
+ from bs4 import BeautifulSoup
+except ImportError:
+ warnings.warn("'BeautifulSoup' module is missing. The XML-parser will be unavailable.")
+try:
+ import pickle
+except ImportError:
+ warnings.warn("'pickle' module is missing. You won't be able to save parsed file lists and annotations as pickled files.")
+
+from engine.object_detection_branch.single_shot_detector.ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter
+
+class DegenerateBatchError(Exception):
+ '''
+ An exception class to be raised if a generated batch ends up being degenerate,
+ e.g. if a generated batch is empty.
+ '''
+ pass
+
+class DatasetError(Exception):
+ '''
+    An exception class to be raised if anything is wrong with the dataset,
+ in particular if you try to generate batches when no dataset was loaded.
+ '''
+ pass
+
+class DataGenerator:
+ '''
+ A generator to generate batches of samples and corresponding labels indefinitely.
+
+ Can shuffle the dataset consistently after each complete pass.
+
+ Currently provides three methods to parse annotation data: A general-purpose CSV parser,
+ an XML parser for the Pascal VOC datasets, and a JSON parser for the MS COCO datasets.
+ If the annotations of your dataset are in a format that is not supported by these parsers,
+ you could just add another parser method and still use this generator.
+
+ Can perform image transformations for data conversion and data augmentation,
+ for details please refer to the documentation of the `generate()` method.
+ '''
+
+ def __init__(self,
+ load_images_into_memory=False,
+ hdf5_dataset_path=None,
+ filenames=None,
+ filenames_type='text',
+ images_dir=None,
+ labels=None,
+ image_ids=None,
+ eval_neutral=None,
+ labels_output_format=('class_id', 'xmin', 'ymin', 'xmax', 'ymax'),
+ verbose=True):
+ '''
+        Initializes the data generator. You can either load a dataset directly here in the constructor,
+ e.g. an HDF5 dataset, or you can use one of the parser methods to read in a dataset.
+
+ Arguments:
+ load_images_into_memory (bool, optional): If `True`, the entire dataset will be loaded into memory.
+ This enables noticeably faster data generation than loading batches of images into memory ad hoc.
+ Be sure that you have enough memory before you activate this option.
+ hdf5_dataset_path (str, optional): The full file path of an HDF5 file that contains a dataset in the
+ format that the `create_hdf5_dataset()` method produces. If you load such an HDF5 dataset, you
+ don't need to use any of the parser methods anymore, the HDF5 dataset already contains all relevant
+ data.
+ filenames (string or list, optional): `None` or either a Python list/tuple or a string representing
+ a filepath. If a list/tuple is passed, it must contain the file names (full paths) of the
+ images to be used. Note that the list/tuple must contain the paths to the images,
+ not the images themselves. If a filepath string is passed, it must point either to
+ (1) a pickled file containing a list/tuple as described above. In this case the `filenames_type`
+ argument must be set to `pickle`.
+ Or
+ (2) a text file. Each line of the text file contains the file name (basename of the file only,
+ not the full directory path) to one image and nothing else. In this case the `filenames_type`
+ argument must be set to `text` and you must pass the path to the directory that contains the
+ images in `images_dir`.
+ filenames_type (string, optional): In case a string is passed for `filenames`, this indicates what
+ type of file `filenames` is. It can be either 'pickle' for a pickled file or 'text' for a
+ plain text file.
+ images_dir (string, optional): In case a text file is passed for `filenames`, the full paths to
+ the images will be composed from `images_dir` and the names in the text file, i.e. this
+ should be the directory that contains the images to which the text file refers.
+ If `filenames_type` is not 'text', then this argument is irrelevant.
+ labels (string or list, optional): `None` or either a Python list/tuple or a string representing
+ the path to a pickled file containing a list/tuple. The list/tuple must contain Numpy arrays
+ that represent the labels of the dataset.
+ image_ids (string or list, optional): `None` or either a Python list/tuple or a string representing
+ the path to a pickled file containing a list/tuple. The list/tuple must contain the image
+ IDs of the images in the dataset.
+ eval_neutral (string or list, optional): `None` or either a Python list/tuple or a string representing
+ the path to a pickled file containing a list/tuple. The list/tuple must contain for each image
+ a list that indicates for each ground truth object in the image whether that object is supposed
+ to be treated as neutral during an evaluation.
+ labels_output_format (list, optional): A list of five strings representing the desired order of the five
+ items class ID, xmin, ymin, xmax, ymax in the generated ground truth data (if any). The expected
+ strings are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'.
+            verbose (bool, optional): If `True`, prints out the progress for some constructor operations that may
+ take a bit longer.
+ '''
+ self.labels_output_format = labels_output_format
+ self.labels_format={'class_id': labels_output_format.index('class_id'),
+ 'xmin': labels_output_format.index('xmin'),
+ 'ymin': labels_output_format.index('ymin'),
+ 'xmax': labels_output_format.index('xmax'),
+ 'ymax': labels_output_format.index('ymax')} # This dictionary is for internal use.
+
+ self.dataset_size = 0 # As long as we haven't loaded anything yet, the dataset size is zero.
+ self.load_images_into_memory = load_images_into_memory
+ self.images = None # The only way that this list will not stay `None` is if `load_images_into_memory == True`.
+
+ # `self.filenames` is a list containing all file names of the image samples (full paths).
+ # Note that it does not contain the actual image files themselves. This list is one of the outputs of the parser methods.
+ # In case you are loading an HDF5 dataset, this list will be `None`.
+ if not filenames is None:
+ if isinstance(filenames, (list, tuple)):
+ self.filenames = filenames
+ elif isinstance(filenames, str):
+ with open(filenames, 'rb') as f:
+ if filenames_type == 'pickle':
+ self.filenames = pickle.load(f)
+ elif filenames_type == 'text':
+ self.filenames = [os.path.join(images_dir, line.strip()) for line in f]
+ else:
+ raise ValueError("`filenames_type` can be either 'text' or 'pickle'.")
+ else:
+ raise ValueError("`filenames` must be either a Python list/tuple or a string representing a filepath (to a pickled or text file). The value you passed is neither of the two.")
+ self.dataset_size = len(self.filenames)
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32)
+ if load_images_into_memory:
+ self.images = []
+ if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout)
+ else: it = self.filenames
+ for filename in it:
+ with Image.open(filename) as image:
+ self.images.append(np.array(image, dtype=np.uint8))
+ else:
+ self.filenames = None
+
+ # In case ground truth is available, `self.labels` is a list containing for each image a list (or NumPy array)
+ # of ground truth bounding boxes for that image.
+ if not labels is None:
+ if isinstance(labels, str):
+ with open(labels, 'rb') as f:
+ self.labels = pickle.load(f)
+ elif isinstance(labels, (list, tuple)):
+ self.labels = labels
+ else:
+ raise ValueError("`labels` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple. The value you passed is neither of the two.")
+ else:
+ self.labels = None
+
+ if not image_ids is None:
+ if isinstance(image_ids, str):
+ with open(image_ids, 'rb') as f:
+ self.image_ids = pickle.load(f)
+ elif isinstance(image_ids, (list, tuple)):
+ self.image_ids = image_ids
+ else:
+ raise ValueError("`image_ids` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple. The value you passed is neither of the two.")
+ else:
+ self.image_ids = None
+
+ if not eval_neutral is None:
+ if isinstance(eval_neutral, str):
+ with open(eval_neutral, 'rb') as f:
+ self.eval_neutral = pickle.load(f)
+ elif isinstance(eval_neutral, (list, tuple)):
+ self.eval_neutral = eval_neutral
+ else:
+                raise ValueError("`eval_neutral` must be either a Python list/tuple or a string representing the path to a pickled file containing a list/tuple. The value you passed is neither of the two.")
+ else:
+ self.eval_neutral = None
+
+ if not hdf5_dataset_path is None:
+ self.hdf5_dataset_path = hdf5_dataset_path
+ self.load_hdf5_dataset(verbose=verbose)
+ else:
+ self.hdf5_dataset = None
+
+ def load_hdf5_dataset(self, verbose=True):
+ '''
+ Loads an HDF5 dataset that is in the format that the `create_hdf5_dataset()` method
+ produces.
+
+ Arguments:
+ verbose (bool, optional): If `True`, prints out the progress while loading
+ the dataset.
+
+ Returns:
+ None.
+ '''
+
+ self.hdf5_dataset = h5py.File(self.hdf5_dataset_path, 'r')
+ self.dataset_size = len(self.hdf5_dataset['images'])
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) # Instead of shuffling the HDF5 dataset or images in memory, we will shuffle this index list.
+
+ if self.load_images_into_memory:
+ self.images = []
+ if verbose: tr = trange(self.dataset_size, desc='Loading images into memory', file=sys.stdout)
+ else: tr = range(self.dataset_size)
+ for i in tr:
+ self.images.append(self.hdf5_dataset['images'][i].reshape(self.hdf5_dataset['image_shapes'][i]))
+
+ if self.hdf5_dataset.attrs['has_labels']:
+ self.labels = []
+ labels = self.hdf5_dataset['labels']
+ label_shapes = self.hdf5_dataset['label_shapes']
+ if verbose: tr = trange(self.dataset_size, desc='Loading labels', file=sys.stdout)
+ else: tr = range(self.dataset_size)
+ for i in tr:
+ self.labels.append(labels[i].reshape(label_shapes[i]))
+
+ if self.hdf5_dataset.attrs['has_image_ids']:
+ self.image_ids = []
+ image_ids = self.hdf5_dataset['image_ids']
+ if verbose: tr = trange(self.dataset_size, desc='Loading image IDs', file=sys.stdout)
+ else: tr = range(self.dataset_size)
+ for i in tr:
+ self.image_ids.append(image_ids[i])
+
+ if self.hdf5_dataset.attrs['has_eval_neutral']:
+ self.eval_neutral = []
+ eval_neutral = self.hdf5_dataset['eval_neutral']
+ if verbose: tr = trange(self.dataset_size, desc='Loading evaluation-neutrality annotations', file=sys.stdout)
+ else: tr = range(self.dataset_size)
+ for i in tr:
+ self.eval_neutral.append(eval_neutral[i])
+
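+    # Illustrative usage sketch: loading a pre-built HDF5 dataset bypasses all of the
+    # parser methods below. The file path is hypothetical.
+    #
+    #   dataset = DataGenerator(hdf5_dataset_path='VOC_07+12_trainval.h5',
+    #                           load_images_into_memory=False)
+    #   print(dataset.dataset_size)
+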
+ def parse_csv(self,
+ images_dir,
+ labels_filename,
+ input_format,
+ include_classes='all',
+ random_sample=False,
+ ret=False,
+ verbose=True):
+ '''
+ Arguments:
+ images_dir (str): The path to the directory that contains the images.
+ labels_filename (str): The filepath to a CSV file that contains one ground truth bounding box per line
+ and each line contains the following six items: image file name, class ID, xmin, xmax, ymin, ymax.
+ The six items do not have to be in a specific order, but they must be the first six columns of
+ each line. The order of these items in the CSV file must be specified in `input_format`.
+ The class ID is an integer greater than zero. Class ID 0 is reserved for the background class.
+ `xmin` and `xmax` are the left-most and right-most absolute horizontal coordinates of the box,
+ `ymin` and `ymax` are the top-most and bottom-most absolute vertical coordinates of the box.
+ The image name is expected to be just the name of the image file without the directory path
+ at which the image is located.
+ input_format (list): A list of six strings representing the order of the six items
+ image file name, class ID, xmin, xmax, ymin, ymax in the input CSV file. The expected strings
+ are 'image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'.
+ include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that
+ are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset.
+ random_sample (float, optional): Either `False` or a float in `[0,1]`. If this is `False`, the
+ full dataset will be used by the generator. If this is a float in `[0,1]`, a randomly sampled
+ fraction of the dataset will be used, where `random_sample` is the fraction of the dataset
+                to be used. For example, if `random_sample = 0.2`, 20 percent of the dataset will be randomly selected,
+                the rest will be omitted. The fraction refers to the number of images, not to the number
+ of boxes, i.e. each image that will be added to the dataset will always be added with all
+ of its boxes.
+ ret (bool, optional): Whether or not to return the outputs of the parser.
+ verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer.
+
+ Returns:
+ None by default, optionally lists for whichever are available of images, image filenames, labels, and image IDs.
+ '''
+
+ # Set class members.
+ self.images_dir = images_dir
+ self.labels_filename = labels_filename
+ self.input_format = input_format
+ self.include_classes = include_classes
+
+ # Before we begin, make sure that we have a labels_filename and an input_format
+ if self.labels_filename is None or self.input_format is None:
+ raise ValueError("`labels_filename` and/or `input_format` have not been set yet. You need to pass them as arguments.")
+
+ # Erase data that might have been parsed before
+ self.filenames = []
+ self.image_ids = []
+ self.labels = []
+
+ # First, just read in the CSV file lines and sort them.
+
+ data = []
+
+ with open(self.labels_filename, newline='') as csvfile:
+ csvread = csv.reader(csvfile, delimiter=',')
+ next(csvread) # Skip the header row.
+ for row in csvread: # For every line (i.e for every bounding box) in the CSV file...
+ if self.include_classes == 'all' or int(row[self.input_format.index('class_id')].strip()) in self.include_classes: # If the class_id is among the classes that are to be included in the dataset...
+ box = [] # Store the box class and coordinates here
+ box.append(row[self.input_format.index('image_name')].strip()) # Select the image name column in the input format and append its content to `box`
+ for element in self.labels_output_format: # For each element in the output format (where the elements are the class ID and the four box coordinates)...
+ box.append(int(row[self.input_format.index(element)].strip())) # ...select the respective column in the input format and append it to `box`.
+ data.append(box)
+
+ data = sorted(data) # The data needs to be sorted, otherwise the next step won't give the correct result
+
+ # Now that we've made sure that the data is sorted by file names,
+ # we can compile the actual samples and labels lists
+
+ current_file = data[0][0] # The current image for which we're collecting the ground truth boxes
+ current_image_id = data[0][0].split('.')[0] # The image ID will be the portion of the image name before the first dot.
+ current_labels = [] # The list where we collect all ground truth boxes for a given image
+ add_to_dataset = False
+ for i, box in enumerate(data):
+
+ if box[0] == current_file: # If this box (i.e. this line of the CSV file) belongs to the current image file
+ current_labels.append(box[1:])
+ if i == len(data)-1: # If this is the last line of the CSV file
+ if random_sample: # In case we're not using the full dataset, but a random sample of it.
+ p = np.random.uniform(0,1)
+ if p >= (1-random_sample):
+ self.labels.append(np.stack(current_labels, axis=0))
+ self.filenames.append(os.path.join(self.images_dir, current_file))
+ self.image_ids.append(current_image_id)
+ else:
+ self.labels.append(np.stack(current_labels, axis=0))
+ self.filenames.append(os.path.join(self.images_dir, current_file))
+ self.image_ids.append(current_image_id)
+ else: # If this box belongs to a new image file
+ if random_sample: # In case we're not using the full dataset, but a random sample of it.
+ p = np.random.uniform(0,1)
+ if p >= (1-random_sample):
+ self.labels.append(np.stack(current_labels, axis=0))
+ self.filenames.append(os.path.join(self.images_dir, current_file))
+ self.image_ids.append(current_image_id)
+ else:
+ self.labels.append(np.stack(current_labels, axis=0))
+ self.filenames.append(os.path.join(self.images_dir, current_file))
+ self.image_ids.append(current_image_id)
+ current_labels = [] # Reset the labels list because this is a new file.
+ current_file = box[0]
+ current_image_id = box[0].split('.')[0]
+ current_labels.append(box[1:])
+ if i == len(data)-1: # If this is the last line of the CSV file
+ if random_sample: # In case we're not using the full dataset, but a random sample of it.
+ p = np.random.uniform(0,1)
+ if p >= (1-random_sample):
+ self.labels.append(np.stack(current_labels, axis=0))
+ self.filenames.append(os.path.join(self.images_dir, current_file))
+ self.image_ids.append(current_image_id)
+ else:
+ self.labels.append(np.stack(current_labels, axis=0))
+ self.filenames.append(os.path.join(self.images_dir, current_file))
+ self.image_ids.append(current_image_id)
+
+ self.dataset_size = len(self.filenames)
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32)
+ if self.load_images_into_memory:
+ self.images = []
+ if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout)
+ else: it = self.filenames
+ for filename in it:
+ with Image.open(filename) as image:
+ self.images.append(np.array(image, dtype=np.uint8))
+
+ if ret: # In case we want to return these
+ return self.images, self.filenames, self.labels, self.image_ids
+
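+    # Illustrative usage sketch: parsing a CSV annotations file. The paths and the
+    # column order are hypothetical.
+    #
+    #   dataset = DataGenerator()
+    #   dataset.parse_csv(images_dir='data/images',
+    #                     labels_filename='data/labels.csv',
+    #                     input_format=['image_name', 'xmin', 'xmax', 'ymin', 'ymax', 'class_id'],
+    #                     include_classes='all')
+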
+ def parse_xml(self,
+ images_dirs,
+ image_set_filenames,
+ annotations_dirs=[],
+ classes=['background',
+ 'aeroplane', 'bicycle', 'bird', 'boat',
+ 'bottle', 'bus', 'car', 'cat',
+ 'chair', 'cow', 'diningtable', 'dog',
+ 'horse', 'motorbike', 'person', 'pottedplant',
+ 'sheep', 'sofa', 'train', 'tvmonitor'],
+ include_classes = 'all',
+ exclude_truncated=False,
+ exclude_difficult=False,
+ ret=False,
+ verbose=True):
+ '''
+ This is an XML parser for the Pascal VOC datasets. It might be applicable to other datasets with minor changes to
+ the code, but in its current form it expects the data format and XML tags of the Pascal VOC datasets.
+
+ Arguments:
+ images_dirs (list): A list of strings, where each string is the path of a directory that
+ contains images that are to be part of the dataset. This allows you to aggregate multiple datasets
+ into one (e.g. one directory that contains the images for Pascal VOC 2007, another that contains
+ the images for Pascal VOC 2012, etc.).
+ image_set_filenames (list): A list of strings, where each string is the path of the text file with the image
+ set to be loaded. Must be one file per image directory given. These text files define what images in the
+                respective image directories are to be part of the dataset and simply contain one image ID per line
+ and nothing else.
+ annotations_dirs (list, optional): A list of strings, where each string is the path of a directory that
+ contains the annotations (XML files) that belong to the images in the respective image directories given.
+ The directories must contain one XML file per image and the name of an XML file must be the image ID
+ of the image it belongs to. The content of the XML files must be in the Pascal VOC format.
+ classes (list, optional): A list containing the names of the object classes as found in the
+ `name` XML tags. Must include the class `background` as the first list item. The order of this list
+ defines the class IDs.
+ include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that
+ are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset.
+ exclude_truncated (bool, optional): If `True`, excludes boxes that are labeled as 'truncated'.
+ exclude_difficult (bool, optional): If `True`, excludes boxes that are labeled as 'difficult'.
+ ret (bool, optional): Whether or not to return the outputs of the parser.
+ verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer.
+
+ Returns:
+ None by default, optionally lists for whichever are available of images, image filenames, labels, image IDs,
+ and a list indicating which boxes are annotated with the label "difficult".
+ '''
+ # Set class members.
+ self.images_dirs = images_dirs
+ self.annotations_dirs = annotations_dirs
+ self.image_set_filenames = image_set_filenames
+ self.classes = classes
+ self.include_classes = include_classes
+
+ # Erase data that might have been parsed before.
+ self.filenames = []
+ self.image_ids = []
+ self.labels = []
+ self.eval_neutral = []
+ if not annotations_dirs:
+ self.labels = None
+ self.eval_neutral = None
+ annotations_dirs = [None] * len(images_dirs)
+
+ for images_dir, image_set_filename, annotations_dir in zip(images_dirs, image_set_filenames, annotations_dirs):
+ # Read the image set file that so that we know all the IDs of all the images to be included in the dataset.
+ with open(image_set_filename) as f:
+ image_ids = [line.strip() for line in f] # Note: These are strings, not integers.
+ self.image_ids += image_ids
+
+ if verbose: it = tqdm(image_ids, desc="Processing image set '{}'".format(os.path.basename(image_set_filename)), file=sys.stdout)
+ else: it = image_ids
+
+ # Loop over all images in this dataset.
+ for image_id in it:
+
+                filename = '{}.jpg'.format(image_id)
+ self.filenames.append(os.path.join(images_dir, filename))
+
+ if not annotations_dir is None:
+ # Parse the XML file for this image.
+ with open(os.path.join(annotations_dir, image_id + '.xml')) as f:
+ soup = BeautifulSoup(f, 'xml')
+
+ folder = soup.folder.text # In case we want to return the folder in addition to the image file name. Relevant for determining which dataset an image belongs to.
+ #filename = soup.filename.text
+
+ boxes = [] # We'll store all boxes for this image here.
+ eval_neutr = [] # We'll store whether a box is annotated as "difficult" here.
+ objects = soup.find_all('object') # Get a list of all objects in this image.
+
+ # Parse the data for each object.
+ for obj in objects:
+ class_name = obj.find('name', recursive=False).text
+ class_id = self.classes.index(class_name)
+ # Check whether this class is supposed to be included in the dataset.
+ if (not self.include_classes == 'all') and (not class_id in self.include_classes): continue
+ pose = obj.find('pose', recursive=False).text
+ truncated = int(obj.find('truncated', recursive=False).text)
+ if exclude_truncated and (truncated == 1): continue
+ difficult = int(obj.find('difficult', recursive=False).text)
+ if exclude_difficult and (difficult == 1): continue
+ # Get the bounding box coordinates.
+ bndbox = obj.find('bndbox', recursive=False)
+ xmin = int(bndbox.xmin.text)
+ ymin = int(bndbox.ymin.text)
+ xmax = int(bndbox.xmax.text)
+ ymax = int(bndbox.ymax.text)
+ item_dict = {'folder': folder,
+ 'image_name': filename,
+ 'image_id': image_id,
+ 'class_name': class_name,
+ 'class_id': class_id,
+ 'pose': pose,
+ 'truncated': truncated,
+ 'difficult': difficult,
+ 'xmin': xmin,
+ 'ymin': ymin,
+ 'xmax': xmax,
+ 'ymax': ymax}
+ box = []
+ for item in self.labels_output_format:
+ box.append(item_dict[item])
+ boxes.append(box)
+ if difficult: eval_neutr.append(True)
+ else: eval_neutr.append(False)
+
+ self.labels.append(boxes)
+ self.eval_neutral.append(eval_neutr)
+
+ self.dataset_size = len(self.filenames)
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32)
+ if self.load_images_into_memory:
+ self.images = []
+ if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout)
+ else: it = self.filenames
+ for filename in it:
+ with Image.open(filename) as image:
+ self.images.append(np.array(image, dtype=np.uint8))
+
+ if ret:
+ return self.images, self.filenames, self.labels, self.image_ids, self.eval_neutral
+
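+    # Illustrative usage sketch: parsing Pascal VOC style annotations. The directory
+    # layout is hypothetical.
+    #
+    #   dataset = DataGenerator()
+    #   dataset.parse_xml(images_dirs=['VOC2007/JPEGImages'],
+    #                     image_set_filenames=['VOC2007/ImageSets/Main/trainval.txt'],
+    #                     annotations_dirs=['VOC2007/Annotations'],
+    #                     include_classes='all')
+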
+ def parse_json(self,
+ images_dirs,
+ annotations_filenames,
+ ground_truth_available=False,
+ include_classes='all',
+ ret=False,
+ verbose=True):
+ '''
+        This is a JSON parser for the MS COCO datasets. It might be applicable to other datasets with minor changes to
+ the code, but in its current form it expects the JSON format of the MS COCO datasets.
+
+ Arguments:
+ images_dirs (list, optional): A list of strings, where each string is the path of a directory that
+ contains images that are to be part of the dataset. This allows you to aggregate multiple datasets
+ into one (e.g. one directory that contains the images for MS COCO Train 2014, another one for MS COCO
+ Val 2014, another one for MS COCO Train 2017 etc.).
+ annotations_filenames (list): A list of strings, where each string is the path of the JSON file
+ that contains the annotations for the images in the respective image directories given, i.e. one
+ JSON file per image directory that contains the annotations for all images in that directory.
+ The content of the JSON files must be in MS COCO object detection format. Note that these annotations
+ files do not necessarily need to contain ground truth information. MS COCO also provides annotations
+ files without ground truth information for the test datasets, called `image_info_[...].json`.
+ ground_truth_available (bool, optional): Set `True` if the annotations files contain ground truth information.
+ include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that
+ are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset.
+ ret (bool, optional): Whether or not to return the outputs of the parser.
+ verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer.
+
+ Returns:
+ None by default, optionally lists for whichever are available of images, image filenames, labels and image IDs.
+ '''
+ self.images_dirs = images_dirs
+ self.annotations_filenames = annotations_filenames
+ self.include_classes = include_classes
+ # Erase data that might have been parsed before.
+ self.filenames = []
+ self.image_ids = []
+ self.labels = []
+ if not ground_truth_available:
+ self.labels = None
+
+ # Build the dictionaries that map between class names and class IDs.
+ with open(annotations_filenames[0], 'r') as f:
+ annotations = json.load(f)
+ # Unfortunately the 80 MS COCO class IDs are not all consecutive. They go
+ # from 1 to 90 and some numbers are skipped. Since the IDs that we feed
+ # into a neural network must be consecutive, we'll save both the original
+            # (non-consecutive) IDs and a set of transformed, consecutive IDs, and
+            # build maps to translate between the two in either direction.
+ self.cats_to_names = {} # The map between class names (values) and their original IDs (keys)
+ self.classes_to_names = [] # A list of the class names with their indices representing the transformed IDs
+ self.classes_to_names.append('background') # Need to add the background class first so that the indexing is right.
+ self.cats_to_classes = {} # A dictionary that maps between the original (keys) and the transformed IDs (values)
+ self.classes_to_cats = {} # A dictionary that maps between the transformed (keys) and the original IDs (values)
+ for i, cat in enumerate(annotations['categories']):
+ self.cats_to_names[cat['id']] = cat['name']
+ self.classes_to_names.append(cat['name'])
+ self.cats_to_classes[cat['id']] = i + 1
+ self.classes_to_cats[i + 1] = cat['id']
+
+ # Iterate over all datasets.
+ for images_dir, annotations_filename in zip(self.images_dirs, self.annotations_filenames):
+ # Load the JSON file.
+ with open(annotations_filename, 'r') as f:
+ annotations = json.load(f)
+
+ if ground_truth_available:
+ # Create the annotations map, a dictionary whose keys are the image IDs
+ # and whose values are the annotations for the respective image ID.
+ image_ids_to_annotations = defaultdict(list)
+ for annotation in annotations['annotations']:
+ image_ids_to_annotations[annotation['image_id']].append(annotation)
+
+ if verbose: it = tqdm(annotations['images'], desc="Processing '{}'".format(os.path.basename(annotations_filename)), file=sys.stdout)
+ else: it = annotations['images']
+
+ # Loop over all images in this dataset.
+ for img in it:
+
+ self.filenames.append(os.path.join(images_dir, img['file_name']))
+ self.image_ids.append(img['id'])
+
+ if ground_truth_available:
+ # Get all annotations for this image.
+ annotations = image_ids_to_annotations[img['id']]
+ boxes = []
+ for annotation in annotations:
+ cat_id = annotation['category_id']
+ # Check if this class is supposed to be included in the dataset.
+ if (not self.include_classes == 'all') and (not cat_id in self.include_classes): continue
+ # Transform the original class ID to fit in the sequence of consecutive IDs.
+ class_id = self.cats_to_classes[cat_id]
+ xmin = annotation['bbox'][0]
+ ymin = annotation['bbox'][1]
+ width = annotation['bbox'][2]
+ height = annotation['bbox'][3]
+ # Compute `xmax` and `ymax`.
+ xmax = xmin + width
+ ymax = ymin + height
+ item_dict = {'image_name': img['file_name'],
+ 'image_id': img['id'],
+ 'class_id': class_id,
+ 'xmin': xmin,
+ 'ymin': ymin,
+ 'xmax': xmax,
+ 'ymax': ymax}
+ box = []
+ for item in self.labels_output_format:
+ box.append(item_dict[item])
+ boxes.append(box)
+ self.labels.append(boxes)
+
+ self.dataset_size = len(self.filenames)
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32)
+ if self.load_images_into_memory:
+ self.images = []
+ if verbose: it = tqdm(self.filenames, desc='Loading images into memory', file=sys.stdout)
+ else: it = self.filenames
+ for filename in it:
+ with Image.open(filename) as image:
+ self.images.append(np.array(image, dtype=np.uint8))
+
+ if ret:
+ return self.images, self.filenames, self.labels, self.image_ids
+
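+    #     Illustrative usage sketch (not part of the original file). The directory and
+    #     annotation paths below are placeholders, and the constructor argument shown
+    #     is assumed from the attributes this class uses:
+    #
+    #         dataset = DataGenerator(load_images_into_memory=False)
+    #         dataset.parse_json(images_dirs=['/data/coco/train2014'],
+    #                            annotations_filenames=['/data/coco/annotations/instances_train2014.json'],
+    #                            ground_truth_available=True,
+    #                            include_classes='all',
+    #                            verbose=True)
+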
+ def create_hdf5_dataset(self,
+ file_path='dataset.h5',
+ resize=False,
+ variable_image_size=True,
+ verbose=True):
+ '''
+        Converts the currently loaded dataset into an HDF5 file. This HDF5 file contains all
+ images as uncompressed arrays in a contiguous block of memory, which allows for them
+ to be loaded faster. Such an uncompressed dataset, however, may take up considerably
+ more space on your hard drive than the sum of the source images in a compressed format
+ such as JPG or PNG.
+
+        It is recommended that you always convert the dataset into an HDF5 dataset if you
+        have enough hard drive space, since loading from an HDF5 dataset accelerates the data
+        generation noticeably.
+
+ Note that you must load a dataset (e.g. via one of the parser methods) before creating
+ an HDF5 dataset from it.
+
+ The created HDF5 dataset will remain open upon its creation so that it can be used right
+ away.
+
+ Arguments:
+ file_path (str, optional): The full file path under which to store the HDF5 dataset.
+                You can load this output file via the `DataGenerator` constructor in the future.
+ resize (tuple, optional): `False` or a 2-tuple `(height, width)` that represents the
+ target size for the images. All images in the dataset will be resized to this
+ target size before they will be written to the HDF5 file. If `False`, no resizing
+ will be performed.
+ variable_image_size (bool, optional): The only purpose of this argument is that its
+ value will be stored in the HDF5 dataset in order to be able to quickly find out
+ whether the images in the dataset all have the same size or not.
+            verbose (bool, optional): Whether or not to print out the progress of the dataset creation.
+
+ Returns:
+ None.
+ '''
+
+ self.hdf5_dataset_path = file_path
+
+ dataset_size = len(self.filenames)
+
+ # Create the HDF5 file.
+ hdf5_dataset = h5py.File(file_path, 'w')
+
+ # Create a few attributes that tell us what this dataset contains.
+ # The dataset will obviously always contain images, but maybe it will
+ # also contain labels, image IDs, etc.
+ hdf5_dataset.attrs.create(name='has_labels', data=False, shape=None, dtype=np.bool_)
+ hdf5_dataset.attrs.create(name='has_image_ids', data=False, shape=None, dtype=np.bool_)
+ hdf5_dataset.attrs.create(name='has_eval_neutral', data=False, shape=None, dtype=np.bool_)
+ # It's useful to be able to quickly check whether the images in a dataset all
+ # have the same size or not, so add a boolean attribute for that.
+ if variable_image_size and not resize:
+ hdf5_dataset.attrs.create(name='variable_image_size', data=True, shape=None, dtype=np.bool_)
+ else:
+ hdf5_dataset.attrs.create(name='variable_image_size', data=False, shape=None, dtype=np.bool_)
+
+ # Create the dataset in which the images will be stored as flattened arrays.
+ # This allows us, among other things, to store images of variable size.
+ hdf5_images = hdf5_dataset.create_dataset(name='images',
+ shape=(dataset_size,),
+ maxshape=(None),
+ dtype=h5py.special_dtype(vlen=np.uint8))
+
+ # Create the dataset that will hold the image heights, widths and channels that
+ # we need in order to reconstruct the images from the flattened arrays later.
+ hdf5_image_shapes = hdf5_dataset.create_dataset(name='image_shapes',
+ shape=(dataset_size, 3),
+ maxshape=(None, 3),
+ dtype=np.int32)
+
+ if not (self.labels is None):
+
+ # Create the dataset in which the labels will be stored as flattened arrays.
+ hdf5_labels = hdf5_dataset.create_dataset(name='labels',
+ shape=(dataset_size,),
+ maxshape=(None),
+ dtype=h5py.special_dtype(vlen=np.int32))
+
+ # Create the dataset that will hold the dimensions of the labels arrays for
+ # each image so that we can restore the labels from the flattened arrays later.
+ hdf5_label_shapes = hdf5_dataset.create_dataset(name='label_shapes',
+ shape=(dataset_size, 2),
+ maxshape=(None, 2),
+ dtype=np.int32)
+
+ hdf5_dataset.attrs.modify(name='has_labels', value=True)
+
+ if not (self.image_ids is None):
+
+ hdf5_image_ids = hdf5_dataset.create_dataset(name='image_ids',
+ shape=(dataset_size,),
+ maxshape=(None),
+ dtype=h5py.special_dtype(vlen=str))
+
+ hdf5_dataset.attrs.modify(name='has_image_ids', value=True)
+
+ if not (self.eval_neutral is None):
+
+ # Create the dataset in which the labels will be stored as flattened arrays.
+ hdf5_eval_neutral = hdf5_dataset.create_dataset(name='eval_neutral',
+ shape=(dataset_size,),
+ maxshape=(None),
+ dtype=h5py.special_dtype(vlen=np.bool_))
+
+ hdf5_dataset.attrs.modify(name='has_eval_neutral', value=True)
+
+ if verbose:
+ tr = trange(dataset_size, desc='Creating HDF5 dataset', file=sys.stdout)
+ else:
+ tr = range(dataset_size)
+
+ # Iterate over all images in the dataset.
+ for i in tr:
+
+ # Store the image.
+ with Image.open(self.filenames[i]) as image:
+
+ image = np.asarray(image, dtype=np.uint8)
+
+ # Make sure all images end up having three channels.
+ if image.ndim == 2:
+ image = np.stack([image] * 3, axis=-1)
+ elif image.ndim == 3:
+ if image.shape[2] == 1:
+ image = np.concatenate([image] * 3, axis=-1)
+ elif image.shape[2] == 4:
+ image = image[:,:,:3]
+
+ if resize:
+ image = cv2.resize(image, dsize=(resize[1], resize[0]))
+
+ # Flatten the image array and write it to the images dataset.
+ hdf5_images[i] = image.reshape(-1)
+ # Write the image's shape to the image shapes dataset.
+ hdf5_image_shapes[i] = image.shape
+
+ # Store the ground truth if we have any.
+ if not (self.labels is None):
+
+ labels = np.asarray(self.labels[i])
+ # Flatten the labels array and write it to the labels dataset.
+ hdf5_labels[i] = labels.reshape(-1)
+ # Write the labels' shape to the label shapes dataset.
+ hdf5_label_shapes[i] = labels.shape
+
+ # Store the image ID if we have one.
+ if not (self.image_ids is None):
+
+ hdf5_image_ids[i] = self.image_ids[i]
+
+ # Store the evaluation-neutrality annotations if we have any.
+ if not (self.eval_neutral is None):
+
+ hdf5_eval_neutral[i] = self.eval_neutral[i]
+
+ hdf5_dataset.close()
+ self.hdf5_dataset = h5py.File(file_path, 'r')
+ self.hdf5_dataset_path = file_path
+ self.dataset_size = len(self.hdf5_dataset['images'])
+ self.dataset_indices = np.arange(self.dataset_size, dtype=np.int32) # Instead of shuffling the HDF5 dataset, we will shuffle this index list.
+
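+    #     Illustrative usage sketch (not part of the original file); assumes a parsed
+    #     `DataGenerator` instance named `dataset` and a writable output path:
+    #
+    #         dataset.create_hdf5_dataset(file_path='coco_train.h5',
+    #                                     resize=False,
+    #                                     variable_image_size=True,
+    #                                     verbose=True)
+    #
+    #     The resulting file can later be loaded directly, which skips re-parsing
+    #     the annotations.
+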
+ def generate(self,
+ batch_size=32,
+ shuffle=True,
+ transformations=[],
+ label_encoder=None,
+ returns={'processed_images', 'encoded_labels'},
+ keep_images_without_gt=False,
+ degenerate_box_handling='remove'):
+ '''
+ Generates batches of samples and (optionally) corresponding labels indefinitely.
+
+ Can shuffle the samples consistently after each complete pass.
+
+ Optionally takes a list of arbitrary image transformations to apply to the
+ samples ad hoc.
+
+ Arguments:
+ batch_size (int, optional): The size of the batches to be generated.
+ shuffle (bool, optional): Whether or not to shuffle the dataset before each pass.
+ This option should always be `True` during training, but it can be useful to turn shuffling off
+ for debugging or if you're using the generator for prediction.
+ transformations (list, optional): A list of transformations that will be applied to the images and labels
+ in the given order. Each transformation is a callable that takes as input an image (as a Numpy array)
+ and optionally labels (also as a Numpy array) and returns an image and optionally labels in the same
+ format.
+ label_encoder (callable, optional): Only relevant if labels are given. A callable that takes as input the
+ labels of a batch (as a list of Numpy arrays) and returns some structure that represents those labels.
+ The general use case for this is to convert labels from their input format to a format that a given object
+ detection model needs as its training targets.
+ returns (set, optional): A set of strings that determines what outputs the generator yields. The generator's output
+ is always a tuple that contains the outputs specified in this set and only those. If an output is not available,
+ it will be `None`. The output tuple can contain the following outputs according to the specified keyword strings:
+ * 'processed_images': An array containing the processed images. Will always be in the outputs, so it doesn't
+ matter whether or not you include this keyword in the set.
+ * 'encoded_labels': The encoded labels tensor. Will always be in the outputs if a label encoder is given,
+ so it doesn't matter whether or not you include this keyword in the set if you pass a label encoder.
+                * 'matched_anchors': Only available if `label_encoder` is an `SSDInputEncoder` object. The same as 'encoded_labels',
+ but containing anchor box coordinates for all matched anchor boxes instead of ground truth coordinates.
+ This can be useful to visualize what anchor boxes are being matched to each ground truth box. Only available
+ in training mode.
+ * 'processed_labels': The processed, but not yet encoded labels. This is a list that contains for each
+ batch image a Numpy array with all ground truth boxes for that image. Only available if ground truth is available.
+ * 'filenames': A list containing the file names (full paths) of the images in the batch.
+ * 'image_ids': A list containing the integer IDs of the images in the batch. Only available if there
+ are image IDs available.
+ * 'evaluation-neutral': A nested list of lists of booleans. Each list contains `True` or `False` for every ground truth
+ bounding box of the respective image depending on whether that bounding box is supposed to be evaluation-neutral (`True`)
+ or not (`False`). May return `None` if there exists no such concept for a given dataset. An example for
+ evaluation-neutrality are the ground truth boxes annotated as "difficult" in the Pascal VOC datasets, which are
+ usually treated to be neutral in a model evaluation.
+ * 'inverse_transform': A nested list that contains a list of "inverter" functions for each item in the batch.
+ These inverter functions take (predicted) labels for an image as input and apply the inverse of the transformations
+ that were applied to the original image to them. This makes it possible to let the model make predictions on a
+ transformed image and then convert these predictions back to the original image. This is mostly relevant for
+ evaluation: If you want to evaluate your model on a dataset with varying image sizes, then you are forced to
+ transform the images somehow (e.g. by resizing or cropping) to make them all the same size. Your model will then
+ predict boxes for those transformed images, but for the evaluation you will need predictions with respect to the
+ original images, not with respect to the transformed images. This means you will have to transform the predicted
+ box coordinates back to the original image sizes. Note that for each image, the inverter functions for that
+ image need to be applied in the order in which they are given in the respective list for that image.
+ * 'original_images': A list containing the original images in the batch before any processing.
+ * 'original_labels': A list containing the original ground truth boxes for the images in this batch before any
+ processing. Only available if ground truth is available.
+ The order of the outputs in the tuple is the order of the list above. If `returns` contains a keyword for an
+                output that is unavailable, that output will be omitted from the yielded tuples and a warning will be raised.
+ keep_images_without_gt (bool, optional): If `False`, images for which there aren't any ground truth boxes before
+ any transformations have been applied will be removed from the batch. If `True`, such images will be kept
+ in the batch.
+ degenerate_box_handling (str, optional): How to handle degenerate boxes, which are boxes that have `xmax <= xmin` and/or
+ `ymax <= ymin`. Degenerate boxes can sometimes be in the dataset, or non-degenerate boxes can become degenerate
+ after they were processed by transformations. Note that the generator checks for degenerate boxes after all
+ transformations have been applied (if any), but before the labels were passed to the `label_encoder` (if one was given).
+ Can be one of 'warn' or 'remove'. If 'warn', the generator will merely print a warning to let you know that there
+ are degenerate boxes in a batch. If 'remove', the generator will remove degenerate boxes from the batch silently.
+
+ Yields:
+ The next batch as a tuple of items as defined by the `returns` argument.
+ '''
+
+ if self.dataset_size == 0:
+ raise DatasetError("Cannot generate batches because you did not load a dataset.")
+
+ #############################################################################################
+ # Warn if any of the set returns aren't possible.
+ #############################################################################################
+
+ if self.labels is None:
+ if any([ret in returns for ret in ['original_labels', 'processed_labels', 'encoded_labels', 'matched_anchors', 'evaluation-neutral']]):
+ warnings.warn("Since no labels were given, none of 'original_labels', 'processed_labels', 'evaluation-neutral', 'encoded_labels', and 'matched_anchors' " +
+ "are possible returns, but you set `returns = {}`. The impossible returns will be `None`.".format(returns))
+ elif label_encoder is None:
+ if any([ret in returns for ret in ['encoded_labels', 'matched_anchors']]):
+ warnings.warn("Since no label encoder was given, 'encoded_labels' and 'matched_anchors' aren't possible returns, " +
+ "but you set `returns = {}`. The impossible returns will be `None`.".format(returns))
+ elif not isinstance(label_encoder, SSDInputEncoder):
+ if 'matched_anchors' in returns:
+ warnings.warn("`label_encoder` is not an `SSDInputEncoder` object, therefore 'matched_anchors' is not a possible return, " +
+ "but you set `returns = {}`. The impossible returns will be `None`.".format(returns))
+
+ #############################################################################################
+ # Do a few preparatory things like maybe shuffling the dataset initially.
+ #############################################################################################
+
+ if shuffle:
+ objects_to_shuffle = [self.dataset_indices]
+ if not (self.filenames is None):
+ objects_to_shuffle.append(self.filenames)
+ if not (self.labels is None):
+ objects_to_shuffle.append(self.labels)
+ if not (self.image_ids is None):
+ objects_to_shuffle.append(self.image_ids)
+ if not (self.eval_neutral is None):
+ objects_to_shuffle.append(self.eval_neutral)
+ shuffled_objects = sklearn.utils.shuffle(*objects_to_shuffle)
+ for i in range(len(objects_to_shuffle)):
+ objects_to_shuffle[i][:] = shuffled_objects[i]
+
+ if degenerate_box_handling == 'remove':
+ box_filter = BoxFilter(check_overlap=False,
+ check_min_area=False,
+ check_degenerate=True,
+ labels_format=self.labels_format)
+
+ # Override the labels formats of all the transformations to make sure they are set correctly.
+ if not (self.labels is None):
+ for transform in transformations:
+ transform.labels_format = self.labels_format
+
+ #############################################################################################
+ # Generate mini batches.
+ #############################################################################################
+
+ current = 0
+
+ while True:
+
+ batch_X, batch_y = [], []
+
+ if current >= self.dataset_size:
+ current = 0
+
+ #########################################################################################
+ # Maybe shuffle the dataset if a full pass over the dataset has finished.
+ #########################################################################################
+
+ if shuffle:
+ objects_to_shuffle = [self.dataset_indices]
+ if not (self.filenames is None):
+ objects_to_shuffle.append(self.filenames)
+ if not (self.labels is None):
+ objects_to_shuffle.append(self.labels)
+ if not (self.image_ids is None):
+ objects_to_shuffle.append(self.image_ids)
+ if not (self.eval_neutral is None):
+ objects_to_shuffle.append(self.eval_neutral)
+ shuffled_objects = sklearn.utils.shuffle(*objects_to_shuffle)
+ for i in range(len(objects_to_shuffle)):
+ objects_to_shuffle[i][:] = shuffled_objects[i]
+
+ #########################################################################################
+ # Get the images, (maybe) image IDs, (maybe) labels, etc. for this batch.
+ #########################################################################################
+
+ # We prioritize our options in the following order:
+ # 1) If we have the images already loaded in memory, get them from there.
+ # 2) Else, if we have an HDF5 dataset, get the images from there.
+ # 3) Else, if we have neither of the above, we'll have to load the individual image
+ # files from disk.
+ batch_indices = self.dataset_indices[current:current+batch_size]
+ if not (self.images is None):
+ for i in batch_indices:
+ batch_X.append(self.images[i])
+ if not (self.filenames is None):
+ batch_filenames = self.filenames[current:current+batch_size]
+ else:
+ batch_filenames = None
+ elif not (self.hdf5_dataset is None):
+ for i in batch_indices:
+ batch_X.append(self.hdf5_dataset['images'][i].reshape(self.hdf5_dataset['image_shapes'][i]))
+ if not (self.filenames is None):
+ batch_filenames = self.filenames[current:current+batch_size]
+ else:
+ batch_filenames = None
+ else:
+ batch_filenames = self.filenames[current:current+batch_size]
+ for filename in batch_filenames:
+ with Image.open(filename) as image:
+ batch_X.append(np.array(image, dtype=np.uint8))
+
+ # Get the labels for this batch (if there are any).
+ if not (self.labels is None):
+ batch_y = deepcopy(self.labels[current:current+batch_size])
+ else:
+ batch_y = None
+
+ if not (self.eval_neutral is None):
+ batch_eval_neutral = self.eval_neutral[current:current+batch_size]
+ else:
+ batch_eval_neutral = None
+
+ # Get the image IDs for this batch (if there are any).
+ if not (self.image_ids is None):
+ batch_image_ids = self.image_ids[current:current+batch_size]
+ else:
+ batch_image_ids = None
+
+ if 'original_images' in returns:
+ batch_original_images = deepcopy(batch_X) # The original, unaltered images
+ if 'original_labels' in returns:
+ batch_original_labels = deepcopy(batch_y) # The original, unaltered labels
+
+ current += batch_size
+
+ #########################################################################################
+ # Maybe perform image transformations.
+ #########################################################################################
+
+ batch_items_to_remove = [] # In case we need to remove any images from the batch, store their indices in this list.
+ batch_inverse_transforms = []
+
+ for i in range(len(batch_X)):
+
+ if not (self.labels is None):
+ # Convert the labels for this image to an array (in case they aren't already).
+ batch_y[i] = np.array(batch_y[i])
+ # If this image has no ground truth boxes, maybe we don't want to keep it in the batch.
+ if (batch_y[i].size == 0) and not keep_images_without_gt:
+ batch_items_to_remove.append(i)
+ batch_inverse_transforms.append([])
+ continue
+
+ # Apply any image transformations we may have received.
+ if transformations:
+
+ inverse_transforms = []
+
+ for transform in transformations:
+
+ if not (self.labels is None):
+
+ if ('inverse_transform' in returns) and ('return_inverter' in inspect.signature(transform).parameters):
+ batch_X[i], batch_y[i], inverse_transform = transform(batch_X[i], batch_y[i], return_inverter=True)
+ inverse_transforms.append(inverse_transform)
+ else:
+ batch_X[i], batch_y[i] = transform(batch_X[i], batch_y[i])
+
+ if batch_X[i] is None: # In case the transform failed to produce an output image, which is possible for some random transforms.
+ batch_items_to_remove.append(i)
+ batch_inverse_transforms.append([])
+ continue
+
+ else:
+
+ if ('inverse_transform' in returns) and ('return_inverter' in inspect.signature(transform).parameters):
+ batch_X[i], inverse_transform = transform(batch_X[i], return_inverter=True)
+ inverse_transforms.append(inverse_transform)
+ else:
+ batch_X[i] = transform(batch_X[i])
+
+ batch_inverse_transforms.append(inverse_transforms[::-1])
+
+ #########################################################################################
+ # Check for degenerate boxes in this batch item.
+ #########################################################################################
+
+ if not (self.labels is None):
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ if np.any(batch_y[i][:,xmax] - batch_y[i][:,xmin] <= 0) or np.any(batch_y[i][:,ymax] - batch_y[i][:,ymin] <= 0):
+ if degenerate_box_handling == 'warn':
+ warnings.warn("Detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, batch_y[i]) +
+ "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. " +
+ "This could mean that your dataset contains degenerate ground truth boxes, or that any image transformations you may apply might " +
+                                      "result in degenerate ground truth boxes, or that you are parsing the ground truth in the wrong coordinate format. " +
+ "Degenerate ground truth bounding boxes may lead to NaN errors during the training.")
+ elif degenerate_box_handling == 'remove':
+ batch_y[i] = box_filter(batch_y[i])
+ if (batch_y[i].size == 0) and not keep_images_without_gt:
+ batch_items_to_remove.append(i)
+
+ #########################################################################################
+ # Remove any items we might not want to keep from the batch.
+ #########################################################################################
+
+ if batch_items_to_remove:
+ for j in sorted(batch_items_to_remove, reverse=True):
+ # This isn't efficient, but it hopefully shouldn't need to be done often anyway.
+ batch_X.pop(j)
+ batch_filenames.pop(j)
+ if batch_inverse_transforms: batch_inverse_transforms.pop(j)
+ if not (self.labels is None): batch_y.pop(j)
+ if not (self.image_ids is None): batch_image_ids.pop(j)
+ if not (self.eval_neutral is None): batch_eval_neutral.pop(j)
+ if 'original_images' in returns: batch_original_images.pop(j)
+ if 'original_labels' in returns and not (self.labels is None): batch_original_labels.pop(j)
+
+ #########################################################################################
+
+ # CAUTION: Converting `batch_X` into an array will result in an empty batch if the images have varying sizes
+ # or varying numbers of channels. At this point, all images must have the same size and the same
+ # number of channels.
+ batch_X = np.array(batch_X)
+ if (batch_X.size == 0):
+ raise DegenerateBatchError("You produced an empty batch. This might be because the images in the batch vary " +
+ "in their size and/or number of channels. Note that after all transformations " +
+ "(if any were given) have been applied to all images in the batch, all images " +
+                                           "must be homogeneous in size along all axes.")
+
+ #########################################################################################
+ # If we have a label encoder, encode our labels.
+ #########################################################################################
+
+ if not (label_encoder is None or self.labels is None):
+
+ if ('matched_anchors' in returns) and isinstance(label_encoder, SSDInputEncoder):
+ batch_y_encoded, batch_matched_anchors = label_encoder(batch_y, diagnostics=True)
+ else:
+ batch_y_encoded = label_encoder(batch_y, diagnostics=False)
+ batch_matched_anchors = None
+
+ else:
+ batch_y_encoded = None
+ batch_matched_anchors = None
+
+ #########################################################################################
+ # Compose the output.
+ #########################################################################################
+
+ ret = []
+ if 'processed_images' in returns: ret.append(batch_X)
+ if 'encoded_labels' in returns: ret.append(batch_y_encoded)
+ if 'matched_anchors' in returns: ret.append(batch_matched_anchors)
+ if 'processed_labels' in returns: ret.append(batch_y)
+ if 'filenames' in returns: ret.append(batch_filenames)
+ if 'image_ids' in returns: ret.append(batch_image_ids)
+ if 'evaluation-neutral' in returns: ret.append(batch_eval_neutral)
+ if 'inverse_transform' in returns: ret.append(batch_inverse_transforms)
+ if 'original_images' in returns: ret.append(batch_original_images)
+ if 'original_labels' in returns: ret.append(batch_original_labels)
+
+ yield ret
+
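+    #     Illustrative usage sketch (not part of the original file); assumes a loaded
+    #     `DataGenerator` instance named `dataset` and an `SSDInputEncoder` instance
+    #     named `ssd_input_encoder` (hypothetical name):
+    #
+    #         generator = dataset.generate(batch_size=32,
+    #                                      shuffle=True,
+    #                                      transformations=[],
+    #                                      label_encoder=ssd_input_encoder,
+    #                                      returns={'processed_images', 'encoded_labels'},
+    #                                      keep_images_without_gt=False)
+    #         batch_images, batch_labels = next(generator)
+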
+ def save_dataset(self,
+ filenames_path='filenames.pkl',
+ labels_path=None,
+ image_ids_path=None,
+ eval_neutral_path=None):
+ '''
+ Writes the current `filenames`, `labels`, and `image_ids` lists to the specified files.
+ This is particularly useful for large datasets with annotations that are
+ parsed from XML files, which can take quite long. If you'll be using the
+ same dataset repeatedly, you don't want to have to parse the XML label
+ files every time.
+
+ Arguments:
+ filenames_path (str): The path under which to save the filenames pickle.
+ labels_path (str): The path under which to save the labels pickle.
+ image_ids_path (str, optional): The path under which to save the image IDs pickle.
+ eval_neutral_path (str, optional): The path under which to save the pickle for
+ the evaluation-neutrality annotations.
+ '''
+ with open(filenames_path, 'wb') as f:
+ pickle.dump(self.filenames, f)
+ if not labels_path is None:
+ with open(labels_path, 'wb') as f:
+ pickle.dump(self.labels, f)
+ if not image_ids_path is None:
+ with open(image_ids_path, 'wb') as f:
+ pickle.dump(self.image_ids, f)
+ if not eval_neutral_path is None:
+ with open(eval_neutral_path, 'wb') as f:
+ pickle.dump(self.eval_neutral, f)
+
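+    #     Illustrative usage sketch (not part of the original file); the pickles written
+    #     here can be reloaded later to avoid re-parsing the annotations:
+    #
+    #         dataset.save_dataset(filenames_path='filenames.pkl',
+    #                              labels_path='labels.pkl',
+    #                              image_ids_path='image_ids.pkl',
+    #                              eval_neutral_path='eval_neutral.pkl')
+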
+ def get_dataset(self):
+ '''
+ Returns:
+ 4-tuple containing lists and/or `None` for the filenames, labels, image IDs,
+ and evaluation-neutrality annotations.
+ '''
+ return self.filenames, self.labels, self.image_ids, self.eval_neutral
+
+ def get_dataset_size(self):
+ '''
+ Returns:
+ The number of images in the dataset.
+ '''
+ return self.dataset_size
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_geometric_ops.py b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_geometric_ops.py
new file mode 100644
index 0000000..db898ce
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_geometric_ops.py
@@ -0,0 +1,782 @@
+'''
+Various geometric image transformations for 2D object detection, both deterministic
+and probabilistic.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+
+import numpy as np
+import random
+
+import cv2
+
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator
+
+
+class Resize:
+ '''
+ Resizes images to a specified height and width in pixels.
+ '''
+
+ def __init__(self,
+ height,
+ width,
+ interpolation_mode=cv2.INTER_LINEAR,
+ box_filter=None,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ height (int): The desired height of the output images in pixels.
+ width (int): The desired width of the output images in pixels.
+ interpolation_mode (int, optional): An integer that denotes a valid
+ OpenCV interpolation mode. For example, integers 0 through 5 are
+ valid interpolation modes.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ if not (isinstance(box_filter, BoxFilter) or box_filter is None):
+ raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.")
+ self.out_height = height
+ self.out_width = width
+ self.interpolation_mode = interpolation_mode
+ self.box_filter = box_filter
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ image = cv2.resize(image,
+ dsize=(self.out_width, self.out_height),
+ interpolation=self.interpolation_mode)
+
+ if return_inverter:
+ def inverter(labels):
+ labels = np.copy(labels)
+ labels[:, [ymin+1, ymax+1]] = np.round(labels[:, [ymin+1, ymax+1]] * (img_height / self.out_height), decimals=0)
+ labels[:, [xmin+1, xmax+1]] = np.round(labels[:, [xmin+1, xmax+1]] * (img_width / self.out_width), decimals=0)
+ return labels
+
+ if labels is None:
+ if return_inverter:
+ return image, inverter
+ else:
+ return image
+ else:
+ labels = np.copy(labels)
+ labels[:, [ymin, ymax]] = np.round(labels[:, [ymin, ymax]] * (self.out_height / img_height), decimals=0)
+ labels[:, [xmin, xmax]] = np.round(labels[:, [xmin, xmax]] * (self.out_width / img_width), decimals=0)
+
+ if not (self.box_filter is None):
+ self.box_filter.labels_format = self.labels_format
+ labels = self.box_filter(labels=labels,
+ image_height=self.out_height,
+ image_width=self.out_width)
+
+ if return_inverter:
+ return image, labels, inverter
+ else:
+ return image, labels
+
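+# Illustrative usage sketch (not part of the original file); assumes `image` is an
+# HxWx3 uint8 array and `labels` is an array of shape `(n_boxes, 5)` in the default
+# `labels_format`:
+#
+#     resize = Resize(height=300, width=300)
+#     resized_image, resized_labels = resize(image, labels)
+#     # With `return_inverter=True`, the additionally returned inverter function maps
+#     # box coordinates back to the original image size.
+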
+class ResizeRandomInterp:
+ '''
+    Resizes images to a specified height and width in pixels using a randomly
+ selected interpolation mode.
+ '''
+
+ def __init__(self,
+ height,
+ width,
+ interpolation_modes=[cv2.INTER_NEAREST,
+ cv2.INTER_LINEAR,
+ cv2.INTER_CUBIC,
+ cv2.INTER_AREA,
+ cv2.INTER_LANCZOS4],
+ box_filter=None,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ height (int): The desired height of the output image in pixels.
+ width (int): The desired width of the output image in pixels.
+ interpolation_modes (list/tuple, optional): A list/tuple of integers
+ that represent valid OpenCV interpolation modes. For example,
+ integers 0 through 5 are valid interpolation modes.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ if not (isinstance(interpolation_modes, (list, tuple))):
+            raise ValueError("`interpolation_modes` must be a list or tuple.")
+ self.height = height
+ self.width = width
+ self.interpolation_modes = interpolation_modes
+ self.box_filter = box_filter
+ self.labels_format = labels_format
+ self.resize = Resize(height=self.height,
+ width=self.width,
+ box_filter=self.box_filter,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+ self.resize.interpolation_mode = np.random.choice(self.interpolation_modes)
+ self.resize.labels_format = self.labels_format
+ return self.resize(image, labels, return_inverter)
+
+class Flip:
+ '''
+ Flips images horizontally or vertically.
+ '''
+ def __init__(self,
+ dim='horizontal',
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dim (str, optional): Can be either of 'horizontal' and 'vertical'.
+ If 'horizontal', images will be flipped horizontally, i.e. along
+                the vertical axis. If 'vertical', images will be flipped vertically,
+ i.e. along the horizontal axis.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+        if not (dim in {'horizontal', 'vertical'}): raise ValueError("`dim` must be one of 'horizontal' or 'vertical'.")
+ self.dim = dim
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ if self.dim == 'horizontal':
+ image = image[:,::-1]
+ if labels is None:
+ return image
+ else:
+ labels = np.copy(labels)
+ labels[:, [xmin, xmax]] = img_width - labels[:, [xmax, xmin]]
+ return image, labels
+ else:
+ image = image[::-1]
+ if labels is None:
+ return image
+ else:
+ labels = np.copy(labels)
+ labels[:, [ymin, ymax]] = img_height - labels[:, [ymax, ymin]]
+ return image, labels
+
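+# Illustrative sketch of the horizontal-flip box arithmetic above (not part of the
+# original file): for an image of width `img_width`, a box `(xmin, xmax)` becomes
+# `(img_width - xmax, img_width - xmin)`, e.g.
+#
+#     flip = Flip(dim='horizontal')
+#     # image 100 px wide, one box spanning x in [10, 30] -> flipped to [70, 90]
+#     flipped_image, flipped_labels = flip(image, np.array([[1, 10, 20, 30, 40]]))
+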
+class RandomFlip:
+ '''
+ Randomly flips images horizontally or vertically. The randomness only refers
+ to whether or not the image will be flipped.
+ '''
+ def __init__(self,
+ dim='horizontal',
+ prob=0.5,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dim (str, optional): Can be either of 'horizontal' and 'vertical'.
+ If 'horizontal', images will be flipped horizontally, i.e. along
+                the vertical axis. If 'vertical', images will be flipped vertically,
+ i.e. along the horizontal axis.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ self.dim = dim
+ self.prob = prob
+ self.labels_format = labels_format
+ self.flip = Flip(dim=self.dim, labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ self.flip.labels_format = self.labels_format
+ return self.flip(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
+
+class Translate:
+ '''
+ Translates images horizontally and/or vertically.
+ '''
+
+ def __init__(self,
+ dy,
+ dx,
+ clip_boxes=True,
+ box_filter=None,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dy (float): The fraction of the image height by which to translate images along the
+ vertical axis. Positive values translate images downwards, negative values
+ translate images upwards.
+ dx (float): The fraction of the image width by which to translate images along the
+ horizontal axis. Positive values translate images to the right, negative values
+ translate images to the left.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+ image after the translation.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the
+ background pixels of the translated images.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ if not (isinstance(box_filter, BoxFilter) or box_filter is None):
+ raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.")
+ self.dy_rel = dy
+ self.dx_rel = dx
+ self.clip_boxes = clip_boxes
+ self.box_filter = box_filter
+ self.background = background
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None):
+
+ img_height, img_width = image.shape[:2]
+
+ # Compute the translation matrix.
+ dy_abs = int(round(img_height * self.dy_rel))
+ dx_abs = int(round(img_width * self.dx_rel))
+ M = np.float32([[1, 0, dx_abs],
+ [0, 1, dy_abs]])
+
+ # Translate the image.
+ image = cv2.warpAffine(image,
+ M=M,
+ dsize=(img_width, img_height),
+ borderMode=cv2.BORDER_CONSTANT,
+ borderValue=self.background)
+
+ if labels is None:
+ return image
+ else:
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ labels = np.copy(labels)
+ # Translate the box coordinates to the translated image's coordinate system.
+ labels[:,[xmin,xmax]] += dx_abs
+ labels[:,[ymin,ymax]] += dy_abs
+
+ # Compute all valid boxes for this patch.
+ if not (self.box_filter is None):
+ self.box_filter.labels_format = self.labels_format
+ labels = self.box_filter(labels=labels,
+ image_height=img_height,
+ image_width=img_width)
+
+ if self.clip_boxes:
+ labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=img_height-1)
+ labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=img_width-1)
+
+ return image, labels
+
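+# Illustrative usage sketch (not part of the original file); shifts an image right by
+# 10% of its width and down by 5% of its height, filling the border with black:
+#
+#     translate = Translate(dy=0.05, dx=0.1, clip_boxes=True, background=(0, 0, 0))
+#     translated_image, translated_labels = translate(image, labels)
+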
+class RandomTranslate:
+ '''
+ Randomly translates images horizontally and/or vertically.
+ '''
+
+ def __init__(self,
+ dy_minmax=(0.03,0.3),
+ dx_minmax=(0.03,0.3),
+ prob=0.5,
+ clip_boxes=True,
+ box_filter=None,
+ image_validator=None,
+ n_trials_max=3,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ dy_minmax (list/tuple, optional): A 2-tuple `(min, max)` of non-negative floats that
+ determines the minimum and maximum relative translation of images along the vertical
+ axis both upward and downward. That is, images will be randomly translated by at least
+ `min` and at most `max` either upward or downward. For example, if `dy_minmax == (0.05,0.3)`,
+ an image of size `(100,100)` will be translated by at least 5 and at most 30 pixels
+ either upward or downward. The translation direction is chosen randomly.
+ dx_minmax (list/tuple, optional): A 2-tuple `(min, max)` of non-negative floats that
+ determines the minimum and maximum relative translation of images along the horizontal
+ axis both to the left and right. That is, images will be randomly translated by at least
+ `min` and at most `max` either left or right. For example, if `dx_minmax == (0.05,0.3)`,
+ an image of size `(100,100)` will be translated by at least 5 and at most 30 pixels
+ either left or right. The translation direction is chosen randomly.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+ image after the translation.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given.
+ An `ImageValidator` object to determine whether a translated image is valid. If `None`,
+ any outcome is valid.
+ n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given.
+                Determines the maximal number of trials to produce a valid image. If no valid image could
+ be produced in `n_trials_max` trials, returns the unaltered input image.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the
+ background pixels of the translated images.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ if dy_minmax[0] > dy_minmax[1]:
+ raise ValueError("It must be `dy_minmax[0] <= dy_minmax[1]`.")
+ if dx_minmax[0] > dx_minmax[1]:
+ raise ValueError("It must be `dx_minmax[0] <= dx_minmax[1]`.")
+ if dy_minmax[0] < 0 or dx_minmax[0] < 0:
+ raise ValueError("It must be `dy_minmax[0] >= 0` and `dx_minmax[0] >= 0`.")
+ if not (isinstance(image_validator, ImageValidator) or image_validator is None):
+ raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.")
+ self.dy_minmax = dy_minmax
+ self.dx_minmax = dx_minmax
+ self.prob = prob
+ self.clip_boxes = clip_boxes
+ self.box_filter = box_filter
+ self.image_validator = image_validator
+ self.n_trials_max = n_trials_max
+ self.background = background
+ self.labels_format = labels_format
+ self.translate = Translate(dy=0,
+ dx=0,
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ background=self.background,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None):
+
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+
+ img_height, img_width = image.shape[:2]
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ # Override the preset labels format.
+ if not self.image_validator is None:
+ self.image_validator.labels_format = self.labels_format
+ self.translate.labels_format = self.labels_format
+
+ for _ in range(max(1, self.n_trials_max)):
+
+ # Pick the relative amount by which to translate.
+ dy_abs = np.random.uniform(self.dy_minmax[0], self.dy_minmax[1])
+ dx_abs = np.random.uniform(self.dx_minmax[0], self.dx_minmax[1])
+ # Pick the direction in which to translate.
+ dy = np.random.choice([-dy_abs, dy_abs])
+ dx = np.random.choice([-dx_abs, dx_abs])
+ self.translate.dy_rel = dy
+ self.translate.dx_rel = dx
+
+ if (labels is None) or (self.image_validator is None):
+ # We either don't have any boxes or if we do, we will accept any outcome as valid.
+ return self.translate(image, labels)
+ else:
+ # Translate the box coordinates to the translated image's coordinate system.
+ new_labels = np.copy(labels)
+ new_labels[:, [ymin, ymax]] += int(round(img_height * dy))
+ new_labels[:, [xmin, xmax]] += int(round(img_width * dx))
+
+ # Check if the patch is valid.
+ if self.image_validator(labels=new_labels,
+ image_height=img_height,
+ image_width=img_width):
+ return self.translate(image, labels)
+
+ # If all attempts failed, return the unaltered input image.
+ if labels is None:
+ return image
+
+ else:
+ return image, labels
+
+ elif labels is None:
+ return image
+
+ else:
+ return image, labels
+
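+# Illustrative usage sketch (not part of the original file); the translation amount and
+# direction are drawn at random, and an optional `ImageValidator` can veto outcomes
+# in which too little of the ground truth remains visible:
+#
+#     random_translate = RandomTranslate(dy_minmax=(0.03, 0.3),
+#                                        dx_minmax=(0.03, 0.3),
+#                                        prob=0.5)
+#     maybe_translated_image, maybe_translated_labels = random_translate(image, labels)
+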
+class Scale:
+ '''
+ Scales images, i.e. zooms in or out.
+ '''
+
+ def __init__(self,
+ factor,
+ clip_boxes=True,
+ box_filter=None,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ factor (float): The fraction of the image size by which to scale images. Must be positive.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+                image after the scaling.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential
+ background pixels of the scaled images.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ if factor <= 0:
+ raise ValueError("It must be `factor > 0`.")
+ if not (isinstance(box_filter, BoxFilter) or box_filter is None):
+ raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.")
+ self.factor = factor
+ self.clip_boxes = clip_boxes
+ self.box_filter = box_filter
+ self.background = background
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None):
+
+ img_height, img_width = image.shape[:2]
+
+ # Compute the rotation matrix.
+ M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2),
+ angle=0,
+ scale=self.factor)
+
+ # Scale the image.
+ image = cv2.warpAffine(image,
+ M=M,
+ dsize=(img_width, img_height),
+ borderMode=cv2.BORDER_CONSTANT,
+ borderValue=self.background)
+
+ if labels is None:
+ return image
+ else:
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ labels = np.copy(labels)
+ # Scale the bounding boxes accordingly.
+ # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`.
+ toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])])
+ bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])])
+ new_toplefts = (np.dot(M, toplefts)).T
+ new_bottomrights = (np.dot(M, bottomrights)).T
+ labels[:,[xmin,ymin]] = np.round(new_toplefts, decimals=0).astype(np.int)
+ labels[:,[xmax,ymax]] = np.round(new_bottomrights, decimals=0).astype(np.int)
+
+ # Compute all valid boxes for this patch.
+ if not (self.box_filter is None):
+ self.box_filter.labels_format = self.labels_format
+ labels = self.box_filter(labels=labels,
+ image_height=img_height,
+ image_width=img_width)
+
+ if self.clip_boxes:
+ labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=img_height-1)
+ labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=img_width-1)
+
+ return image, labels
+
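+# Worked note on the scaling above (not part of the original file): with `angle=0`,
+# `cv2.getRotationMatrix2D` reduces to a pure scaling about the image centre, i.e. a
+# point `(x, y)` maps to `(f*x + (1-f)*W/2, f*y + (1-f)*H/2)` for factor `f`, which is
+# exactly how the two opposite box corners are transformed. For example:
+#
+#     scale = Scale(factor=1.5, clip_boxes=True)
+#     zoomed_image, zoomed_labels = scale(image, labels)
+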
+class RandomScale:
+ '''
+ Randomly scales images.
+ '''
+
+ def __init__(self,
+ min_factor=0.5,
+ max_factor=1.5,
+ prob=0.5,
+ clip_boxes=True,
+ box_filter=None,
+ image_validator=None,
+ n_trials_max=3,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ min_factor (float, optional): The minimum fraction of the image size by which to scale images.
+ Must be positive.
+ max_factor (float, optional): The maximum fraction of the image size by which to scale images.
+ Must be positive.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+                image after the scaling.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given.
+ An `ImageValidator` object to determine whether a scaled image is valid. If `None`,
+ any outcome is valid.
+ n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given.
+                Determines the maximal number of trials to produce a valid image. If no valid image could
+ be produced in `n_trials_max` trials, returns the unaltered input image.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential
+ background pixels of the scaled images.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ if not (0 < min_factor <= max_factor):
+ raise ValueError("It must be `0 < min_factor <= max_factor`.")
+ if not (isinstance(image_validator, ImageValidator) or image_validator is None):
+ raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.")
+ self.min_factor = min_factor
+ self.max_factor = max_factor
+ self.prob = prob
+ self.clip_boxes = clip_boxes
+ self.box_filter = box_filter
+ self.image_validator = image_validator
+ self.n_trials_max = n_trials_max
+ self.background = background
+ self.labels_format = labels_format
+ self.scale = Scale(factor=1.0,
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ background=self.background,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None):
+
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+
+ img_height, img_width = image.shape[:2]
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ # Override the preset labels format.
+ if not self.image_validator is None:
+ self.image_validator.labels_format = self.labels_format
+ self.scale.labels_format = self.labels_format
+
+ for _ in range(max(1, self.n_trials_max)):
+
+ # Pick a scaling factor.
+ factor = np.random.uniform(self.min_factor, self.max_factor)
+ self.scale.factor = factor
+
+ if (labels is None) or (self.image_validator is None):
+ # We either don't have any boxes or if we do, we will accept any outcome as valid.
+ return self.scale(image, labels)
+ else:
+ # Scale the bounding boxes accordingly.
+ # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`.
+ toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])])
+ bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])])
+
+ # Compute the rotation matrix.
+ M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2),
+ angle=0,
+ scale=factor)
+
+ new_toplefts = (np.dot(M, toplefts)).T
+ new_bottomrights = (np.dot(M, bottomrights)).T
+
+ new_labels = np.copy(labels)
+ new_labels[:,[xmin,ymin]] = np.around(new_toplefts, decimals=0).astype(np.int)
+ new_labels[:,[xmax,ymax]] = np.around(new_bottomrights, decimals=0).astype(np.int)
+
+ # Check if the patch is valid.
+ if self.image_validator(labels=new_labels,
+ image_height=img_height,
+ image_width=img_width):
+ return self.scale(image, labels)
+
+ # If all attempts failed, return the unaltered input image.
+ if labels is None:
+ return image
+
+ else:
+ return image, labels
+
+ elif labels is None:
+ return image
+
+ else:
+ return image, labels
+
+class Rotate:
+ '''
+ Rotates images counter-clockwise by 90, 180, or 270 degrees.
+ '''
+
+ def __init__(self,
+ angle,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ angle (int): The angle in degrees by which to rotate the images counter-clockwise.
+ Only 90, 180, and 270 are valid values.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ if not angle in {90, 180, 270}:
+ raise ValueError("`angle` must be in the set {90, 180, 270}.")
+ self.angle = angle
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None):
+
+ img_height, img_width = image.shape[:2]
+
+ # Compute the rotation matrix.
+ M = cv2.getRotationMatrix2D(center=(img_width / 2, img_height / 2),
+ angle=self.angle,
+ scale=1)
+
+ # Get the sine and cosine from the rotation matrix.
+ cos_angle = np.abs(M[0, 0])
+ sin_angle = np.abs(M[0, 1])
+
+ # Compute the new bounding dimensions of the image.
+ img_width_new = int(img_height * sin_angle + img_width * cos_angle)
+ img_height_new = int(img_height * cos_angle + img_width * sin_angle)
+
+ # Adjust the rotation matrix to take into account the translation.
+ M[1, 2] += (img_height_new - img_height) / 2
+ M[0, 2] += (img_width_new - img_width) / 2
+
+ # Rotate the image.
+ image = cv2.warpAffine(image,
+ M=M,
+ dsize=(img_width_new, img_height_new))
+
+ if labels is None:
+ return image
+ else:
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ labels = np.copy(labels)
+ # Rotate the bounding boxes accordingly.
+ # Transform two opposite corner points of the rectangular boxes using the rotation matrix `M`.
+ toplefts = np.array([labels[:,xmin], labels[:,ymin], np.ones(labels.shape[0])])
+ bottomrights = np.array([labels[:,xmax], labels[:,ymax], np.ones(labels.shape[0])])
+ new_toplefts = (np.dot(M, toplefts)).T
+ new_bottomrights = (np.dot(M, bottomrights)).T
+            labels[:,[xmin,ymin]] = np.round(new_toplefts, decimals=0).astype(int)
+            labels[:,[xmax,ymax]] = np.round(new_bottomrights, decimals=0).astype(int)
+
+ if self.angle == 90:
+ # ymin and ymax were switched by the rotation.
+ labels[:,[ymax,ymin]] = labels[:,[ymin,ymax]]
+ elif self.angle == 180:
+ # ymin and ymax were switched by the rotation,
+ # and also xmin and xmax were switched.
+ labels[:,[ymax,ymin]] = labels[:,[ymin,ymax]]
+ labels[:,[xmax,xmin]] = labels[:,[xmin,xmax]]
+ elif self.angle == 270:
+ # xmin and xmax were switched by the rotation.
+ labels[:,[xmax,xmin]] = labels[:,[xmin,xmax]]
+
+ return image, labels
+
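+# A minimal usage sketch for `Rotate`: rotate a dummy image by 180 degrees and
+# adjust a single ground truth box. The image size, box values, and the helper
+# name `_demo_rotate` are made up for illustration only.
+def _demo_rotate():
+    image = np.zeros((100, 200, 3), dtype=np.uint8)
+    labels = np.array([[1, 10, 20, 60, 80]])  # [class_id, xmin, ymin, xmax, ymax]
+    rotate = Rotate(angle=180)
+    rotated_image, rotated_labels = rotate(image, labels)
+    return rotated_image, rotated_labels
+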
+class RandomRotate:
+ '''
+ Randomly rotates images counter-clockwise.
+ '''
+
+ def __init__(self,
+ angles=[90, 180, 270],
+ prob=0.5,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+            angles (list): The list of angles in degrees from which one is randomly selected to rotate
+ the images counter-clockwise. Only 90, 180, and 270 are valid values.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ for angle in angles:
+ if not angle in {90, 180, 270}:
+ raise ValueError("`angles` can only contain the values 90, 180, and 270.")
+ self.angles = angles
+ self.prob = prob
+ self.labels_format = labels_format
+ self.rotate = Rotate(angle=90, labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None):
+
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ # Pick a rotation angle.
+ self.rotate.angle = random.choice(self.angles)
+ self.rotate.labels_format = self.labels_format
+ return self.rotate(image, labels)
+
+ elif labels is None:
+ return image
+
+ else:
+ return image, labels
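+
+# A minimal usage sketch for `RandomRotate` with made-up values: with `prob=1.0`
+# one of the given angles is always applied. The helper name is illustrative only.
+def _demo_random_rotate():
+    image = np.zeros((100, 200, 3), dtype=np.uint8)
+    labels = np.array([[1, 10, 20, 60, 80]])  # [class_id, xmin, ymin, xmax, ymax]
+    random_rotate = RandomRotate(angles=[90, 270], prob=1.0)
+    return random_rotate(image, labels)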
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_image_boxes_validation_utils.py b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_image_boxes_validation_utils.py
new file mode 100644
index 0000000..3516749
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_image_boxes_validation_utils.py
@@ -0,0 +1,324 @@
+'''
+Utilities for 2D object detection related to answering the following questions:
+1. Given an image size and bounding boxes, which bounding boxes meet certain
+ requirements with respect to the image size?
+2. Given an image size and bounding boxes, is an image of that size valid with
+ respect to the bounding boxes according to certain requirements?
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+
+import numpy as np
+
+from engine.object_detection_branch.single_shot_detector.bounding_box_utils.bounding_box_utils import iou
+
+
+class BoundGenerator:
+ '''
+ Generates pairs of floating point values that represent lower and upper bounds
+ from a given sample space.
+ '''
+ def __init__(self,
+ sample_space=((0.1, None),
+ (0.3, None),
+ (0.5, None),
+ (0.7, None),
+ (0.9, None),
+ (None, None)),
+ weights=None):
+ '''
+ Arguments:
+ sample_space (list or tuple): A list, tuple, or array-like object of shape
+ `(n, 2)` that contains `n` samples to choose from, where each sample
+ is a 2-tuple of scalars and/or `None` values.
+ weights (list or tuple, optional): A list or tuple representing the distribution
+ over the sample space. If `None`, a uniform distribution will be assumed.
+ '''
+
+ if (not (weights is None)) and len(weights) != len(sample_space):
+ raise ValueError("`weights` must either be `None` for uniform distribution or have the same length as `sample_space`.")
+
+ self.sample_space = []
+ for bound_pair in sample_space:
+ if len(bound_pair) != 2:
+ raise ValueError("All elements of the sample space must be 2-tuples.")
+ bound_pair = list(bound_pair)
+ if bound_pair[0] is None: bound_pair[0] = 0.0
+ if bound_pair[1] is None: bound_pair[1] = 1.0
+ if bound_pair[0] > bound_pair[1]:
+ raise ValueError("For all sample space elements, the lower bound cannot be greater than the upper bound.")
+ self.sample_space.append(bound_pair)
+
+ self.sample_space_size = len(self.sample_space)
+
+ if weights is None:
+ self.weights = [1.0/self.sample_space_size] * self.sample_space_size
+ else:
+ self.weights = weights
+
+ def __call__(self):
+ '''
+ Returns:
+ An item of the sample space, i.e. a 2-tuple of scalars.
+ '''
+ i = np.random.choice(self.sample_space_size, p=self.weights)
+ return self.sample_space[i]
+
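+# A minimal usage sketch for `BoundGenerator` with a hypothetical sample space
+# and weights; `None` entries default to 0.0 (lower) and 1.0 (upper).
+def _demo_bound_generator():
+    bound_generator = BoundGenerator(sample_space=((0.1, None), (0.3, None), (None, None)),
+                                     weights=[0.4, 0.4, 0.2])
+    lower, upper = bound_generator()  # e.g. [0.3, 1.0]
+    return lower, upper
+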
+class BoxFilter:
+ '''
+    Returns all bounding boxes that are valid with respect to the defined criteria.
+ '''
+
+ def __init__(self,
+ check_overlap=True,
+ check_min_area=True,
+ check_degenerate=True,
+ overlap_criterion='center_point',
+ overlap_bounds=(0.3, 1.0),
+ min_area=16,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4},
+ border_pixels='half'):
+ '''
+ Arguments:
+ check_overlap (bool, optional): Whether or not to enforce the overlap requirements defined by
+ `overlap_criterion` and `overlap_bounds`. Sometimes you might want to use the box filter only
+ to enforce a certain minimum area for all boxes (see next argument), in such cases you can
+ turn the overlap requirements off.
+ check_min_area (bool, optional): Whether or not to enforce the minimum area requirement defined
+ by `min_area`. If `True`, any boxes that have an area (in pixels) that is smaller than `min_area`
+ will be removed from the labels of an image. Bounding boxes below a certain area aren't useful
+ training examples. An object that takes up only, say, 5 pixels in an image is probably not
+ recognizable anymore, neither for a human, nor for an object detection model. It makes sense
+ to remove such boxes.
+ check_degenerate (bool, optional): Whether or not to check for and remove degenerate bounding boxes.
+ Degenerate bounding boxes are boxes that have `xmax <= xmin` and/or `ymax <= ymin`. In particular,
+ boxes with a width and/or height of zero are degenerate. It is obviously important to filter out
+ such boxes, so you should only set this option to `False` if you are certain that degenerate
+ boxes are not possible in your data and processing chain.
+ overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines
+ which boxes are considered valid with respect to a given image. If set to 'center_point',
+ a given bounding box is considered valid if its center point lies within the image.
+ If set to 'area', a given bounding box is considered valid if the quotient of its intersection
+ area with the image and its own area is within the given `overlap_bounds`. If set to 'iou', a given
+ bounding box is considered valid if its IoU with the image is within the given `overlap_bounds`.
+ overlap_bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'.
+ Determines the lower and upper bounds for `overlap_criterion`. Can be either a 2-tuple of scalars
+ representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
+ the possibility to generate bounds randomly.
+ min_area (int, optional): Only relevant if `check_min_area` is `True`. Defines the minimum area in
+ pixels that a bounding box must have in order to be valid. Boxes with an area smaller than this
+ will be removed.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+ '''
+ if not isinstance(overlap_bounds, (list, tuple, BoundGenerator)):
+ raise ValueError("`overlap_bounds` must be either a 2-tuple of scalars or a `BoundGenerator` object.")
+ if isinstance(overlap_bounds, (list, tuple)) and (overlap_bounds[0] > overlap_bounds[1]):
+ raise ValueError("The lower bound must not be greater than the upper bound.")
+ if not (overlap_criterion in {'iou', 'area', 'center_point'}):
+ raise ValueError("`overlap_criterion` must be one of 'iou', 'area', or 'center_point'.")
+ self.overlap_criterion = overlap_criterion
+ self.overlap_bounds = overlap_bounds
+ self.min_area = min_area
+ self.check_overlap = check_overlap
+ self.check_min_area = check_min_area
+ self.check_degenerate = check_degenerate
+ self.labels_format = labels_format
+ self.border_pixels = border_pixels
+
+ def __call__(self,
+ labels,
+ image_height=None,
+ image_width=None):
+ '''
+ Arguments:
+ labels (array): The labels to be filtered. This is an array with shape `(m,n)`, where
+ `m` is the number of bounding boxes and `n` is the number of elements that defines
+ each bounding box (box coordinates, class ID, etc.). The box coordinates are expected
+ to be in the image's coordinate system.
+ image_height (int): Only relevant if `check_overlap == True`. The height of the image
+ (in pixels) to compare the box coordinates to.
+            image_width (int): Only relevant if `check_overlap == True`. The width of the image
+                (in pixels) to compare the box coordinates to.
+
+ Returns:
+ An array containing the labels of all boxes that are valid.
+ '''
+
+ labels = np.copy(labels)
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ # Record the boxes that pass all checks here.
+        requirements_met = np.ones(shape=labels.shape[0], dtype=bool)
+
+ if self.check_degenerate:
+
+ non_degenerate = (labels[:,xmax] > labels[:,xmin]) * (labels[:,ymax] > labels[:,ymin])
+ requirements_met *= non_degenerate
+
+ if self.check_min_area:
+
+ min_area_met = (labels[:,xmax] - labels[:,xmin]) * (labels[:,ymax] - labels[:,ymin]) >= self.min_area
+ requirements_met *= min_area_met
+
+ if self.check_overlap:
+
+ # Get the lower and upper bounds.
+ if isinstance(self.overlap_bounds, BoundGenerator):
+ lower, upper = self.overlap_bounds()
+ else:
+ lower, upper = self.overlap_bounds
+
+ # Compute which boxes are valid.
+
+ if self.overlap_criterion == 'iou':
+ # Compute the patch coordinates.
+ image_coords = np.array([0, 0, image_width, image_height])
+ # Compute the IoU between the patch and all of the ground truth boxes.
+ image_boxes_iou = iou(image_coords, labels[:, [xmin, ymin, xmax, ymax]], coords='corners', mode='element-wise', border_pixels=self.border_pixels)
+ requirements_met *= (image_boxes_iou > lower) * (image_boxes_iou <= upper)
+
+ elif self.overlap_criterion == 'area':
+ if self.border_pixels == 'half':
+ d = 0
+ elif self.border_pixels == 'include':
+ d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`.
+ elif self.border_pixels == 'exclude':
+ d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`.
+ # Compute the areas of the boxes.
+ box_areas = (labels[:,xmax] - labels[:,xmin] + d) * (labels[:,ymax] - labels[:,ymin] + d)
+ # Compute the intersection area between the patch and all of the ground truth boxes.
+ clipped_boxes = np.copy(labels)
+ clipped_boxes[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=image_height-1)
+ clipped_boxes[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=image_width-1)
+                intersection_areas = (clipped_boxes[:,xmax] - clipped_boxes[:,xmin] + d) * (clipped_boxes[:,ymax] - clipped_boxes[:,ymin] + d) # `d` accounts for the chosen border pixel convention (see `border_pixels`).
+ # Check which boxes meet the overlap requirements.
+ if lower == 0.0:
+ mask_lower = intersection_areas > lower * box_areas # If `self.lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign.
+ else:
+ mask_lower = intersection_areas >= lower * box_areas # Especially for the case `self.lower == 1` we want the ">=" sign, otherwise no boxes would count at all.
+ mask_upper = intersection_areas <= upper * box_areas
+ requirements_met *= mask_lower * mask_upper
+
+ elif self.overlap_criterion == 'center_point':
+ # Compute the center points of the boxes.
+ cy = (labels[:,ymin] + labels[:,ymax]) / 2
+ cx = (labels[:,xmin] + labels[:,xmax]) / 2
+                # Check which of the boxes have their center point within the patch and remove those that don't.
+ requirements_met *= (cy >= 0.0) * (cy <= image_height-1) * (cx >= 0.0) * (cx <= image_width-1)
+
+ return labels[requirements_met]
+
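+# A minimal usage sketch for `BoxFilter` with made-up boxes: the degenerate box
+# and the box below `min_area` are removed, only the first box survives.
+def _demo_box_filter():
+    labels = np.array([[1, 10, 10, 60, 60],   # valid box
+                       [2, 30, 30, 30, 80],   # degenerate: xmax <= xmin
+                       [3,  5,  5,  8,  8]])  # area below `min_area`
+    box_filter = BoxFilter(check_overlap=False,
+                           check_min_area=True,
+                           check_degenerate=True,
+                           min_area=16)
+    return box_filter(labels)
+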
+class ImageValidator:
+ '''
+ Returns `True` if a given minimum number of bounding boxes meets given overlap
+ requirements with an image of a given height and width.
+ '''
+
+ def __init__(self,
+ overlap_criterion='center_point',
+ bounds=(0.3, 1.0),
+ n_boxes_min=1,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4},
+ border_pixels='half'):
+ '''
+ Arguments:
+ overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines
+ which boxes are considered valid with respect to a given image. If set to 'center_point',
+ a given bounding box is considered valid if its center point lies within the image.
+ If set to 'area', a given bounding box is considered valid if the quotient of its intersection
+ area with the image and its own area is within `lower` and `upper`. If set to 'iou', a given
+ bounding box is considered valid if its IoU with the image is within `lower` and `upper`.
+ bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'.
+ Determines the lower and upper bounds for `overlap_criterion`. Can be either a 2-tuple of scalars
+ representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
+ the possibility to generate bounds randomly.
+            n_boxes_min (int or str, optional): Either a positive integer or the string 'all'.
+ Determines the minimum number of boxes that must meet the `overlap_criterion` with respect to
+ an image of the given height and width in order for the image to be a valid image.
+ If set to 'all', an image is considered valid if all given boxes meet the `overlap_criterion`.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+ '''
+ if not ((isinstance(n_boxes_min, int) and n_boxes_min > 0) or n_boxes_min == 'all'):
+ raise ValueError("`n_boxes_min` must be a positive integer or 'all'.")
+ self.overlap_criterion = overlap_criterion
+ self.bounds = bounds
+ self.n_boxes_min = n_boxes_min
+ self.labels_format = labels_format
+ self.border_pixels = border_pixels
+ self.box_filter = BoxFilter(check_overlap=True,
+ check_min_area=False,
+ check_degenerate=False,
+ overlap_criterion=self.overlap_criterion,
+ overlap_bounds=self.bounds,
+ labels_format=self.labels_format,
+ border_pixels=self.border_pixels)
+
+ def __call__(self,
+ labels,
+ image_height,
+ image_width):
+ '''
+ Arguments:
+ labels (array): The labels to be tested. The box coordinates are expected
+ to be in the image's coordinate system.
+ image_height (int): The height of the image to compare the box coordinates to.
+ image_width (int): The width of the image to compare the box coordinates to.
+
+ Returns:
+            A boolean indicating whether an image of the given height and width is
+ valid with respect to the given bounding boxes.
+ '''
+
+ self.box_filter.overlap_bounds = self.bounds
+ self.box_filter.labels_format = self.labels_format
+
+ # Get all boxes that meet the overlap requirements.
+ valid_labels = self.box_filter(labels=labels,
+ image_height=image_height,
+ image_width=image_width)
+
+ # Check whether enough boxes meet the requirements.
+ if isinstance(self.n_boxes_min, int):
+ # The image is valid if at least `self.n_boxes_min` ground truth boxes meet the requirements.
+ if len(valid_labels) >= self.n_boxes_min:
+ return True
+ else:
+ return False
+ elif self.n_boxes_min == 'all':
+ # The image is valid if all ground truth boxes meet the requirements.
+ if len(valid_labels) == len(labels):
+ return True
+ else:
+ return False
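+
+# A minimal usage sketch for `ImageValidator` with made-up values: under the
+# 'center_point' criterion only the first box lies within a 300x300 patch,
+# which satisfies `n_boxes_min=1`, so the patch is considered valid.
+def _demo_image_validator():
+    labels = np.array([[1, 50, 40, 120, 160],
+                       [2, 400, 380, 480, 450]])  # center lies outside the patch
+    image_validator = ImageValidator(overlap_criterion='center_point', n_boxes_min=1)
+    return image_validator(labels=labels, image_height=300, image_width=300)  # True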
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_misc_utils.py b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_misc_utils.py
new file mode 100644
index 0000000..1a4397f
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_misc_utils.py
@@ -0,0 +1,73 @@
+'''
+Miscellaneous data generator utilities.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+def apply_inverse_transforms(y_pred_decoded, inverse_transforms):
+ '''
+ Takes a list or Numpy array of decoded predictions and applies a given list of
+ transforms to them. The list of inverse transforms would usually contain the
+ inverter functions that some of the image transformations that come with this
+ data generator return. This function would normally be used to transform predictions
+ that were made on a transformed image back to the original image.
+
+ Arguments:
+ y_pred_decoded (list or array): Either a list of length `batch_size` that
+ contains Numpy arrays that contain the predictions for each batch item
+ or a Numpy array. If this is a list of Numpy arrays, the arrays would
+ usually have the shape `(num_predictions, 6)`, where `num_predictions`
+ is different for each batch item. If this is a Numpy array, it would
+ usually have the shape `(batch_size, num_predictions, 6)`. The last axis
+ would usually contain the class ID, confidence score, and four bounding
+ box coordinates for each prediction.
+        inverse_transforms (list): A nested list of length `batch_size` that contains
+ for each batch item a list of functions that take one argument (one element
+ of `y_pred_decoded` if it is a list or one slice along the first axis of
+ `y_pred_decoded` if it is an array) and return an output of the same shape
+ and data type.
+
+ Returns:
+ The transformed predictions, which have the same structure as `y_pred_decoded`.
+ '''
+
+ if isinstance(y_pred_decoded, list):
+
+ y_pred_decoded_inv = []
+
+ for i in range(len(y_pred_decoded)):
+ y_pred_decoded_inv.append(np.copy(y_pred_decoded[i]))
+ if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
+ for inverter in inverse_transforms[i]:
+ if not (inverter is None):
+ y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
+
+ elif isinstance(y_pred_decoded, np.ndarray):
+
+ y_pred_decoded_inv = np.copy(y_pred_decoded)
+
+ for i in range(len(y_pred_decoded)):
+ if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
+ for inverter in inverse_transforms[i]:
+ if not (inverter is None):
+ y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
+
+ else:
+ raise ValueError("`y_pred_decoded` must be either a list or a Numpy array.")
+
+ return y_pred_decoded_inv
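+
+# A minimal usage sketch for `apply_inverse_transforms` with hypothetical decoded
+# predictions of the form [class_id, confidence, xmin, ymin, xmax, ymax] and a
+# hand-written inverter that undoes a horizontal shift of 10 pixels. In practice
+# the inverters are returned by the image transformations themselves.
+def _demo_apply_inverse_transforms():
+    y_pred_decoded = [np.array([[1, 0.9, 30., 40., 80., 90.]]),
+                      np.array([])]  # the second batch item has no predictions
+    def undo_shift(preds):
+        preds = np.copy(preds)
+        preds[:, [2, 4]] += 10  # shift xmin and xmax back
+        return preds
+    inverse_transforms = [[undo_shift], [undo_shift]]
+    return apply_inverse_transforms(y_pred_decoded, inverse_transforms)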
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_patch_sampling_ops.py b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_patch_sampling_ops.py
new file mode 100644
index 0000000..bec7002
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_patch_sampling_ops.py
@@ -0,0 +1,881 @@
+'''
+Various patch sampling operations for data augmentation in 2D object detection.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator
+
+class PatchCoordinateGenerator:
+ '''
+ Generates random patch coordinates that meet specified requirements.
+ '''
+
+ def __init__(self,
+ img_height=None,
+ img_width=None,
+ must_match='h_w',
+ min_scale=0.3,
+ max_scale=1.0,
+ scale_uniformly=False,
+ min_aspect_ratio = 0.5,
+ max_aspect_ratio = 2.0,
+ patch_ymin=None,
+ patch_xmin=None,
+ patch_height=None,
+ patch_width=None,
+ patch_aspect_ratio=None):
+ '''
+ Arguments:
+ img_height (int): The height of the image for which the patch coordinates
+ shall be generated. Doesn't have to be known upon construction.
+ img_width (int): The width of the image for which the patch coordinates
+ shall be generated. Doesn't have to be known upon construction.
+ must_match (str, optional): Can be either of 'h_w', 'h_ar', and 'w_ar'.
+ Specifies which two of the three quantities height, width, and aspect
+ ratio determine the shape of the generated patch. The respective third
+ quantity will be computed from the other two. For example,
+ if `must_match == 'h_w'`, then the patch's height and width will be
+ set to lie within [min_scale, max_scale] of the image size or to
+ `patch_height` and/or `patch_width`, if given. The patch's aspect ratio
+ is the dependent variable in this case, it will be computed from the
+ height and width. Any given values for `patch_aspect_ratio`,
+ `min_aspect_ratio`, or `max_aspect_ratio` will be ignored.
+ min_scale (float, optional): The minimum size of a dimension of the patch
+ as a fraction of the respective dimension of the image. Can be greater
+ than 1. For example, if the image width is 200 and `min_scale == 0.5`,
+ then the width of the generated patch will be at least 100. If `min_scale == 1.5`,
+ the width of the generated patch will be at least 300.
+ max_scale (float, optional): The maximum size of a dimension of the patch
+ as a fraction of the respective dimension of the image. Can be greater
+ than 1. For example, if the image width is 200 and `max_scale == 1.0`,
+ then the width of the generated patch will be at most 200. If `max_scale == 1.5`,
+ the width of the generated patch will be at most 300. Must be greater than
+ `min_scale`.
+ scale_uniformly (bool, optional): If `True` and if `must_match == 'h_w'`,
+ the patch height and width will be scaled uniformly, otherwise they will
+ be scaled independently.
+ min_aspect_ratio (float, optional): Determines the minimum aspect ratio
+ for the generated patches.
+ max_aspect_ratio (float, optional): Determines the maximum aspect ratio
+ for the generated patches.
+ patch_ymin (int, optional): `None` or the vertical coordinate of the top left
+ corner of the generated patches. If this is not `None`, the position of the
+ patches along the vertical axis is fixed. If this is `None`, then the
+ vertical position of generated patches will be chosen randomly such that
+ the overlap of a patch and the image along the vertical dimension is
+ always maximal.
+ patch_xmin (int, optional): `None` or the horizontal coordinate of the top left
+ corner of the generated patches. If this is not `None`, the position of the
+ patches along the horizontal axis is fixed. If this is `None`, then the
+ horizontal position of generated patches will be chosen randomly such that
+ the overlap of a patch and the image along the horizontal dimension is
+ always maximal.
+ patch_height (int, optional): `None` or the fixed height of the generated patches.
+ patch_width (int, optional): `None` or the fixed width of the generated patches.
+ patch_aspect_ratio (float, optional): `None` or the fixed aspect ratio of the
+ generated patches.
+ '''
+
+ if not (must_match in {'h_w', 'h_ar', 'w_ar'}):
+            raise ValueError("`must_match` must be one of 'h_w', 'h_ar', or 'w_ar'.")
+ if min_scale >= max_scale:
+ raise ValueError("It must be `min_scale < max_scale`.")
+ if min_aspect_ratio >= max_aspect_ratio:
+ raise ValueError("It must be `min_aspect_ratio < max_aspect_ratio`.")
+ if scale_uniformly and not ((patch_height is None) and (patch_width is None)):
+ raise ValueError("If `scale_uniformly == True`, `patch_height` and `patch_width` must both be `None`.")
+ self.img_height = img_height
+ self.img_width = img_width
+ self.must_match = must_match
+ self.min_scale = min_scale
+ self.max_scale = max_scale
+ self.scale_uniformly = scale_uniformly
+ self.min_aspect_ratio = min_aspect_ratio
+ self.max_aspect_ratio = max_aspect_ratio
+ self.patch_ymin = patch_ymin
+ self.patch_xmin = patch_xmin
+ self.patch_height = patch_height
+ self.patch_width = patch_width
+ self.patch_aspect_ratio = patch_aspect_ratio
+
+ def __call__(self):
+ '''
+ Returns:
+ A 4-tuple `(ymin, xmin, height, width)` that represents the coordinates
+ of the generated patch.
+ '''
+
+ # Get the patch height and width.
+
+ if self.must_match == 'h_w': # Aspect is the dependent variable.
+ if not self.scale_uniformly:
+ # Get the height.
+ if self.patch_height is None:
+ patch_height = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_height)
+ else:
+ patch_height = self.patch_height
+ # Get the width.
+ if self.patch_width is None:
+ patch_width = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_width)
+ else:
+ patch_width = self.patch_width
+ else:
+ scaling_factor = np.random.uniform(self.min_scale, self.max_scale)
+ patch_height = int(scaling_factor * self.img_height)
+ patch_width = int(scaling_factor * self.img_width)
+
+ elif self.must_match == 'h_ar': # Width is the dependent variable.
+ # Get the height.
+ if self.patch_height is None:
+ patch_height = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_height)
+ else:
+ patch_height = self.patch_height
+ # Get the aspect ratio.
+ if self.patch_aspect_ratio is None:
+ patch_aspect_ratio = np.random.uniform(self.min_aspect_ratio, self.max_aspect_ratio)
+ else:
+ patch_aspect_ratio = self.patch_aspect_ratio
+ # Get the width.
+ patch_width = int(patch_height * patch_aspect_ratio)
+
+ elif self.must_match == 'w_ar': # Height is the dependent variable.
+ # Get the width.
+ if self.patch_width is None:
+ patch_width = int(np.random.uniform(self.min_scale, self.max_scale) * self.img_width)
+ else:
+ patch_width = self.patch_width
+ # Get the aspect ratio.
+ if self.patch_aspect_ratio is None:
+ patch_aspect_ratio = np.random.uniform(self.min_aspect_ratio, self.max_aspect_ratio)
+ else:
+ patch_aspect_ratio = self.patch_aspect_ratio
+ # Get the height.
+ patch_height = int(patch_width / patch_aspect_ratio)
+
+ # Get the top left corner coordinates of the patch.
+
+ if self.patch_ymin is None:
+ # Compute how much room we have along the vertical axis to place the patch.
+ # A negative number here means that we want to sample a patch that is larger than the original image
+ # in the vertical dimension, in which case the patch will be placed such that it fully contains the
+ # image in the vertical dimension.
+ y_range = self.img_height - patch_height
+ # Select a random top left corner for the sample position from the possible positions.
+ if y_range >= 0: patch_ymin = np.random.randint(0, y_range + 1) # There are y_range + 1 possible positions for the crop in the vertical dimension.
+ else: patch_ymin = np.random.randint(y_range, 1) # The possible positions for the image on the background canvas in the vertical dimension.
+ else:
+ patch_ymin = self.patch_ymin
+
+ if self.patch_xmin is None:
+ # Compute how much room we have along the horizontal axis to place the patch.
+ # A negative number here means that we want to sample a patch that is larger than the original image
+ # in the horizontal dimension, in which case the patch will be placed such that it fully contains the
+ # image in the horizontal dimension.
+ x_range = self.img_width - patch_width
+ # Select a random top left corner for the sample position from the possible positions.
+ if x_range >= 0: patch_xmin = np.random.randint(0, x_range + 1) # There are x_range + 1 possible positions for the crop in the horizontal dimension.
+ else: patch_xmin = np.random.randint(x_range, 1) # The possible positions for the image on the background canvas in the horizontal dimension.
+ else:
+ patch_xmin = self.patch_xmin
+
+ return (patch_ymin, patch_xmin, patch_height, patch_width)
+
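+# A minimal usage sketch for `PatchCoordinateGenerator` with made-up image
+# dimensions: height and width are each drawn as 50%-100% of the image size.
+def _demo_patch_coord_generator():
+    patch_coord_generator = PatchCoordinateGenerator(img_height=300,
+                                                     img_width=400,
+                                                     must_match='h_w',
+                                                     min_scale=0.5,
+                                                     max_scale=1.0)
+    patch_ymin, patch_xmin, patch_height, patch_width = patch_coord_generator()
+    return patch_ymin, patch_xmin, patch_height, patch_width
+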
+class CropPad:
+ '''
+ Crops and/or pads an image deterministically.
+
+ Depending on the given output patch size and the position (top left corner) relative
+ to the input image, the image will be cropped and/or padded along one or both spatial
+ dimensions.
+
+ For example, if the output patch lies entirely within the input image, this will result
+ in a regular crop. If the input image lies entirely within the output patch, this will
+ result in the image being padded in every direction. All other cases are mixed cases
+ where the image might be cropped in some directions and padded in others.
+
+ The output patch can be arbitrary in both size and position as long as it overlaps
+ with the input image.
+ '''
+
+ def __init__(self,
+ patch_ymin,
+ patch_xmin,
+ patch_height,
+ patch_width,
+ clip_boxes=True,
+ box_filter=None,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ patch_ymin (int, optional): The vertical coordinate of the top left corner of the output
+ patch relative to the image coordinate system. Can be negative (i.e. lie outside the image)
+ as long as the resulting patch still overlaps with the image.
+            patch_xmin (int, optional): The horizontal coordinate of the top left corner of the output
+ patch relative to the image coordinate system. Can be negative (i.e. lie outside the image)
+ as long as the resulting patch still overlaps with the image.
+ patch_height (int): The height of the patch to be sampled from the image. Can be greater
+ than the height of the input image.
+ patch_width (int): The width of the patch to be sampled from the image. Can be greater
+ than the width of the input image.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+ sampled patch.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential
+ background pixels of the scaled images. In the case of single-channel images,
+ the first element of `background` will be used as the background pixel value.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ #if (patch_height <= 0) or (patch_width <= 0):
+ # raise ValueError("Patch height and width must both be positive.")
+ #if (patch_ymin + patch_height < 0) or (patch_xmin + patch_width < 0):
+ # raise ValueError("A patch with the given coordinates cannot overlap with an input image.")
+ if not (isinstance(box_filter, BoxFilter) or box_filter is None):
+ raise ValueError("`box_filter` must be either `None` or a `BoxFilter` object.")
+ self.patch_height = patch_height
+ self.patch_width = patch_width
+ self.patch_ymin = patch_ymin
+ self.patch_xmin = patch_xmin
+ self.clip_boxes = clip_boxes
+ self.box_filter = box_filter
+ self.background = background
+ self.labels_format = labels_format
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ if (self.patch_ymin > img_height) or (self.patch_xmin > img_width):
+ raise ValueError("The given patch doesn't overlap with the input image.")
+
+        if labels is not None:
+            labels = np.copy(labels)
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ # Top left corner of the patch relative to the image coordinate system:
+ patch_ymin = self.patch_ymin
+ patch_xmin = self.patch_xmin
+
+ # Create a canvas of the size of the patch we want to end up with.
+ if image.ndim == 3:
+ canvas = np.zeros(shape=(self.patch_height, self.patch_width, 3), dtype=np.uint8)
+ canvas[:, :] = self.background
+ elif image.ndim == 2:
+ canvas = np.zeros(shape=(self.patch_height, self.patch_width), dtype=np.uint8)
+ canvas[:, :] = self.background[0]
+
+ # Perform the crop.
+ if patch_ymin < 0 and patch_xmin < 0: # Pad the image at the top and on the left.
+ image_crop_height = min(img_height, self.patch_height + patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction.
+ image_crop_width = min(img_width, self.patch_width + patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction.
+ canvas[-patch_ymin:-patch_ymin + image_crop_height, -patch_xmin:-patch_xmin + image_crop_width] = image[:image_crop_height, :image_crop_width]
+
+ elif patch_ymin < 0 and patch_xmin >= 0: # Pad the image at the top and crop it on the left.
+ image_crop_height = min(img_height, self.patch_height + patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction.
+ image_crop_width = min(self.patch_width, img_width - patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction.
+ canvas[-patch_ymin:-patch_ymin + image_crop_height, :image_crop_width] = image[:image_crop_height, patch_xmin:patch_xmin + image_crop_width]
+
+ elif patch_ymin >= 0 and patch_xmin < 0: # Crop the image at the top and pad it on the left.
+ image_crop_height = min(self.patch_height, img_height - patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction.
+ image_crop_width = min(img_width, self.patch_width + patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction.
+ canvas[:image_crop_height, -patch_xmin:-patch_xmin + image_crop_width] = image[patch_ymin:patch_ymin + image_crop_height, :image_crop_width]
+
+ elif patch_ymin >= 0 and patch_xmin >= 0: # Crop the image at the top and on the left.
+ image_crop_height = min(self.patch_height, img_height - patch_ymin) # The number of pixels of the image that will end up on the canvas in the vertical direction.
+ image_crop_width = min(self.patch_width, img_width - patch_xmin) # The number of pixels of the image that will end up on the canvas in the horizontal direction.
+ canvas[:image_crop_height, :image_crop_width] = image[patch_ymin:patch_ymin + image_crop_height, patch_xmin:patch_xmin + image_crop_width]
+
+ image = canvas
+
+ if return_inverter:
+ def inverter(labels):
+ labels = np.copy(labels)
+ labels[:, [ymin+1, ymax+1]] += patch_ymin
+ labels[:, [xmin+1, xmax+1]] += patch_xmin
+ return labels
+
+ if not (labels is None):
+
+ # Translate the box coordinates to the patch's coordinate system.
+ labels[:, [ymin, ymax]] -= patch_ymin
+ labels[:, [xmin, xmax]] -= patch_xmin
+
+ # Compute all valid boxes for this patch.
+ if not (self.box_filter is None):
+ self.box_filter.labels_format = self.labels_format
+ labels = self.box_filter(labels=labels,
+ image_height=self.patch_height,
+ image_width=self.patch_width)
+
+ if self.clip_boxes:
+ labels[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=self.patch_height-1)
+ labels[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=self.patch_width-1)
+
+ if return_inverter:
+ return image, labels, inverter
+ else:
+ return image, labels
+
+ else:
+ if return_inverter:
+ return image, inverter
+ else:
+ return image
+
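+# A minimal usage sketch for `CropPad` with made-up values: crop a fixed 200x200
+# patch from the top left corner and translate one box into patch coordinates.
+def _demo_crop_pad():
+    image = np.zeros((300, 400, 3), dtype=np.uint8)
+    labels = np.array([[1, 50, 60, 150, 160]])  # [class_id, xmin, ymin, xmax, ymax]
+    crop_pad = CropPad(patch_ymin=0, patch_xmin=0, patch_height=200, patch_width=200)
+    patch, patch_labels = crop_pad(image, labels)
+    return patch, patch_labels
+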
+class Crop:
+ '''
+ Crops off the specified numbers of pixels from the borders of images.
+
+ This is just a convenience interface for `CropPad`.
+ '''
+
+ def __init__(self,
+ crop_top,
+ crop_bottom,
+ crop_left,
+ crop_right,
+ clip_boxes=True,
+ box_filter=None,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ self.crop_top = crop_top
+ self.crop_bottom = crop_bottom
+ self.crop_left = crop_left
+ self.crop_right = crop_right
+ self.clip_boxes = clip_boxes
+ self.box_filter = box_filter
+ self.labels_format = labels_format
+ self.crop = CropPad(patch_ymin=self.crop_top,
+ patch_xmin=self.crop_left,
+ patch_height=None,
+ patch_width=None,
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ self.crop.patch_height = img_height - self.crop_top - self.crop_bottom
+ self.crop.patch_width = img_width - self.crop_left - self.crop_right
+ self.crop.labels_format = self.labels_format
+
+ return self.crop(image, labels, return_inverter)
+
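+# A minimal usage sketch for `Crop` with made-up values: remove 10 pixels from
+# the top and bottom of the image.
+def _demo_crop():
+    image = np.zeros((120, 200, 3), dtype=np.uint8)
+    labels = np.array([[1, 20, 30, 80, 90]])
+    crop = Crop(crop_top=10, crop_bottom=10, crop_left=0, crop_right=0)
+    return crop(image, labels)
+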
+class Pad:
+ '''
+ Pads images by the specified numbers of pixels on each side.
+
+ This is just a convenience interface for `CropPad`.
+ '''
+
+ def __init__(self,
+ pad_top,
+ pad_bottom,
+ pad_left,
+ pad_right,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ self.pad_top = pad_top
+ self.pad_bottom = pad_bottom
+ self.pad_left = pad_left
+ self.pad_right = pad_right
+ self.background = background
+ self.labels_format = labels_format
+ self.pad = CropPad(patch_ymin=-self.pad_top,
+ patch_xmin=-self.pad_left,
+ patch_height=None,
+ patch_width=None,
+ clip_boxes=False,
+ box_filter=None,
+ background=self.background,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ self.pad.patch_height = img_height + self.pad_top + self.pad_bottom
+ self.pad.patch_width = img_width + self.pad_left + self.pad_right
+ self.pad.labels_format = self.labels_format
+
+ return self.pad(image, labels, return_inverter)
+
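+# A minimal usage sketch for `Pad` with made-up values: add 20 pixels of grey
+# background on the left and right of the image.
+def _demo_pad():
+    image = np.zeros((100, 150, 3), dtype=np.uint8)
+    labels = np.array([[1, 10, 10, 60, 60]])
+    pad = Pad(pad_top=0, pad_bottom=0, pad_left=20, pad_right=20, background=(127, 127, 127))
+    return pad(image, labels)
+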
+class RandomPatch:
+ '''
+ Randomly samples a patch from an image. The randomness refers to whatever
+ randomness may be introduced by the patch coordinate generator, the box filter,
+ and the patch validator.
+
+ Input images may be cropped and/or padded along either or both of the two
+ spatial dimensions as necessary in order to obtain the required patch.
+
+ As opposed to `RandomPatchInf`, it is possible for this transform to fail to produce
+ an output image at all, in which case it will return `None`. This is useful, because
+ if this transform is used to generate patches of a fixed size or aspect ratio, then
+ the caller needs to be able to rely on the output image satisfying the set size or
+ aspect ratio. It might therefore not be an option to return the unaltered input image
+ as other random transforms do when they fail to produce a valid transformed image.
+ '''
+
+ def __init__(self,
+ patch_coord_generator,
+ box_filter=None,
+ image_validator=None,
+ n_trials_max=3,
+ clip_boxes=True,
+ prob=1.0,
+ background=(0,0,0),
+ can_fail=False,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ patch_coord_generator (PatchCoordinateGenerator): A `PatchCoordinateGenerator` object
+ to generate the positions and sizes of the patches to be sampled from the input images.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given.
+ An `ImageValidator` object to determine whether a sampled patch is valid. If `None`,
+ any outcome is valid.
+ n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given.
+                Determines the maximal number of trials to sample a valid patch. If no valid patch could
+ be sampled in `n_trials_max` trials, returns one `None` in place of each regular output.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+ sampled patch.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential
+ background pixels of the scaled images. In the case of single-channel images,
+ the first element of `background` will be used as the background pixel value.
+ can_fail (bool, optional): If `True`, will return `None` if no valid patch could be found after
+ `n_trials_max` trials. If `False`, will return the unaltered input image in such a case.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+ if not isinstance(patch_coord_generator, PatchCoordinateGenerator):
+ raise ValueError("`patch_coord_generator` must be an instance of `PatchCoordinateGenerator`.")
+ if not (isinstance(image_validator, ImageValidator) or image_validator is None):
+ raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.")
+ self.patch_coord_generator = patch_coord_generator
+ self.box_filter = box_filter
+ self.image_validator = image_validator
+ self.n_trials_max = n_trials_max
+ self.clip_boxes = clip_boxes
+ self.prob = prob
+ self.background = background
+ self.can_fail = can_fail
+ self.labels_format = labels_format
+ self.sample_patch = CropPad(patch_ymin=None,
+ patch_xmin=None,
+ patch_height=None,
+ patch_width=None,
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ background=self.background,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+
+ img_height, img_width = image.shape[:2]
+ self.patch_coord_generator.img_height = img_height
+ self.patch_coord_generator.img_width = img_width
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ # Override the preset labels format.
+ if not self.image_validator is None:
+ self.image_validator.labels_format = self.labels_format
+ self.sample_patch.labels_format = self.labels_format
+
+ for _ in range(max(1, self.n_trials_max)):
+
+ # Generate patch coordinates.
+ patch_ymin, patch_xmin, patch_height, patch_width = self.patch_coord_generator()
+
+ self.sample_patch.patch_ymin = patch_ymin
+ self.sample_patch.patch_xmin = patch_xmin
+ self.sample_patch.patch_height = patch_height
+ self.sample_patch.patch_width = patch_width
+
+ if (labels is None) or (self.image_validator is None):
+ # We either don't have any boxes or if we do, we will accept any outcome as valid.
+ return self.sample_patch(image, labels, return_inverter)
+ else:
+ # Translate the box coordinates to the patch's coordinate system.
+ new_labels = np.copy(labels)
+ new_labels[:, [ymin, ymax]] -= patch_ymin
+ new_labels[:, [xmin, xmax]] -= patch_xmin
+ # Check if the patch is valid.
+ if self.image_validator(labels=new_labels,
+ image_height=patch_height,
+ image_width=patch_width):
+ return self.sample_patch(image, labels, return_inverter)
+
+ # If we weren't able to sample a valid patch...
+ if self.can_fail:
+ # ...return `None`.
+ if labels is None:
+ if return_inverter:
+ return None, None
+ else:
+ return None
+ else:
+ if return_inverter:
+ return None, None, None
+ else:
+ return None, None
+ else:
+ # ...return the unaltered input image.
+ if labels is None:
+ if return_inverter:
+ return image, None
+ else:
+ return image
+ else:
+ if return_inverter:
+ return image, labels, None
+ else:
+ return image, labels
+
+ else:
+ if return_inverter:
+ def inverter(labels):
+ return labels
+
+ if labels is None:
+ if return_inverter:
+ return image, inverter
+ else:
+ return image
+ else:
+ if return_inverter:
+ return image, labels, inverter
+ else:
+ return image, labels
+
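+# A minimal usage sketch for `RandomPatch` with made-up values: sample a patch
+# of 50%-100% of the image size; without an image validator, any sampled patch
+# is accepted as valid.
+def _demo_random_patch():
+    image = np.zeros((300, 400, 3), dtype=np.uint8)
+    labels = np.array([[1, 100, 100, 250, 220]])
+    patch_coord_generator = PatchCoordinateGenerator(must_match='h_w',
+                                                     min_scale=0.5,
+                                                     max_scale=1.0)
+    random_patch = RandomPatch(patch_coord_generator=patch_coord_generator, prob=1.0)
+    return random_patch(image, labels)
+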
+class RandomPatchInf:
+ '''
+ Randomly samples a patch from an image. The randomness refers to whatever
+ randomness may be introduced by the patch coordinate generator, the box filter,
+ and the patch validator.
+
+ Input images may be cropped and/or padded along either or both of the two
+ spatial dimensions as necessary in order to obtain the required patch.
+
+ This operation is very similar to `RandomPatch`, except that:
+ 1. This operation runs indefinitely until either a valid patch is found or
+ the input image is returned unaltered, i.e. it cannot fail.
+ 2. If a bound generator is given, a new pair of bounds will be generated
+ every `n_trials_max` iterations.
+ '''
+
+ def __init__(self,
+ patch_coord_generator,
+ box_filter=None,
+ image_validator=None,
+ bound_generator=None,
+ n_trials_max=50,
+ clip_boxes=True,
+ prob=0.857,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ patch_coord_generator (PatchCoordinateGenerator): A `PatchCoordinateGenerator` object
+ to generate the positions and sizes of the patches to be sampled from the input images.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given.
+ An `ImageValidator` object to determine whether a sampled patch is valid. If `None`,
+ any outcome is valid.
+ bound_generator (BoundGenerator, optional): A `BoundGenerator` object to generate upper and
+ lower bound values for the patch validator. Every `n_trials_max` trials, a new pair of
+ upper and lower bounds will be generated until a valid patch is found or the original image
+ is returned. This bound generator overrides the bound generator of the patch validator.
+ n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given.
+ The sampler will run indefinitely until either a valid patch is found or the original image
+                is returned, but this determines the maximal number of trials to sample a valid patch for each
+ selected pair of lower and upper bounds before a new pair is picked.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+ sampled patch.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential
+ background pixels of the scaled images. In the case of single-channel images,
+ the first element of `background` will be used as the background pixel value.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ if not isinstance(patch_coord_generator, PatchCoordinateGenerator):
+ raise ValueError("`patch_coord_generator` must be an instance of `PatchCoordinateGenerator`.")
+ if not (isinstance(image_validator, ImageValidator) or image_validator is None):
+ raise ValueError("`image_validator` must be either `None` or an `ImageValidator` object.")
+ if not (isinstance(bound_generator, BoundGenerator) or bound_generator is None):
+ raise ValueError("`bound_generator` must be either `None` or a `BoundGenerator` object.")
+ self.patch_coord_generator = patch_coord_generator
+ self.box_filter = box_filter
+ self.image_validator = image_validator
+ self.bound_generator = bound_generator
+ self.n_trials_max = n_trials_max
+ self.clip_boxes = clip_boxes
+ self.prob = prob
+ self.background = background
+ self.labels_format = labels_format
+ self.sample_patch = CropPad(patch_ymin=None,
+ patch_xmin=None,
+ patch_height=None,
+ patch_width=None,
+ clip_boxes=self.clip_boxes,
+ box_filter=self.box_filter,
+ background=self.background,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+ self.patch_coord_generator.img_height = img_height
+ self.patch_coord_generator.img_width = img_width
+
+ xmin = self.labels_format['xmin']
+ ymin = self.labels_format['ymin']
+ xmax = self.labels_format['xmax']
+ ymax = self.labels_format['ymax']
+
+ # Override the preset labels format.
+ if not self.image_validator is None:
+ self.image_validator.labels_format = self.labels_format
+ self.sample_patch.labels_format = self.labels_format
+
+ while True: # Keep going until we either find a valid patch or return the original image.
+
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+
+ # In case we have a bound generator, pick a lower and upper bound for the patch validator.
+ if not ((self.image_validator is None) or (self.bound_generator is None)):
+ self.image_validator.bounds = self.bound_generator()
+
+ # Use at most `self.n_trials_max` attempts to find a crop
+ # that meets our requirements.
+ for _ in range(max(1, self.n_trials_max)):
+
+ # Generate patch coordinates.
+ patch_ymin, patch_xmin, patch_height, patch_width = self.patch_coord_generator()
+
+ self.sample_patch.patch_ymin = patch_ymin
+ self.sample_patch.patch_xmin = patch_xmin
+ self.sample_patch.patch_height = patch_height
+ self.sample_patch.patch_width = patch_width
+
+ # Check if the resulting patch meets the aspect ratio requirements.
+ aspect_ratio = patch_width / patch_height
+ if not (self.patch_coord_generator.min_aspect_ratio <= aspect_ratio <= self.patch_coord_generator.max_aspect_ratio):
+ continue
+
+ if (labels is None) or (self.image_validator is None):
+ # We either don't have any boxes or if we do, we will accept any outcome as valid.
+ return self.sample_patch(image, labels, return_inverter)
+ else:
+ # Translate the box coordinates to the patch's coordinate system.
+ new_labels = np.copy(labels)
+ new_labels[:, [ymin, ymax]] -= patch_ymin
+ new_labels[:, [xmin, xmax]] -= patch_xmin
+ # Check if the patch contains the minimum number of boxes we require.
+ if self.image_validator(labels=new_labels,
+ image_height=patch_height,
+ image_width=patch_width):
+ return self.sample_patch(image, labels, return_inverter)
+ else:
+ if return_inverter:
+ def inverter(labels):
+ return labels
+
+ if labels is None:
+ if return_inverter:
+ return image, inverter
+ else:
+ return image
+ else:
+ if return_inverter:
+ return image, labels, inverter
+ else:
+ return image, labels
+
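+# A minimal usage sketch for `RandomPatchInf` in the spirit of SSD-style random
+# cropping, with made-up values: patches of 30%-100% of the image size are
+# sampled until one satisfies randomly drawn IoU bounds.
+def _demo_random_patch_inf():
+    image = np.zeros((300, 400, 3), dtype=np.uint8)
+    labels = np.array([[1, 100, 100, 250, 220]])
+    patch_coord_generator = PatchCoordinateGenerator(must_match='h_w',
+                                                     min_scale=0.3,
+                                                     max_scale=1.0)
+    random_patch_inf = RandomPatchInf(patch_coord_generator=patch_coord_generator,
+                                      image_validator=ImageValidator(overlap_criterion='iou',
+                                                                     n_boxes_min=1),
+                                      bound_generator=BoundGenerator(),
+                                      n_trials_max=50)
+    return random_patch_inf(image, labels)
+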
+class RandomMaxCropFixedAR:
+ '''
+ Crops the largest possible patch of a given fixed aspect ratio
+ from an image.
+
+ Since the aspect ratio of the sampled patches is constant, they
+ can subsequently be resized to the same size without distortion.
+ '''
+
+ def __init__(self,
+ patch_aspect_ratio,
+ box_filter=None,
+ image_validator=None,
+ n_trials_max=3,
+ clip_boxes=True,
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ patch_aspect_ratio (float): The fixed aspect ratio that all sampled patches will have.
+ box_filter (BoxFilter, optional): Only relevant if ground truth bounding boxes are given.
+ A `BoxFilter` object to filter out bounding boxes that don't meet the given criteria
+ after the transformation. Refer to the `BoxFilter` documentation for details. If `None`,
+ the validity of the bounding boxes is not checked.
+ image_validator (ImageValidator, optional): Only relevant if ground truth bounding boxes are given.
+ An `ImageValidator` object to determine whether a sampled patch is valid. If `None`,
+ any outcome is valid.
+ n_trials_max (int, optional): Only relevant if ground truth bounding boxes are given.
+                Determines the maximal number of trials to sample a valid patch. If no valid patch could
+ be sampled in `n_trials_max` trials, returns `None`.
+ clip_boxes (bool, optional): Only relevant if ground truth bounding boxes are given.
+ If `True`, any ground truth bounding boxes will be clipped to lie entirely within the
+ sampled patch.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ self.patch_aspect_ratio = patch_aspect_ratio
+ self.box_filter = box_filter
+ self.image_validator = image_validator
+ self.n_trials_max = n_trials_max
+ self.clip_boxes = clip_boxes
+ self.labels_format = labels_format
+ self.random_patch = RandomPatch(patch_coord_generator=PatchCoordinateGenerator(), # Just a dummy object
+ box_filter=self.box_filter,
+ image_validator=self.image_validator,
+ n_trials_max=self.n_trials_max,
+ clip_boxes=self.clip_boxes,
+ prob=1.0,
+ can_fail=False,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ # The ratio of the input image aspect ratio and patch aspect ratio determines the maximal possible crop.
+ image_aspect_ratio = img_width / img_height
+
+ if image_aspect_ratio < self.patch_aspect_ratio:
+ patch_width = img_width
+ patch_height = int(round(patch_width / self.patch_aspect_ratio))
+ else:
+ patch_height = img_height
+ patch_width = int(round(patch_height * self.patch_aspect_ratio))
+
+ # Now that we know the desired height and width for the patch,
+ # instantiate an appropriate patch coordinate generator.
+ patch_coord_generator = PatchCoordinateGenerator(img_height=img_height,
+ img_width=img_width,
+ must_match='h_w',
+ patch_height=patch_height,
+ patch_width=patch_width)
+
+ # The rest of the work is done by `RandomPatch`.
+ self.random_patch.patch_coord_generator = patch_coord_generator
+ self.random_patch.labels_format = self.labels_format
+ return self.random_patch(image, labels, return_inverter)
+
+class RandomPadFixedAR:
+ '''
+ Adds the minimal possible padding to an image that results in a patch
+ of the given fixed aspect ratio that contains the entire image.
+
+ Since the aspect ratio of the resulting images is constant, they
+ can subsequently be resized to the same size without distortion.
+ '''
+
+ def __init__(self,
+ patch_aspect_ratio,
+ background=(0,0,0),
+ labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ patch_aspect_ratio (float): The fixed aspect ratio that all sampled patches will have.
+ background (list/tuple, optional): A 3-tuple specifying the RGB color value of the potential
+                background pixels of the padded images. In the case of single-channel images,
+ the first element of `background` will be used as the background pixel value.
+ labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
+ of an image contains which bounding box coordinate. The dictionary maps at least the keywords
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array.
+ '''
+
+ self.patch_aspect_ratio = patch_aspect_ratio
+ self.background = background
+ self.labels_format = labels_format
+ self.random_patch = RandomPatch(patch_coord_generator=PatchCoordinateGenerator(), # Just a dummy object
+ box_filter=None,
+ image_validator=None,
+ n_trials_max=1,
+ clip_boxes=False,
+ background=self.background,
+ prob=1.0,
+ labels_format=self.labels_format)
+
+ def __call__(self, image, labels=None, return_inverter=False):
+
+ img_height, img_width = image.shape[:2]
+
+ if img_width < img_height:
+ patch_height = img_height
+ patch_width = int(round(patch_height * self.patch_aspect_ratio))
+ else:
+ patch_width = img_width
+ patch_height = int(round(patch_width / self.patch_aspect_ratio))
+
+ # Now that we know the desired height and width for the patch,
+ # instantiate an appropriate patch coordinate generator.
+ patch_coord_generator = PatchCoordinateGenerator(img_height=img_height,
+ img_width=img_width,
+ must_match='h_w',
+ patch_height=patch_height,
+ patch_width=patch_width)
+
+ # The rest of the work is done by `RandomPatch`.
+ self.random_patch.patch_coord_generator = patch_coord_generator
+ self.random_patch.labels_format = self.labels_format
+ return self.random_patch(image, labels, return_inverter)
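+
+# A minimal usage sketch (not part of the original module), showing how the two
+# fixed-aspect-ratio ops above are typically used: `RandomMaxCropFixedAR` crops
+# the largest possible patch of the target aspect ratio, while `RandomPadFixedAR`
+# pads instead, so neither distorts the image when it is later resized to a fixed
+# `(height, width)`. The input image here is random noise purely for illustration.
+if __name__ == '__main__':
+    example_image = np.random.randint(0, 256, size=(300, 500, 3), dtype=np.uint8)
+    example_labels = np.array([[1, 100, 50, 200, 150]])  # (class_id, xmin, ymin, xmax, ymax)
+
+    crop_to_square = RandomMaxCropFixedAR(patch_aspect_ratio=1.0)
+    pad_to_square = RandomPadFixedAR(patch_aspect_ratio=1.0)
+
+    cropped_image, cropped_labels = crop_to_square(example_image, example_labels)
+    padded_image, padded_labels = pad_to_square(example_image, example_labels)
+
+    print('Cropped to:', cropped_image.shape)  # (300, 300, 3)
+    print('Padded to: ', padded_image.shape)   # (500, 500, 3)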
diff --git a/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_photometric_ops.py b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_photometric_ops.py
new file mode 100644
index 0000000..375b7aa
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/data_generator/object_detection_2d_photometric_ops.py
@@ -0,0 +1,485 @@
+'''
+Various photometric image transformations, both deterministic and probabilistic.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+import cv2
+
+class ConvertColor:
+ '''
+ Converts images between RGB, HSV and grayscale color spaces. This is just a wrapper
+ around `cv2.cvtColor()`.
+ '''
+ def __init__(self, current='RGB', to='HSV', keep_3ch=True):
+ '''
+ Arguments:
+ current (str, optional): The current color space of the images. Can be
+ one of 'RGB' and 'HSV'.
+ to (str, optional): The target color space of the images. Can be one of
+ 'RGB', 'HSV', and 'GRAY'.
+ keep_3ch (bool, optional): Only relevant if `to == GRAY`.
+ If `True`, the resulting grayscale images will have three channels.
+ '''
+ if not ((current in {'RGB', 'HSV'}) and (to in {'RGB', 'HSV', 'GRAY'})):
+ raise NotImplementedError
+ self.current = current
+ self.to = to
+ self.keep_3ch = keep_3ch
+
+ def __call__(self, image, labels=None):
+ if self.current == 'RGB' and self.to == 'HSV':
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
+ elif self.current == 'RGB' and self.to == 'GRAY':
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+ if self.keep_3ch:
+ image = np.stack([image] * 3, axis=-1)
+ elif self.current == 'HSV' and self.to == 'RGB':
+ image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB)
+ elif self.current == 'HSV' and self.to == 'GRAY':
+ image = cv2.cvtColor(image, cv2.COLOR_HSV2GRAY)
+ if self.keep_3ch:
+ image = np.stack([image] * 3, axis=-1)
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class ConvertDataType:
+ '''
+ Converts images represented as Numpy arrays between `uint8` and `float32`.
+ Serves as a helper for certain photometric distortions. This is just a wrapper
+ around `np.ndarray.astype()`.
+ '''
+ def __init__(self, to='uint8'):
+ '''
+ Arguments:
+ to (string, optional): To which datatype to convert the input images.
+ Can be either of 'uint8' and 'float32'.
+ '''
+ if not (to == 'uint8' or to == 'float32'):
+ raise ValueError("`to` can be either of 'uint8' or 'float32'.")
+ self.to = to
+
+ def __call__(self, image, labels=None):
+ if self.to == 'uint8':
+ image = np.round(image, decimals=0).astype(np.uint8)
+ else:
+ image = image.astype(np.float32)
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class ConvertTo3Channels:
+ '''
+ Converts 1-channel and 4-channel images to 3-channel images. Does nothing to images that
+ already have 3 channels. In the case of 4-channel images, the fourth channel will be
+ discarded.
+ '''
+ def __init__(self):
+ pass
+
+ def __call__(self, image, labels=None):
+ if image.ndim == 2:
+ image = np.stack([image] * 3, axis=-1)
+ elif image.ndim == 3:
+ if image.shape[2] == 1:
+ image = np.concatenate([image] * 3, axis=-1)
+ elif image.shape[2] == 4:
+ image = image[:,:,:3]
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class Hue:
+ '''
+ Changes the hue of HSV images.
+
+ Important:
+ - Expects HSV input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, delta):
+ '''
+ Arguments:
+ delta (int): An integer in the closed interval `[-180, 180]` that determines the hue change, where
+ a change by integer `delta` means a change by `2 * delta` degrees. Read up on the HSV color format
+ if you need more information.
+ '''
+ if not (-180 <= delta <= 180): raise ValueError("`delta` must be in the closed interval `[-180, 180]`.")
+ self.delta = delta
+
+ def __call__(self, image, labels=None):
+ image[:, :, 0] = (image[:, :, 0] + self.delta) % 180.0
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class RandomHue:
+ '''
+ Randomly changes the hue of HSV images.
+
+ Important:
+ - Expects HSV input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, max_delta=18, prob=0.5):
+ '''
+ Arguments:
+ max_delta (int): An integer in the closed interval `[0, 180]` that determines the maximal absolute
+ hue change.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ '''
+ if not (0 <= max_delta <= 180): raise ValueError("`max_delta` must be in the closed interval `[0, 180]`.")
+ self.max_delta = max_delta
+ self.prob = prob
+ self.change_hue = Hue(delta=0)
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ self.change_hue.delta = np.random.uniform(-self.max_delta, self.max_delta)
+ return self.change_hue(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
+
+class Saturation:
+ '''
+ Changes the saturation of HSV images.
+
+ Important:
+ - Expects HSV input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, factor):
+ '''
+ Arguments:
+ factor (float): A float greater than zero that determines saturation change, where
+ values less than one result in less saturation and values greater than one result
+ in more saturation.
+ '''
+ if factor <= 0.0: raise ValueError("It must be `factor > 0`.")
+ self.factor = factor
+
+ def __call__(self, image, labels=None):
+ image[:,:,1] = np.clip(image[:,:,1] * self.factor, 0, 255)
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class RandomSaturation:
+ '''
+ Randomly changes the saturation of HSV images.
+
+ Important:
+ - Expects HSV input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, lower=0.3, upper=2.0, prob=0.5):
+ '''
+ Arguments:
+ lower (float, optional): A float greater than zero, the lower bound for the random
+ saturation change.
+ upper (float, optional): A float greater than zero, the upper bound for the random
+ saturation change. Must be greater than `lower`.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ '''
+ if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+ self.lower = lower
+ self.upper = upper
+ self.prob = prob
+ self.change_saturation = Saturation(factor=1.0)
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ self.change_saturation.factor = np.random.uniform(self.lower, self.upper)
+ return self.change_saturation(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
+
+class Brightness:
+ '''
+ Changes the brightness of RGB images.
+
+ Important:
+ - Expects RGB input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, delta):
+ '''
+ Arguments:
+ delta (int): An integer, the amount to add to or subtract from the intensity
+ of every pixel.
+ '''
+ self.delta = delta
+
+ def __call__(self, image, labels=None):
+ image = np.clip(image + self.delta, 0, 255)
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class RandomBrightness:
+ '''
+ Randomly changes the brightness of RGB images.
+
+ Important:
+ - Expects RGB input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, lower=-84, upper=84, prob=0.5):
+ '''
+ Arguments:
+ lower (int, optional): An integer, the lower bound for the random brightness change.
+ upper (int, optional): An integer, the upper bound for the random brightness change.
+ Must be greater than `lower`.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ '''
+ if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+ self.lower = float(lower)
+ self.upper = float(upper)
+ self.prob = prob
+ self.change_brightness = Brightness(delta=0)
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ self.change_brightness.delta = np.random.uniform(self.lower, self.upper)
+ return self.change_brightness(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
+
+class Contrast:
+ '''
+ Changes the contrast of RGB images.
+
+ Important:
+ - Expects RGB input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, factor):
+ '''
+ Arguments:
+ factor (float): A float greater than zero that determines contrast change, where
+ values less than one result in less contrast and values greater than one result
+ in more contrast.
+ '''
+ if factor <= 0.0: raise ValueError("It must be `factor > 0`.")
+ self.factor = factor
+
+ def __call__(self, image, labels=None):
+ image = np.clip(127.5 + self.factor * (image - 127.5), 0, 255)
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class RandomContrast:
+ '''
+ Randomly changes the contrast of RGB images.
+
+ Important:
+ - Expects RGB input.
+ - Expects input array to be of `dtype` `float`.
+ '''
+ def __init__(self, lower=0.5, upper=1.5, prob=0.5):
+ '''
+ Arguments:
+ lower (float, optional): A float greater than zero, the lower bound for the random
+ contrast change.
+ upper (float, optional): A float greater than zero, the upper bound for the random
+ contrast change. Must be greater than `lower`.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ '''
+ if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+ self.lower = lower
+ self.upper = upper
+ self.prob = prob
+ self.change_contrast = Contrast(factor=1.0)
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ self.change_contrast.factor = np.random.uniform(self.lower, self.upper)
+ return self.change_contrast(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
+
+class Gamma:
+ '''
+ Changes the gamma value of RGB images.
+
+ Important: Expects RGB input.
+ '''
+ def __init__(self, gamma):
+ '''
+ Arguments:
+ gamma (float): A float greater than zero that determines gamma change.
+ '''
+ if gamma <= 0.0: raise ValueError("It must be `gamma > 0`.")
+ self.gamma = gamma
+ self.gamma_inv = 1.0 / gamma
+ # Build a lookup table mapping the pixel values [0, 255] to
+ # their adjusted gamma values.
+ self.table = np.array([((i / 255.0) ** self.gamma_inv) * 255 for i in np.arange(0, 256)]).astype("uint8")
+
+ def __call__(self, image, labels=None):
+        image = cv2.LUT(image, self.table)
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
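+# Worked example (illustration only, not part of the original code): with
+# `gamma=2.0` the lookup table maps each intensity `i` to
+# `255 * (i / 255) ** (1 / 2.0)`, so a pixel value of 64 becomes
+# `255 * (64 / 255) ** 0.5`, i.e. about 127.7, stored as 127 in the uint8 table.
+# A `gamma` greater than one therefore brightens the image, a `gamma` smaller
+# than one darkens it.
+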
+class RandomGamma:
+ '''
+ Randomly changes the gamma value of RGB images.
+
+ Important: Expects RGB input.
+ '''
+ def __init__(self, lower=0.25, upper=2.0, prob=0.5):
+ '''
+ Arguments:
+ lower (float, optional): A float greater than zero, the lower bound for the random
+ gamma change.
+ upper (float, optional): A float greater than zero, the upper bound for the random
+ gamma change. Must be greater than `lower`.
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ '''
+ if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
+ self.lower = lower
+ self.upper = upper
+ self.prob = prob
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ gamma = np.random.uniform(self.lower, self.upper)
+ change_gamma = Gamma(gamma=gamma)
+ return change_gamma(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
+
+class HistogramEqualization:
+ '''
+ Performs histogram equalization on HSV images.
+
+    Important: Expects HSV input.
+ '''
+ def __init__(self):
+ pass
+
+ def __call__(self, image, labels=None):
+ image[:,:,2] = cv2.equalizeHist(image[:,:,2])
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class RandomHistogramEqualization:
+ '''
+ Randomly performs histogram equalization on HSV images. The randomness only refers
+ to whether or not the equalization is performed.
+
+    Important: Expects HSV input.
+ '''
+ def __init__(self, prob=0.5):
+ '''
+ Arguments:
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ '''
+ self.prob = prob
+ self.equalize = HistogramEqualization()
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+ return self.equalize(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
+
+class ChannelSwap:
+ '''
+ Swaps the channels of images.
+ '''
+ def __init__(self, order):
+ '''
+ Arguments:
+ order (tuple): A tuple of integers that defines the desired channel order
+ of the input images after the channel swap.
+ '''
+ self.order = order
+
+ def __call__(self, image, labels=None):
+ image = image[:,:,self.order]
+ if labels is None:
+ return image
+ else:
+ return image, labels
+
+class RandomChannelSwap:
+ '''
+ Randomly swaps the channels of RGB images.
+
+ Important: Expects RGB input.
+ '''
+ def __init__(self, prob=0.5):
+ '''
+ Arguments:
+ prob (float, optional): `(1 - prob)` determines the probability with which the original,
+ unaltered image is returned.
+ '''
+ self.prob = prob
+ # All possible permutations of the three image channels except the original order.
+ self.permutations = ((0, 2, 1),
+ (1, 0, 2), (1, 2, 0),
+ (2, 0, 1), (2, 1, 0))
+ self.swap_channels = ChannelSwap(order=(0, 1, 2))
+
+ def __call__(self, image, labels=None):
+ p = np.random.uniform(0,1)
+ if p >= (1.0-self.prob):
+            i = np.random.randint(5) # Pick one of the 5 permutations other than the original channel order.
+ self.swap_channels.order = self.permutations[i]
+ return self.swap_channels(image, labels)
+ elif labels is None:
+ return image
+ else:
+ return image, labels
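+
+# Minimal end-to-end demo (not part of the original module): one possible way to
+# chain the ops above. Brightness/contrast operate on float RGB, while
+# hue/saturation operate on float HSV using the 8-bit OpenCV hue range [0, 180),
+# so the colour-space conversions are performed on uint8 images. The input image
+# is random noise purely for illustration.
+if __name__ == '__main__':
+    demo_image = np.random.randint(0, 256, size=(120, 160, 3), dtype=np.uint8)
+
+    photometric_chain = [ConvertDataType(to='float32'),
+                         RandomBrightness(lower=-32, upper=32, prob=1.0),
+                         RandomContrast(lower=0.8, upper=1.2, prob=1.0),
+                         ConvertDataType(to='uint8'),
+                         ConvertColor(current='RGB', to='HSV'),
+                         ConvertDataType(to='float32'),
+                         RandomSaturation(lower=0.7, upper=1.3, prob=1.0),
+                         RandomHue(max_delta=18, prob=1.0),
+                         ConvertDataType(to='uint8'),
+                         ConvertColor(current='HSV', to='RGB'),
+                         RandomChannelSwap(prob=0.5)]
+
+    for transform in photometric_chain:
+        demo_image = transform(demo_image)
+
+    print('Augmented image:', demo_image.shape, demo_image.dtype)  # (120, 160, 3) uint8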
diff --git a/engine/object_detection_branch/single_shot_detector/eval_utils/__init__.py b/engine/object_detection_branch/single_shot_detector/eval_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/eval_utils/average_precision_evaluator.py b/engine/object_detection_branch/single_shot_detector/eval_utils/average_precision_evaluator.py
new file mode 100644
index 0000000..e1c52f9
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/eval_utils/average_precision_evaluator.py
@@ -0,0 +1,906 @@
+'''
+An evaluator to compute the Pascal VOC-style mean average precision (both the pre-2010
+and post-2010 algorithm versions) of a given Keras SSD model on a given dataset.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+from math import ceil
+from tqdm import trange
+import sys
+import warnings
+
+from data_generator.object_detection_2d_data_generator import DataGenerator
+from data_generator.object_detection_2d_geometric_ops import Resize
+from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR
+from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
+from ssd_encoder_decoder.ssd_output_decoder import decode_detections
+from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
+
+from bounding_box_utils.bounding_box_utils import iou
+
+class Evaluator:
+ '''
+ Computes the mean average precision of the given Keras SSD model on the given dataset.
+
+ Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling)
+ and post-2010 (integration) algorithm versions.
+
+ Optionally also returns the average precisions, precisions, and recalls.
+
+ The algorithm is identical to the official Pascal VOC pre-2010 detection evaluation algorithm
+    in its default settings, but can be customized in a number of ways.
+ '''
+
+ def __init__(self,
+ model,
+ n_classes,
+ data_generator,
+ model_mode='inference',
+ pred_format={'class_id': 0, 'conf': 1, 'xmin': 2, 'ymin': 3, 'xmax': 4, 'ymax': 5},
+ gt_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}):
+ '''
+ Arguments:
+ model (Keras model): A Keras SSD model object.
+ n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
+ data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
+ model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
+ This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to
+ the model documentation for the meaning of the individual modes.
+ pred_format (dict, optional): A dictionary that defines which index in the last axis of the model's decoded predictions
+ contains which bounding box coordinate. The dictionary must map the keywords 'class_id', 'conf' (for the confidence),
+ 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis.
+            gt_format (dict, optional): A dictionary that defines which index of a ground truth bounding box contains which of the five
+ items class ID, xmin, ymin, xmax, ymax. The expected strings are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'.
+ '''
+
+ if not isinstance(data_generator, DataGenerator):
+ warnings.warn("`data_generator` is not a `DataGenerator` object, which will cause undefined behavior.")
+
+ self.model = model
+ self.data_generator = data_generator
+ self.n_classes = n_classes
+ self.model_mode = model_mode
+ self.pred_format = pred_format
+ self.gt_format = gt_format
+
+ # The following lists all contain per-class data, i.e. all list have the length `n_classes + 1`,
+ # where one element is for the background class, i.e. that element is just a dummy entry.
+ self.prediction_results = None
+ self.num_gt_per_class = None
+ self.true_positives = None
+ self.false_positives = None
+ self.cumulative_true_positives = None
+ self.cumulative_false_positives = None
+        self.cumulative_precisions = None # "Cumulative" means that the i-th element in each list represents the precision for the first i highest confidence predictions for that class.
+        self.cumulative_recalls = None # "Cumulative" means that the i-th element in each list represents the recall for the first i highest confidence predictions for that class.
+ self.average_precisions = None
+ self.mean_average_precision = None
+
+ def __call__(self,
+ img_height,
+ img_width,
+ batch_size,
+ data_generator_mode='resize',
+ round_confidences=False,
+ matching_iou_threshold=0.5,
+ border_pixels='include',
+ sorting_algorithm='quicksort',
+ average_precision_mode='sample',
+ num_recall_points=11,
+ ignore_neutral_boxes=True,
+ return_precisions=False,
+ return_recalls=False,
+ return_average_precisions=False,
+ verbose=True,
+ decoding_confidence_thresh=0.01,
+ decoding_iou_threshold=0.45,
+ decoding_top_k=200,
+ decoding_pred_coords='centroids',
+ decoding_normalize_coords=True):
+ '''
+ Computes the mean average precision of the given Keras SSD model on the given dataset.
+
+        Optionally also returns the average precisions, precisions, and recalls.
+
+ All the individual steps of the overall evaluation algorithm can also be called separately
+ (check out the other methods of this class), but this runs the overall algorithm all at once.
+
+ Arguments:
+ img_height (int): The input image height for the model.
+ img_width (int): The input image width for the model.
+ batch_size (int): The batch size for the evaluation.
+ data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
+ be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
+ If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
+ and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
+ round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
+ confidences will be rounded to. If `False`, the confidences will not be rounded.
+ matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap
+ of at least `matching_iou_threshold` with any ground truth bounding box of the same class.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+ sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts
+ any valid sorting algorithm for Numpy's `argsort()` function. You will usually want to choose between 'quicksort'
+                (fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
+ The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
+ to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
+ even if you choose 'quicksort' (but no guarantees).
+ average_precision_mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision
+ will be computed according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled
+ for `num_recall_points` recall values. In the case of 'integrate', the average precision will be computed according to the
+ Pascal VOC formula that was used from VOC 2010 onward, where the average precision will be computed by numerically integrating
+                over the whole precision-recall curve instead of sampling individual points from it. 'integrate' mode is basically just
+ the limit case of 'sample' mode as the number of sample points increases.
+ num_recall_points (int, optional): The number of points to sample from the precision-recall-curve to compute the average
+ precisions. In other words, this is the number of equidistant recall values for which the resulting precision will be
+ computed. 11 points is the value used in the official Pascal VOC 2007 detection evaluation algorithm.
+ ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
+ bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
+ annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`,
+ neutral boxes will be ignored for the evaluation. An example for evaluation-neutrality are the ground truth boxes
+ annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation.
+ return_precisions (bool, optional): If `True`, returns a nested list containing the cumulative precisions for each class.
+ return_recalls (bool, optional): If `True`, returns a nested list containing the cumulative recalls for each class.
+ return_average_precisions (bool, optional): If `True`, returns a list containing the average precision for each class.
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
+ A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
+ for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
+ selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
+ part of the selection process happening in the confidence thresholding stage.
+ decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
+ All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+ from the set of predictions for a given class, where 'maximal' refers to the box score.
+ decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
+ predictions to be kept for each batch item after the non-maximum suppression stage.
+            decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
+ that the model outputs. Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
+ 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model
+ outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates,
+ as that would result in incorrect coordinates.
+
+ Returns:
+ A float, the mean average precision, plus any optional returns specified in the arguments.
+ '''
+
+ #############################################################################################
+ # Predict on the entire dataset.
+ #############################################################################################
+
+ self.predict_on_dataset(img_height=img_height,
+ img_width=img_width,
+ batch_size=batch_size,
+ data_generator_mode=data_generator_mode,
+ decoding_confidence_thresh=decoding_confidence_thresh,
+ decoding_iou_threshold=decoding_iou_threshold,
+ decoding_top_k=decoding_top_k,
+ decoding_pred_coords=decoding_pred_coords,
+ decoding_normalize_coords=decoding_normalize_coords,
+ decoding_border_pixels=border_pixels,
+ round_confidences=round_confidences,
+ verbose=verbose,
+ ret=False)
+
+ #############################################################################################
+ # Get the total number of ground truth boxes for each class.
+ #############################################################################################
+
+ self.get_num_gt_per_class(ignore_neutral_boxes=ignore_neutral_boxes,
+ verbose=False,
+ ret=False)
+
+ #############################################################################################
+ # Match predictions to ground truth boxes for all classes.
+ #############################################################################################
+
+ self.match_predictions(ignore_neutral_boxes=ignore_neutral_boxes,
+ matching_iou_threshold=matching_iou_threshold,
+ border_pixels=border_pixels,
+ sorting_algorithm=sorting_algorithm,
+ verbose=verbose,
+ ret=False)
+
+ #############################################################################################
+ # Compute the cumulative precision and recall for all classes.
+ #############################################################################################
+
+ self.compute_precision_recall(verbose=verbose, ret=False)
+
+ #############################################################################################
+ # Compute the average precision for this class.
+ #############################################################################################
+
+ self.compute_average_precisions(mode=average_precision_mode,
+ num_recall_points=num_recall_points,
+ verbose=verbose,
+ ret=False)
+
+ #############################################################################################
+ # Compute the mean average precision.
+ #############################################################################################
+
+ mean_average_precision = self.compute_mean_average_precision(ret=True)
+
+ #############################################################################################
+
+ # Compile the returns.
+ if return_precisions or return_recalls or return_average_precisions:
+ ret = [mean_average_precision]
+ if return_average_precisions:
+ ret.append(self.average_precisions)
+ if return_precisions:
+ ret.append(self.cumulative_precisions)
+ if return_recalls:
+ ret.append(self.cumulative_recalls)
+ return ret
+ else:
+ return mean_average_precision
+
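+    # Usage sketch (comment only; the model and dataset variable names below are
+    # hypothetical): once a Keras SSD model and a `DataGenerator` holding the
+    # evaluation dataset exist, the whole pipeline above runs in a single call:
+    #
+    #     evaluator = Evaluator(model=ssd_model,
+    #                           n_classes=20,
+    #                           data_generator=val_dataset,
+    #                           model_mode='inference')
+    #     mean_ap, average_precisions = evaluator(img_height=300,
+    #                                             img_width=300,
+    #                                             batch_size=8,
+    #                                             return_average_precisions=True)
+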
+ def predict_on_dataset(self,
+ img_height,
+ img_width,
+ batch_size,
+ data_generator_mode='resize',
+ decoding_confidence_thresh=0.01,
+ decoding_iou_threshold=0.45,
+ decoding_top_k=200,
+ decoding_pred_coords='centroids',
+ decoding_normalize_coords=True,
+ decoding_border_pixels='include',
+ round_confidences=False,
+ verbose=True,
+ ret=False):
+ '''
+ Runs predictions for the given model over the entire dataset given by `data_generator`.
+
+ Arguments:
+ img_height (int): The input image height for the model.
+ img_width (int): The input image width for the model.
+ batch_size (int): The batch size for the evaluation.
+ data_generator_mode (str, optional): Either of 'resize' and 'pad'. If 'resize', the input images will
+ be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
+ If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
+ and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
+ decoding_confidence_thresh (float, optional): Only relevant if the model is in 'training' mode.
+ A float in [0,1), the minimum classification confidence in a specific positive class in order to be considered
+ for the non-maximum suppression stage for the respective class. A lower value will result in a larger part of the
+ selection process being done by the non-maximum suppression stage, while a larger value will result in a larger
+ part of the selection process happening in the confidence thresholding stage.
+ decoding_iou_threshold (float, optional): Only relevant if the model is in 'training' mode. A float in [0,1].
+ All boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+ from the set of predictions for a given class, where 'maximal' refers to the box score.
+ decoding_top_k (int, optional): Only relevant if the model is in 'training' mode. The number of highest scoring
+ predictions to be kept for each batch item after the non-maximum suppression stage.
+            decoding_pred_coords (str, optional): Only relevant if the model is in 'training' mode. The box coordinate format
+ that the model outputs. Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
+ 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ decoding_normalize_coords (bool, optional): Only relevant if the model is in 'training' mode. Set to `True` if the model
+ outputs relative coordinates. Do not set this to `True` if the model already outputs absolute coordinates,
+ as that would result in incorrect coordinates.
+ round_confidences (int, optional): `False` or an integer that is the number of decimals that the prediction
+ confidences will be rounded to. If `False`, the confidences will not be rounded.
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the predictions.
+
+ Returns:
+ None by default. Optionally, a nested list containing the predictions for each class.
+ '''
+
+ class_id_pred = self.pred_format['class_id']
+ conf_pred = self.pred_format['conf']
+ xmin_pred = self.pred_format['xmin']
+ ymin_pred = self.pred_format['ymin']
+ xmax_pred = self.pred_format['xmax']
+ ymax_pred = self.pred_format['ymax']
+
+ #############################################################################################
+ # Configure the data generator for the evaluation.
+ #############################################################################################
+
+ convert_to_3_channels = ConvertTo3Channels()
+ resize = Resize(height=img_height,width=img_width, labels_format=self.gt_format)
+ if data_generator_mode == 'resize':
+ transformations = [convert_to_3_channels,
+ resize]
+ elif data_generator_mode == 'pad':
+ random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, labels_format=self.gt_format)
+ transformations = [convert_to_3_channels,
+ random_pad,
+ resize]
+ else:
+ raise ValueError("`data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode))
+
+ # Set the generator parameters.
+ generator = self.data_generator.generate(batch_size=batch_size,
+ shuffle=False,
+ transformations=transformations,
+ label_encoder=None,
+ returns={'processed_images',
+ 'image_ids',
+ 'evaluation-neutral',
+ 'inverse_transform',
+ 'original_labels'},
+ keep_images_without_gt=True,
+ degenerate_box_handling='remove')
+
+ # If we don't have any real image IDs, generate pseudo-image IDs.
+ # This is just to make the evaluator compatible both with datasets that do and don't
+ # have image IDs.
+ if self.data_generator.image_ids is None:
+ self.data_generator.image_ids = list(range(self.data_generator.get_dataset_size()))
+
+ #############################################################################################
+ # Predict over all batches of the dataset and store the predictions.
+ #############################################################################################
+
+ # We have to generate a separate results list for each class.
+ results = [list() for _ in range(self.n_classes + 1)]
+
+ # Create a dictionary that maps image IDs to ground truth annotations.
+ # We'll need it below.
+ image_ids_to_labels = {}
+
+ # Compute the number of batches to iterate over the entire dataset.
+ n_images = self.data_generator.get_dataset_size()
+ n_batches = int(ceil(n_images / batch_size))
+ if verbose:
+ print("Number of images in the evaluation dataset: {}".format(n_images))
+ print()
+ tr = trange(n_batches, file=sys.stdout)
+ tr.set_description('Producing predictions batch-wise')
+ else:
+ tr = range(n_batches)
+
+ # Loop over all batches.
+ for j in tr:
+ # Generate batch.
+ batch_X, batch_image_ids, batch_eval_neutral, batch_inverse_transforms, batch_orig_labels = next(generator)
+ # Predict.
+ y_pred = self.model.predict(batch_X)
+ # If the model was created in 'training' mode, the raw predictions need to
+ # be decoded and filtered, otherwise that's already taken care of.
+ if self.model_mode == 'training':
+ # Decode.
+ y_pred = decode_detections(y_pred,
+ confidence_thresh=decoding_confidence_thresh,
+ iou_threshold=decoding_iou_threshold,
+ top_k=decoding_top_k,
+ input_coords=decoding_pred_coords,
+ normalize_coords=decoding_normalize_coords,
+ img_height=img_height,
+ img_width=img_width,
+ border_pixels=decoding_border_pixels)
+ else:
+ # Filter out the all-zeros dummy elements of `y_pred`.
+ y_pred_filtered = []
+ for i in range(len(y_pred)):
+ y_pred_filtered.append(y_pred[i][y_pred[i,:,0] != 0])
+ y_pred = y_pred_filtered
+ # Convert the predicted box coordinates for the original images.
+ y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)
+
+ # Iterate over all batch items.
+ for k, batch_item in enumerate(y_pred):
+
+ image_id = batch_image_ids[k]
+
+ for box in batch_item:
+ class_id = int(box[class_id_pred])
+ # Round the box coordinates to reduce the required memory.
+ if round_confidences:
+ confidence = round(box[conf_pred], round_confidences)
+ else:
+ confidence = box[conf_pred]
+ xmin = round(box[xmin_pred], 1)
+ ymin = round(box[ymin_pred], 1)
+ xmax = round(box[xmax_pred], 1)
+ ymax = round(box[ymax_pred], 1)
+ prediction = (image_id, confidence, xmin, ymin, xmax, ymax)
+ # Append the predicted box to the results list for its class.
+ results[class_id].append(prediction)
+
+ self.prediction_results = results
+
+ if ret:
+ return results
+
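+    # Note on the result structure (comment only): `self.prediction_results` is a
+    # list with one entry per class, where index 0 is the background dummy. Each
+    # entry is a list of `(image_id, confidence, xmin, ymin, xmax, ymax)` tuples,
+    # e.g. an entry such as `('000012', 0.87, 48.3, 240.1, 195.0, 371.4)`
+    # (hypothetical values) describes one detection for that class.
+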
+ def write_predictions_to_txt(self,
+ classes=None,
+ out_file_prefix='comp3_det_test_',
+ verbose=True):
+ '''
+ Writes the predictions for all classes to separate text files according to the Pascal VOC results format.
+
+ Arguments:
+ classes (list, optional): `None` or a list of strings containing the class names of all classes in the dataset,
+ including some arbitrary name for the background class. This list will be used to name the output text files.
+ The ordering of the names in the list represents the ordering of the classes as they are predicted by the model,
+ i.e. the element with index 3 in this list should correspond to the class with class ID 3 in the model's predictions.
+ If `None`, the output text files will be named by their class IDs.
+ out_file_prefix (str, optional): A prefix for the output text file names. The suffix to each output text file name will
+ be the respective class name followed by the `.txt` file extension. This string is also how you specify the directory
+ in which the results are to be saved.
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+
+ Returns:
+ None.
+ '''
+
+ if self.prediction_results is None:
+ raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")
+
+ # We generate a separate results file for each class.
+ for class_id in range(1, self.n_classes + 1):
+
+ if verbose:
+ print("Writing results file for class {}/{}.".format(class_id, self.n_classes))
+
+ if classes is None:
+ class_suffix = '{:04d}'.format(class_id)
+ else:
+ class_suffix = classes[class_id]
+
+ results_file = open('{}{}.txt'.format(out_file_prefix, class_suffix), 'w')
+
+ for prediction in self.prediction_results[class_id]:
+
+ prediction_list = list(prediction)
+ prediction_list[0] = '{:06d}'.format(int(prediction_list[0]))
+ prediction_list[1] = round(prediction_list[1], 4)
+ prediction_txt = ' '.join(map(str, prediction_list)) + '\n'
+ results_file.write(prediction_txt)
+
+ results_file.close()
+
+ if verbose:
+ print("All results files saved.")
+
+ def get_num_gt_per_class(self,
+ ignore_neutral_boxes=True,
+ verbose=True,
+ ret=False):
+ '''
+ Counts the number of ground truth boxes for each class across the dataset.
+
+ Arguments:
+ ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
+ bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
+ annotations. If `True`, only non-neutral ground truth boxes will be counted, otherwise all ground truth boxes will
+ be counted.
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the list of counts.
+
+ Returns:
+ None by default. Optionally, a list containing a count of the number of ground truth boxes for each class across the
+ entire dataset.
+ '''
+
+ if self.data_generator.labels is None:
+ raise ValueError("Computing the number of ground truth boxes per class not possible, no ground truth given.")
+
+ num_gt_per_class = np.zeros(shape=(self.n_classes+1), dtype=np.int)
+
+ class_id_index = self.gt_format['class_id']
+
+ ground_truth = self.data_generator.labels
+
+ if verbose:
+ print('Computing the number of positive ground truth boxes per class.')
+ tr = trange(len(ground_truth), file=sys.stdout)
+ else:
+ tr = range(len(ground_truth))
+
+ # Iterate over the ground truth for all images in the dataset.
+ for i in tr:
+
+ boxes = np.asarray(ground_truth[i])
+
+ # Iterate over all ground truth boxes for the current image.
+ for j in range(boxes.shape[0]):
+
+ if ignore_neutral_boxes and not (self.data_generator.eval_neutral is None):
+ if not self.data_generator.eval_neutral[i][j]:
+ # If this box is not supposed to be evaluation-neutral,
+ # increment the counter for the respective class ID.
+ class_id = boxes[j, class_id_index]
+ num_gt_per_class[class_id] += 1
+ else:
+ # If there is no such thing as evaluation-neutral boxes for
+ # our dataset, always increment the counter for the respective
+ # class ID.
+ class_id = boxes[j, class_id_index]
+ num_gt_per_class[class_id] += 1
+
+ self.num_gt_per_class = num_gt_per_class
+
+ if ret:
+ return num_gt_per_class
+
+ def match_predictions(self,
+ ignore_neutral_boxes=True,
+ matching_iou_threshold=0.5,
+ border_pixels='include',
+ sorting_algorithm='quicksort',
+ verbose=True,
+ ret=False):
+ '''
+ Matches predictions to ground truth boxes.
+
+ Note that `predict_on_dataset()` must be called before calling this method.
+
+ Arguments:
+ ignore_neutral_boxes (bool, optional): In case the data generator provides annotations indicating whether a ground truth
+ bounding box is supposed to either count or be neutral for the evaluation, this argument decides what to do with these
+ annotations. If `False`, even boxes that are annotated as neutral will be counted into the evaluation. If `True`,
+ neutral boxes will be ignored for the evaluation. An example for evaluation-neutrality are the ground truth boxes
+ annotated as "difficult" in the Pascal VOC datasets, which are usually treated as neutral for the evaluation.
+ matching_iou_threshold (float, optional): A prediction will be considered a true positive if it has a Jaccard overlap
+ of at least `matching_iou_threshold` with any ground truth bounding box of the same class.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+ sorting_algorithm (str, optional): Which sorting algorithm the matching algorithm should use. This argument accepts
+ any valid sorting algorithm for Numpy's `argsort()` function. You will usually want to choose between 'quicksort'
+                (fastest and most memory efficient, but not stable) and 'mergesort' (slightly slower and less memory efficient, but stable).
+ The official Matlab evaluation algorithm uses a stable sorting algorithm, so this algorithm is only guaranteed
+ to behave identically if you choose 'mergesort' as the sorting algorithm, but it will almost always behave identically
+ even if you choose 'quicksort' (but no guarantees).
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the true and false positives.
+
+ Returns:
+ None by default. Optionally, four nested lists containing the true positives, false positives, cumulative true positives,
+ and cumulative false positives for each class.
+ '''
+
+ if self.data_generator.labels is None:
+ raise ValueError("Matching predictions to ground truth boxes not possible, no ground truth given.")
+
+ if self.prediction_results is None:
+ raise ValueError("There are no prediction results. You must run `predict_on_dataset()` before calling this method.")
+
+ class_id_gt = self.gt_format['class_id']
+ xmin_gt = self.gt_format['xmin']
+ ymin_gt = self.gt_format['ymin']
+ xmax_gt = self.gt_format['xmax']
+ ymax_gt = self.gt_format['ymax']
+
+ # Convert the ground truth to a more efficient format for what we need
+ # to do, which is access ground truth by image ID repeatedly.
+ ground_truth = {}
+ eval_neutral_available = not (self.data_generator.eval_neutral is None) # Whether or not we have annotations to decide whether ground truth boxes should be neutral or not.
+ for i in range(len(self.data_generator.image_ids)):
+ image_id = str(self.data_generator.image_ids[i])
+ labels = self.data_generator.labels[i]
+ if ignore_neutral_boxes and eval_neutral_available:
+ ground_truth[image_id] = (np.asarray(labels), np.asarray(self.data_generator.eval_neutral[i]))
+ else:
+ ground_truth[image_id] = np.asarray(labels)
+
+        true_positives = [[]] # The true positives for each class, sorted by descending confidence.
+        false_positives = [[]] # The false positives for each class, sorted by descending confidence.
+ cumulative_true_positives = [[]]
+ cumulative_false_positives = [[]]
+
+ # Iterate over all classes.
+ for class_id in range(1, self.n_classes + 1):
+
+ predictions = self.prediction_results[class_id]
+
+ # Store the matching results in these lists:
+ true_pos = np.zeros(len(predictions), dtype=np.int) # 1 for every prediction that is a true positive, 0 otherwise
+ false_pos = np.zeros(len(predictions), dtype=np.int) # 1 for every prediction that is a false positive, 0 otherwise
+
+ # In case there are no predictions at all for this class, we're done here.
+ if len(predictions) == 0:
+ print("No predictions for class {}/{}".format(class_id, self.n_classes))
+ true_positives.append(true_pos)
+ false_positives.append(false_pos)
+ continue
+
+ # Convert the predictions list for this class into a structured array so that we can sort it by confidence.
+
+ # Get the number of characters needed to store the image ID strings in the structured array.
+ num_chars_per_image_id = len(str(predictions[0][0])) + 6 # Keep a few characters buffer in case some image IDs are longer than others.
+ # Create the data type for the structured array.
+ preds_data_type = np.dtype([('image_id', 'U{}'.format(num_chars_per_image_id)),
+ ('confidence', 'f4'),
+ ('xmin', 'f4'),
+ ('ymin', 'f4'),
+ ('xmax', 'f4'),
+ ('ymax', 'f4')])
+ # Create the structured array
+ predictions = np.array(predictions, dtype=preds_data_type)
+
+ # Sort the detections by decreasing confidence.
+ descending_indices = np.argsort(-predictions['confidence'], kind=sorting_algorithm)
+ predictions_sorted = predictions[descending_indices]
+
+ if verbose:
+ tr = trange(len(predictions), file=sys.stdout)
+ tr.set_description("Matching predictions to ground truth, class {}/{}.".format(class_id, self.n_classes))
+ else:
+                tr = range(len(predictions))
+
+ # Keep track of which ground truth boxes were already matched to a detection.
+ gt_matched = {}
+
+ # Iterate over all predictions.
+ for i in tr:
+
+ prediction = predictions_sorted[i]
+ image_id = prediction['image_id']
+ pred_box = np.asarray(list(prediction[['xmin', 'ymin', 'xmax', 'ymax']])) # Convert the structured array element to a regular array.
+
+ # Get the relevant ground truth boxes for this prediction,
+ # i.e. all ground truth boxes that match the prediction's
+ # image ID and class ID.
+
+ # The ground truth could either be a tuple with `(ground_truth_boxes, eval_neutral_boxes)`
+ # or only `ground_truth_boxes`.
+ if ignore_neutral_boxes and eval_neutral_available:
+ gt, eval_neutral = ground_truth[image_id]
+ else:
+ gt = ground_truth[image_id]
+ gt = np.asarray(gt)
+ class_mask = gt[:,class_id_gt] == class_id
+ gt = gt[class_mask]
+ if ignore_neutral_boxes and eval_neutral_available:
+ eval_neutral = eval_neutral[class_mask]
+
+ if gt.size == 0:
+ # If the image doesn't contain any objects of this class,
+ # the prediction becomes a false positive.
+ false_pos[i] = 1
+ continue
+
+ # Compute the IoU of this prediction with all ground truth boxes of the same class.
+ overlaps = iou(boxes1=gt[:,[xmin_gt, ymin_gt, xmax_gt, ymax_gt]],
+ boxes2=pred_box,
+ coords='corners',
+ mode='element-wise',
+ border_pixels=border_pixels)
+
+ # For each detection, match the ground truth box with the highest overlap.
+ # It's possible that the same ground truth box will be matched to multiple
+ # detections.
+ gt_match_index = np.argmax(overlaps)
+ gt_match_overlap = overlaps[gt_match_index]
+
+ if gt_match_overlap < matching_iou_threshold:
+ # False positive, IoU threshold violated:
+ # Those predictions whose matched overlap is below the threshold become
+ # false positives.
+ false_pos[i] = 1
+ else:
+ if not (ignore_neutral_boxes and eval_neutral_available) or (eval_neutral[gt_match_index] == False):
+ # If this is not a ground truth that is supposed to be evaluation-neutral
+ # (i.e. should be skipped for the evaluation) or if we don't even have the
+ # concept of neutral boxes.
+ if not (image_id in gt_matched):
+ # True positive:
+ # If the matched ground truth box for this prediction hasn't been matched to a
+ # different prediction already, we have a true positive.
+ true_pos[i] = 1
+ gt_matched[image_id] = np.zeros(shape=(gt.shape[0]), dtype=np.bool)
+ gt_matched[image_id][gt_match_index] = True
+ elif not gt_matched[image_id][gt_match_index]:
+ # True positive:
+ # If the matched ground truth box for this prediction hasn't been matched to a
+ # different prediction already, we have a true positive.
+ true_pos[i] = 1
+ gt_matched[image_id][gt_match_index] = True
+ else:
+ # False positive, duplicate detection:
+ # If the matched ground truth box for this prediction has already been matched
+ # to a different prediction previously, it is a duplicate detection for an
+ # already detected object, which counts as a false positive.
+ false_pos[i] = 1
+
+ true_positives.append(true_pos)
+ false_positives.append(false_pos)
+
+ cumulative_true_pos = np.cumsum(true_pos) # Cumulative sums of the true positives
+ cumulative_false_pos = np.cumsum(false_pos) # Cumulative sums of the false positives
+
+ cumulative_true_positives.append(cumulative_true_pos)
+ cumulative_false_positives.append(cumulative_false_pos)
+
+ self.true_positives = true_positives
+ self.false_positives = false_positives
+ self.cumulative_true_positives = cumulative_true_positives
+ self.cumulative_false_positives = cumulative_false_positives
+
+ if ret:
+ return true_positives, false_positives, cumulative_true_positives, cumulative_false_positives
+
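+    # Worked toy example (comment only) for the precision/recall computation below:
+    # suppose a class has 4 ground truth boxes and 4 predictions which, sorted by
+    # descending confidence, were matched as tp = [1, 0, 1, 1], fp = [0, 1, 0, 0].
+    # Then cumulative tp = [1, 1, 2, 3] and cumulative fp = [0, 1, 1, 1], so the
+    # cumulative precision is tp / (tp + fp) = [1.0, 0.5, 0.667, 0.75] and the
+    # cumulative recall is tp / 4 = [0.25, 0.25, 0.5, 0.75].
+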
+ def compute_precision_recall(self, verbose=True, ret=False):
+ '''
+ Computes the precisions and recalls for all classes.
+
+ Note that `match_predictions()` must be called before calling this method.
+
+ Arguments:
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the precisions and recalls.
+
+ Returns:
+ None by default. Optionally, two nested lists containing the cumulative precisions and recalls for each class.
+ '''
+
+ if (self.cumulative_true_positives is None) or (self.cumulative_false_positives is None):
+ raise ValueError("True and false positives not available. You must run `match_predictions()` before you call this method.")
+
+ if (self.num_gt_per_class is None):
+ raise ValueError("Number of ground truth boxes per class not available. You must run `get_num_gt_per_class()` before you call this method.")
+
+ cumulative_precisions = [[]]
+ cumulative_recalls = [[]]
+
+ # Iterate over all classes.
+ for class_id in range(1, self.n_classes + 1):
+
+ if verbose:
+ print("Computing precisions and recalls, class {}/{}".format(class_id, self.n_classes))
+
+ tp = self.cumulative_true_positives[class_id]
+ fp = self.cumulative_false_positives[class_id]
+
+
+ cumulative_precision = np.where(tp + fp > 0, tp / (tp + fp), 0) # 1D array with shape `(num_predictions,)`
+ cumulative_recall = tp / self.num_gt_per_class[class_id] # 1D array with shape `(num_predictions,)`
+
+ cumulative_precisions.append(cumulative_precision)
+ cumulative_recalls.append(cumulative_recall)
+
+ self.cumulative_precisions = cumulative_precisions
+ self.cumulative_recalls = cumulative_recalls
+
+ if ret:
+ return cumulative_precisions, cumulative_recalls
+
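+    # Continuing the toy example above (comment only): in 11-point 'sample' mode,
+    # the precision is sampled as the maximal cumulative precision over all
+    # predictions with recall >= t, for t = 0.0, 0.1, ..., 1.0. That yields 1.0
+    # for t in {0.0, 0.1, 0.2}, 0.75 for t in {0.3, ..., 0.7} and 0.0 for
+    # t in {0.8, 0.9, 1.0}, so the average precision is (3 * 1.0 + 5 * 0.75) / 11,
+    # i.e. roughly 0.61.
+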
+ def compute_average_precisions(self, mode='sample', num_recall_points=11, verbose=True, ret=False):
+ '''
+ Computes the average precision for each class.
+
+ Can compute the Pascal-VOC-style average precision in both the pre-2010 (k-point sampling)
+ and post-2010 (integration) algorithm versions.
+
+ Note that `compute_precision_recall()` must be called before calling this method.
+
+ Arguments:
+ mode (str, optional): Can be either 'sample' or 'integrate'. In the case of 'sample', the average precision will be computed
+ according to the Pascal VOC formula that was used up until VOC 2009, where the precision will be sampled for `num_recall_points`
+ recall values. In the case of 'integrate', the average precision will be computed according to the Pascal VOC formula that
+ was used from VOC 2010 onward, where the average precision will be computed by numerically integrating over the whole
+                precision-recall curve instead of sampling individual points from it. 'integrate' mode is basically just the limit case
+ of 'sample' mode as the number of sample points increases. For details, see the references below.
+ num_recall_points (int, optional): Only relevant if mode is 'sample'. The number of points to sample from the precision-recall-curve
+ to compute the average precisions. In other words, this is the number of equidistant recall values for which the resulting
+ precision will be computed. 11 points is the value used in the official Pascal VOC pre-2010 detection evaluation algorithm.
+ verbose (bool, optional): If `True`, will print out the progress during runtime.
+ ret (bool, optional): If `True`, returns the average precisions.
+
+ Returns:
+ None by default. Optionally, a list containing average precision for each class.
+
+ References:
+ http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/devkit_doc.html#sec:ap
+ '''
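+        # Worked toy example of the pre-2010 11-point 'sample' mode (illustrative
+        # numbers, not taken from a real run): suppose a class has cumulative
+        # precisions [1.0, 0.5, 0.67] and cumulative recalls [0.5, 0.5, 1.0]. For
+        # each recall threshold t in {0.0, 0.1, ..., 1.0} we take the highest
+        # precision among predictions with recall >= t: 1.0 for the six thresholds
+        # t <= 0.5 and 0.67 for the five thresholds t >= 0.6, so
+        # AP = (6*1.0 + 5*0.67)/11 ~ 0.85.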
+
+ if (self.cumulative_precisions is None) or (self.cumulative_recalls is None):
+ raise ValueError("Precisions and recalls not available. You must run `compute_precision_recall()` before you call this method.")
+
+ if not (mode in {'sample', 'integrate'}):
+ raise ValueError("`mode` can be either 'sample' or 'integrate', but received '{}'".format(mode))
+
+ average_precisions = [0.0]
+
+ # Iterate over all classes.
+ for class_id in range(1, self.n_classes + 1):
+
+ if verbose:
+ print("Computing average precision, class {}/{}".format(class_id, self.n_classes))
+
+ cumulative_precision = self.cumulative_precisions[class_id]
+ cumulative_recall = self.cumulative_recalls[class_id]
+ average_precision = 0.0
+
+ if mode == 'sample':
+
+ for t in np.linspace(start=0, stop=1, num=num_recall_points, endpoint=True):
+
+ cum_prec_recall_greater_t = cumulative_precision[cumulative_recall >= t]
+
+ if cum_prec_recall_greater_t.size == 0:
+ precision = 0.0
+ else:
+ precision = np.amax(cum_prec_recall_greater_t)
+
+ average_precision += precision
+
+ average_precision /= num_recall_points
+
+ elif mode == 'integrate':
+
+ # We will compute the precision at all unique recall values.
+ unique_recalls, unique_recall_indices, unique_recall_counts = np.unique(cumulative_recall, return_index=True, return_counts=True)
+
+ # Store the maximal precision for each recall value and the absolute difference
+                # between any two unique recall values in the lists below. The products of these
+                # two numbers constitute the rectangular areas whose sum will be our numerical
+ # integral.
+ maximal_precisions = np.zeros_like(unique_recalls)
+ recall_deltas = np.zeros_like(unique_recalls)
+
+ # Iterate over all unique recall values in reverse order. This saves a lot of computation:
+ # For each unique recall value `r`, we want to get the maximal precision value obtained
+ # for any recall value `r* >= r`. Once we know the maximal precision for the last `k` recall
+                # values after a given iteration, then in the next iteration, in order to compute the maximal
+ # precisions for the last `l > k` recall values, we only need to compute the maximal precision
+ # for `l - k` recall values and then take the maximum between that and the previously computed
+ # maximum instead of computing the maximum over all `l` values.
+                # We skip the very last recall value, since the precision between the last recall value
+                # and recall 1.0 is defined to be zero.
+ for i in range(len(unique_recalls)-2, -1, -1):
+ begin = unique_recall_indices[i]
+ end = unique_recall_indices[i + 1]
+ # When computing the maximal precisions, use the maximum of the previous iteration to
+ # avoid unnecessary repeated computation over the same precision values.
+ # The maximal precisions are the heights of the rectangle areas of our integral under
+ # the precision-recall curve.
+ maximal_precisions[i] = np.maximum(np.amax(cumulative_precision[begin:end]), maximal_precisions[i + 1])
+ # The differences between two adjacent recall values are the widths of our rectangle areas.
+ recall_deltas[i] = unique_recalls[i + 1] - unique_recalls[i]
+
+ average_precision = np.sum(maximal_precisions * recall_deltas)
+
+ average_precisions.append(average_precision)
+
+ self.average_precisions = average_precisions
+
+ if ret:
+ return average_precisions
+
+ def compute_mean_average_precision(self, ret=True):
+ '''
+ Computes the mean average precision over all classes.
+
+ Note that `compute_average_precisions()` must be called before calling this method.
+
+ Arguments:
+ ret (bool, optional): If `True`, returns the mean average precision.
+
+ Returns:
+ A float, the mean average precision, by default. Optionally, None.
+ '''
+
+ if self.average_precisions is None:
+ raise ValueError("Average precisions not available. You must run `compute_average_precisions()` before you call this method.")
+
+ mean_average_precision = np.average(self.average_precisions[1:]) # The first element is for the background class, so skip it.
+ self.mean_average_precision = mean_average_precision
+
+ if ret:
+ return mean_average_precision
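+
+# A minimal usage sketch of the full evaluation workflow, kept as a comment so the
+# module stays importable. `evaluator` stands for an instance of this class; the
+# constructor arguments and the exact signatures of the earlier steps
+# (`get_num_gt_per_class()`, `match_predictions()`) are defined further up in this
+# file and may take additional arguments:
+#
+#   evaluator.get_num_gt_per_class()
+#   evaluator.match_predictions()
+#   evaluator.compute_precision_recall()
+#   evaluator.compute_average_precisions(mode='integrate')
+#   mean_ap = evaluator.compute_mean_average_precision()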
diff --git a/engine/object_detection_branch/single_shot_detector/eval_utils/coco_utils.py b/engine/object_detection_branch/single_shot_detector/eval_utils/coco_utils.py
new file mode 100644
index 0000000..b0e88f8
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/eval_utils/coco_utils.py
@@ -0,0 +1,200 @@
+'''
+A few utilities that are useful when working with the MS COCO datasets.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import json
+from tqdm import trange
+from math import ceil
+import sys
+
+from data_generator.object_detection_2d_geometric_ops import Resize
+from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR
+from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
+from ssd_encoder_decoder.ssd_output_decoder import decode_detections
+from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
+
+def get_coco_category_maps(annotations_file):
+ '''
+ Builds dictionaries that map between MS COCO category IDs, transformed category IDs, and category names.
+    Unfortunately, the original MS COCO category IDs are not consecutive: the 80 category IDs are spread
+ across the integers 1 through 90 with some integers skipped. Since we usually use a one-hot
+ class representation in neural networks, we need to map these non-consecutive original COCO category
+ IDs (let's call them 'cats') to consecutive category IDs (let's call them 'classes').
+
+ Arguments:
+ annotations_file (str): The filepath to any MS COCO annotations JSON file.
+
+ Returns:
+ 1) cats_to_classes: A dictionary that maps between the original (keys) and the transformed category IDs (values).
+ 2) classes_to_cats: A dictionary that maps between the transformed (keys) and the original category IDs (values).
+ 3) cats_to_names: A dictionary that maps between original category IDs (keys) and the respective category names (values).
+ 4) classes_to_names: A list of the category names (values) with their indices representing the transformed IDs.
+ '''
+ with open(annotations_file, 'r') as f:
+ annotations = json.load(f)
+ cats_to_classes = {}
+ classes_to_cats = {}
+ cats_to_names = {}
+ classes_to_names = []
+ classes_to_names.append('background') # Need to add the background class first so that the indexing is right.
+ for i, cat in enumerate(annotations['categories']):
+ cats_to_classes[cat['id']] = i + 1
+ classes_to_cats[i + 1] = cat['id']
+ cats_to_names[cat['id']] = cat['name']
+ classes_to_names.append(cat['name'])
+
+ return cats_to_classes, classes_to_cats, cats_to_names, classes_to_names
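+
+# Illustrative usage (the annotations path below is a placeholder, not a file
+# shipped with this repository):
+#
+#   cats_to_classes, classes_to_cats, cats_to_names, classes_to_names = \
+#       get_coco_category_maps('annotations/instances_val2017.json')
+#   # classes_to_names[0] == 'background'; classes_to_names[1:] are the COCO names.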
+
+def predict_all_to_json(out_file,
+ model,
+ img_height,
+ img_width,
+ classes_to_cats,
+ data_generator,
+ batch_size,
+ data_generator_mode='resize',
+ model_mode='training',
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ pred_coords='centroids',
+ normalize_coords=True):
+ '''
+ Runs detection predictions over the whole dataset given a model and saves them in a JSON file
+ in the MS COCO detection results format.
+
+ Arguments:
+ out_file (str): The file name (full path) under which to save the results JSON file.
+ model (Keras model): A Keras SSD model object.
+ img_height (int): The input image height for the model.
+ img_width (int): The input image width for the model.
+ classes_to_cats (dict): A dictionary that maps the consecutive class IDs predicted by the model
+ to the non-consecutive original MS COCO category IDs.
+ data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset.
+ batch_size (int): The batch size for the evaluation.
+ data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images will
+ be resized (i.e. warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
+ If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
+ and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
+ model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
+ This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to
+ the model documentation for the meaning of the individual modes.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage. Defaults to 200, following the paper.
+        pred_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+
+ Returns:
+ None.
+ '''
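+
+    # Each entry written to `out_file` follows the MS COCO detection results format,
+    # i.e. the file is a JSON list of dicts such as (illustrative values):
+    #   {"image_id": 42, "category_id": 18, "bbox": [258.2, 41.3, 48.6, 93.1], "score": 0.736}
+    # where `bbox` is `[xmin, ymin, width, height]` in absolute pixel coordinates.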
+
+ convert_to_3_channels = ConvertTo3Channels()
+ resize = Resize(height=img_height,width=img_width)
+ if data_generator_mode == 'resize':
+ transformations = [convert_to_3_channels,
+ resize]
+ elif data_generator_mode == 'pad':
+ random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, clip_boxes=False)
+ transformations = [convert_to_3_channels,
+ random_pad,
+ resize]
+ else:
+ raise ValueError("Unexpected argument value: `data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode))
+
+ # Set the generator parameters.
+ generator = data_generator.generate(batch_size=batch_size,
+ shuffle=False,
+ transformations=transformations,
+ label_encoder=None,
+ returns={'processed_images',
+ 'image_ids',
+ 'inverse_transform'},
+ keep_images_without_gt=True)
+ # Put the results in this list.
+ results = []
+ # Compute the number of batches to iterate over the entire dataset.
+ n_images = data_generator.get_dataset_size()
+ print("Number of images in the evaluation dataset: {}".format(n_images))
+ n_batches = int(ceil(n_images / batch_size))
+ # Loop over all batches.
+ tr = trange(n_batches, file=sys.stdout)
+ tr.set_description('Producing results file')
+ for i in tr:
+ # Generate batch.
+ batch_X, batch_image_ids, batch_inverse_transforms = next(generator)
+ # Predict.
+ y_pred = model.predict(batch_X)
+ # If the model was created in 'training' mode, the raw predictions need to
+ # be decoded and filtered, otherwise that's already taken care of.
+ if model_mode == 'training':
+ # Decode.
+ y_pred = decode_detections(y_pred,
+ confidence_thresh=confidence_thresh,
+ iou_threshold=iou_threshold,
+ top_k=top_k,
+ input_coords=pred_coords,
+ normalize_coords=normalize_coords,
+ img_height=img_height,
+ img_width=img_width)
+ else:
+ # Filter out the all-zeros dummy elements of `y_pred`.
+ y_pred_filtered = []
+ for i in range(len(y_pred)):
+ y_pred_filtered.append(y_pred[i][y_pred[i,:,0] != 0])
+ y_pred = y_pred_filtered
+ # Convert the predicted box coordinates for the original images.
+ y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)
+
+ # Convert each predicted box into the results format.
+ for k, batch_item in enumerate(y_pred):
+ for box in batch_item:
+ class_id = box[0]
+ # Transform the consecutive class IDs back to the original COCO category IDs.
+ cat_id = classes_to_cats[class_id]
+ # Round the box coordinates to reduce the JSON file size.
+ xmin = float(round(box[2], 1))
+ ymin = float(round(box[3], 1))
+ xmax = float(round(box[4], 1))
+ ymax = float(round(box[5], 1))
+ width = xmax - xmin
+ height = ymax - ymin
+ bbox = [xmin, ymin, width, height]
+ result = {}
+ result['image_id'] = batch_image_ids[k]
+ result['category_id'] = cat_id
+ result['score'] = float(round(box[1], 3))
+ result['bbox'] = bbox
+ results.append(result)
+
+ with open(out_file, 'w') as f:
+ json.dump(results, f)
+
+ print("Prediction results saved in '{}'".format(out_file))
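+
+# A hedged usage sketch; the model, the `DataGenerator` instance and the output
+# path are assumptions, not values used elsewhere in this repository:
+#
+#   predict_all_to_json(out_file='ssd300_coco_detections.json',
+#                       model=ssd_model,
+#                       img_height=300, img_width=300,
+#                       classes_to_cats=classes_to_cats,
+#                       data_generator=val_dataset,
+#                       batch_size=20,
+#                       data_generator_mode='resize',
+#                       model_mode='inference',
+#                       confidence_thresh=0.01, iou_threshold=0.45, top_k=200)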
diff --git a/engine/object_detection_branch/single_shot_detector/keras_layers/__init__.py b/engine/object_detection_branch/single_shot_detector/keras_layers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_AnchorBoxes.py b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_AnchorBoxes.py
new file mode 100644
index 0000000..d294358
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_AnchorBoxes.py
@@ -0,0 +1,281 @@
+'''
+A custom Keras layer to generate anchor boxes.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+
+import numpy as np
+
+import keras.backend as K
+from keras.engine.topology import InputSpec
+from keras.engine.topology import Layer
+
+from engine.object_detection_branch.single_shot_detector.bounding_box_utils.bounding_box_utils import convert_coordinates
+
+
+class AnchorBoxes(Layer):
+ '''
+ A Keras layer to create an output tensor containing anchor box coordinates
+ and variances based on the input tensor and the passed arguments.
+
+ A set of 2D anchor boxes of different aspect ratios is created for each spatial unit of
+ the input tensor. The number of anchor boxes created per unit depends on the arguments
+    `aspect_ratios` and `two_boxes_for_ar1`; in the default case it is 4. The boxes
+ are parameterized by the coordinate tuple `(xmin, xmax, ymin, ymax)`.
+
+ The logic implemented by this layer is identical to the logic in the module
+ `ssd_box_encode_decode_utils.py`.
+
+ The purpose of having this layer in the network is to make the model self-sufficient
+ at inference time. Since the model is predicting offsets to the anchor boxes
+ (rather than predicting absolute box coordinates directly), one needs to know the anchor
+ box coordinates in order to construct the final prediction boxes from the predicted offsets.
+ If the model's output tensor did not contain the anchor box coordinates, the necessary
+ information to convert the predicted offsets back to absolute coordinates would be missing
+ in the model output. The reason why it is necessary to predict offsets to the anchor boxes
+ rather than to predict absolute box coordinates directly is explained in `README.md`.
+
+ Input shape:
+ 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
+ or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
+
+ Output shape:
+ 5D tensor of shape `(batch, height, width, n_boxes, 8)`. The last axis contains
+ the four anchor box coordinates and the four variance values for each box.
+ '''
+
+ def __init__(self,
+ img_height,
+ img_width,
+ this_scale,
+ next_scale,
+ aspect_ratios=[0.5, 1.0, 2.0],
+ two_boxes_for_ar1=True,
+ this_steps=None,
+ this_offsets=None,
+ clip_boxes=False,
+ variances=[0.1, 0.1, 0.2, 0.2],
+ coords='centroids',
+ normalize_coords=False,
+ **kwargs):
+ '''
+ All arguments need to be set to the same values as in the box encoding process, otherwise the behavior is undefined.
+ Some of these arguments are explained in more detail in the documentation of the `SSDBoxEncoder` class.
+
+ Arguments:
+ img_height (int): The height of the input images.
+ img_width (int): The width of the input images.
+ this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
+ as a fraction of the shorter side of the input image.
+ next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
+ `self.two_boxes_for_ar1 == True`.
+ aspect_ratios (list, optional): The list of aspect ratios for which default boxes are to be
+ generated for this layer.
+ two_boxes_for_ar1 (bool, optional): Only relevant if `aspect_ratios` contains 1.
+ If `True`, two default boxes will be generated for aspect ratio 1. The first will be generated
+ using the scaling factor for the respective layer, the second one will be generated using
+ geometric mean of said scaling factor and next bigger scaling factor.
+ clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
+ variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
+ its respective variance value.
+ coords (str, optional): The box coordinate format to be used internally in the model (i.e. this is not the input format
+ of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
+ 'corners' for the format `(xmin, ymin, xmax, ymax)`, or 'minmax' for the format `(xmin, xmax, ymin, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model uses relative instead of absolute coordinates,
+ i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
+ '''
+ if K.backend() != 'tensorflow':
+ raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend()))
+
+ if (this_scale < 0) or (next_scale < 0) or (this_scale > 1):
+            raise ValueError("`this_scale` must be in [0, 1] and `next_scale` must be >= 0, but `this_scale` == {}, `next_scale` == {}".format(this_scale, next_scale))
+
+ if len(variances) != 4:
+            raise ValueError("4 variance values must be passed, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+ self.img_height = img_height
+ self.img_width = img_width
+ self.this_scale = this_scale
+ self.next_scale = next_scale
+ self.aspect_ratios = aspect_ratios
+ self.two_boxes_for_ar1 = two_boxes_for_ar1
+ self.this_steps = this_steps
+ self.this_offsets = this_offsets
+ self.clip_boxes = clip_boxes
+ self.variances = variances
+ self.coords = coords
+ self.normalize_coords = normalize_coords
+ # Compute the number of boxes per cell
+ if (1 in aspect_ratios) and two_boxes_for_ar1:
+ self.n_boxes = len(aspect_ratios) + 1
+ else:
+ self.n_boxes = len(aspect_ratios)
+ super(AnchorBoxes, self).__init__(**kwargs)
+
+ def build(self, input_shape):
+ self.input_spec = [InputSpec(shape=input_shape)]
+ super(AnchorBoxes, self).build(input_shape)
+
+ def call(self, x, mask=None):
+ '''
+ Return an anchor box tensor based on the shape of the input tensor.
+
+ The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`.
+
+ Note that this tensor does not participate in any graph computations at runtime. It is being created
+ as a constant once during graph creation and is just being output along with the rest of the model output
+ during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient
+ to convert the resulting Numpy array into a Keras tensor at the very end before outputting it.
+
+ Arguments:
+ x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
+ or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this
+ layer must be the output of the localization predictor layer.
+ '''
+
+ # Compute box width and height for each aspect ratio
+ # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
+ size = min(self.img_height, self.img_width)
+        # Compute the box widths and heights for all aspect ratios
+ wh_list = []
+ for ar in self.aspect_ratios:
+ if (ar == 1):
+ # Compute the regular anchor box for aspect ratio 1.
+ box_height = box_width = self.this_scale * size
+ wh_list.append((box_width, box_height))
+ if self.two_boxes_for_ar1:
+ # Compute one slightly larger version using the geometric mean of this scale value and the next.
+ box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size
+ wh_list.append((box_width, box_height))
+ else:
+ box_height = self.this_scale * size / np.sqrt(ar)
+ box_width = self.this_scale * size * np.sqrt(ar)
+ wh_list.append((box_width, box_height))
+ wh_list = np.array(wh_list)
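+        # For example (illustrative numbers): with `this_scale=0.2`, a 300x300 input
+        # (so `size=300`) and `ar=2.0`, a box is w = 0.2*300*sqrt(2) ~ 84.9 px wide
+        # and h = 0.2*300/sqrt(2) ~ 42.4 px high; for `ar=1.0` it is a 60x60 px square.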
+
+ # We need the shape of the input tensor
+ if K.image_dim_ordering() == 'tf':
+ batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
+ else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future
+ batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape
+
+ # Compute the grid of box center points. They are identical for all aspect ratios.
+
+ # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
+ if (self.this_steps is None):
+ step_height = self.img_height / feature_map_height
+ step_width = self.img_width / feature_map_width
+ else:
+ if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2):
+ step_height = self.this_steps[0]
+ step_width = self.this_steps[1]
+ elif isinstance(self.this_steps, (int, float)):
+ step_height = self.this_steps
+ step_width = self.this_steps
+ # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
+ if (self.this_offsets is None):
+ offset_height = 0.5
+ offset_width = 0.5
+ else:
+ if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2):
+ offset_height = self.this_offsets[0]
+ offset_width = self.this_offsets[1]
+ elif isinstance(self.this_offsets, (int, float)):
+ offset_height = self.this_offsets
+ offset_width = self.this_offsets
+ # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
+ cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height)
+ cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width)
+ cx_grid, cy_grid = np.meshgrid(cx, cy)
+ cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
+ cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
+
+ # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
+ # where the last dimension will contain `(cx, cy, w, h)`
+ boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4))
+
+ boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx
+ boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy
+ boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
+ boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h
+
+ # Convert `(cx, cy, w, h)` to `(xmin, xmax, ymin, ymax)`
+ boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')
+
+ # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
+ if self.clip_boxes:
+ x_coords = boxes_tensor[:,:,:,[0, 2]]
+ x_coords[x_coords >= self.img_width] = self.img_width - 1
+ x_coords[x_coords < 0] = 0
+ boxes_tensor[:,:,:,[0, 2]] = x_coords
+ y_coords = boxes_tensor[:,:,:,[1, 3]]
+ y_coords[y_coords >= self.img_height] = self.img_height - 1
+ y_coords[y_coords < 0] = 0
+ boxes_tensor[:,:,:,[1, 3]] = y_coords
+
+ # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
+ if self.normalize_coords:
+ boxes_tensor[:, :, :, [0, 2]] /= self.img_width
+ boxes_tensor[:, :, :, [1, 3]] /= self.img_height
+
+ # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
+ if self.coords == 'centroids':
+ # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
+ boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
+ elif self.coords == 'minmax':
+            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
+ boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')
+
+ # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
+ # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
+ variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
+ variances_tensor += self.variances # Long live broadcasting
+ # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
+ boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)
+
+        # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along that dimension.
+ # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
+ boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
+ boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1))
+
+ return boxes_tensor
+
+ def compute_output_shape(self, input_shape):
+ if K.image_dim_ordering() == 'tf':
+ batch_size, feature_map_height, feature_map_width, feature_map_channels = input_shape
+ else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future
+ batch_size, feature_map_channels, feature_map_height, feature_map_width = input_shape
+ return (batch_size, feature_map_height, feature_map_width, self.n_boxes, 8)
+
+ def get_config(self):
+ config = {
+ 'img_height': self.img_height,
+ 'img_width': self.img_width,
+ 'this_scale': self.this_scale,
+ 'next_scale': self.next_scale,
+ 'aspect_ratios': list(self.aspect_ratios),
+ 'two_boxes_for_ar1': self.two_boxes_for_ar1,
+ 'clip_boxes': self.clip_boxes,
+ 'variances': list(self.variances),
+ 'coords': self.coords,
+ 'normalize_coords': self.normalize_coords
+ }
+ base_config = super(AnchorBoxes, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
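+
+# A minimal usage sketch (the values and the tensor name `loc_predictor_output`
+# are illustrative, not this repository's actual SSD configuration). The layer is
+# applied to the output of a localization predictor so that the anchor coordinates
+# are carried along with the predictions:
+#
+#   anchors = AnchorBoxes(img_height=300, img_width=300,
+#                         this_scale=0.2, next_scale=0.37,
+#                         aspect_ratios=[0.5, 1.0, 2.0],
+#                         two_boxes_for_ar1=True,
+#                         variances=[0.1, 0.1, 0.2, 0.2],
+#                         coords='centroids',
+#                         normalize_coords=True)(loc_predictor_output)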
diff --git a/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_DecodeDetections.py b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_DecodeDetections.py
new file mode 100644
index 0000000..3fc4d57
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_DecodeDetections.py
@@ -0,0 +1,283 @@
+'''
+A custom Keras layer to decode the raw SSD prediction output. Corresponds to the
+`DetectionOutput` layer type in the original Caffe implementation of SSD.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+import tensorflow as tf
+import keras.backend as K
+from keras.engine.topology import InputSpec
+from keras.engine.topology import Layer
+
+class DecodeDetections(Layer):
+ '''
+ A Keras layer to decode the raw SSD prediction output.
+
+ Input shape:
+ 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`.
+
+ Output shape:
+ 3D tensor of shape `(batch_size, top_k, 6)`.
+ '''
+
+ def __init__(self,
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ nms_max_output_size=400,
+ coords='centroids',
+ normalize_coords=True,
+ img_height=None,
+ img_width=None,
+ **kwargs):
+ '''
+ All default argument values follow the Caffe implementation.
+
+ Arguments:
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum
+ suppression.
+ coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids'
+ i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are
+ currently not supported.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ '''
+ if K.backend() != 'tensorflow':
+ raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend()))
+
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ if coords != 'centroids':
+ raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.")
+
+ # We need these members for the config.
+ self.confidence_thresh = confidence_thresh
+ self.iou_threshold = iou_threshold
+ self.top_k = top_k
+ self.normalize_coords = normalize_coords
+ self.img_height = img_height
+ self.img_width = img_width
+ self.coords = coords
+ self.nms_max_output_size = nms_max_output_size
+
+ # We need these members for TensorFlow.
+ self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh')
+ self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold')
+ self.tf_top_k = tf.constant(self.top_k, name='top_k')
+ self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords')
+ self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height')
+ self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width')
+ self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size')
+
+ super(DecodeDetections, self).__init__(**kwargs)
+
+ def build(self, input_shape):
+ self.input_spec = [InputSpec(shape=input_shape)]
+ super(DecodeDetections, self).build(input_shape)
+
+ def call(self, y_pred, mask=None):
+ '''
+ Returns:
+ 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded
+ to always yield `top_k` predictions per batch item. The last axis contains
+ the coordinates for each predicted box in the format
+ `[class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+
+ #####################################################################################
+ # 1. Convert the box coordinates from predicted anchor box offsets to predicted
+ # absolute coordinates
+ #####################################################################################
+
+ # Convert anchor box offsets to image offsets.
+ cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor
+ cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor
+ w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor
+ h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor
+
+ # Convert 'centroids' to 'corners'.
+ xmin = cx - 0.5 * w
+ ymin = cy - 0.5 * h
+ xmax = cx + 0.5 * w
+ ymax = cy + 0.5 * h
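+        # Illustrative decoding example (made-up numbers): with an anchor
+        # (cx_anchor, cy_anchor, w_anchor, h_anchor) = (150, 100, 100, 50),
+        # variances (0.1, 0.1, 0.2, 0.2) and predicted offsets (0.5, 0.0, 0.2, 0.0),
+        # we get cx = 0.5*0.1*100 + 150 = 155, cy = 100, w = exp(0.2*0.2)*100 ~ 104.1
+        # and h = 50, hence xmin ~ 103.0 and xmax ~ 207.0.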
+
+ # If the model predicts box coordinates relative to the image dimensions and they are supposed
+ # to be converted back to absolute coordinates, do that.
+ def normalized_coords():
+ xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1)
+ ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1)
+ xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1)
+ ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1)
+ return xmin1, ymin1, xmax1, ymax1
+ def non_normalized_coords():
+ return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1)
+
+ xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords)
+
+ # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor.
+ y_pred = tf.concat(values=[y_pred[...,:-12], xmin, ymin, xmax, ymax], axis=-1)
+
+ #####################################################################################
+ # 2. Perform confidence thresholding, per-class non-maximum suppression, and
+ # top-k filtering.
+ #####################################################################################
+
+ batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32
+ n_boxes = tf.shape(y_pred)[1]
+ n_classes = y_pred.shape[2] - 4
+ class_indices = tf.range(1, n_classes)
+
+ # Create a function that filters the predictions for the given batch item. Specifically, it performs:
+ # - confidence thresholding
+ # - non-maximum suppression (NMS)
+ # - top-k filtering
+ def filter_predictions(batch_item):
+
+ # Create a function that filters the predictions for one single class.
+ def filter_single_class(index):
+
+ # From a tensor of shape (n_boxes, n_classes + 4 coordinates) extract
+ # a tensor of shape (n_boxes, 1 + 4 coordinates) that contains the
+                # confidence values for just one class, determined by `index`.
+ confidences = tf.expand_dims(batch_item[..., index], axis=-1)
+ class_id = tf.fill(dims=tf.shape(confidences), value=tf.to_float(index))
+ box_coordinates = batch_item[...,-4:]
+
+ single_class = tf.concat([class_id, confidences, box_coordinates], axis=-1)
+
+ # Apply confidence thresholding with respect to the class defined by `index`.
+ threshold_met = single_class[:,1] > self.tf_confidence_thresh
+ single_class = tf.boolean_mask(tensor=single_class,
+ mask=threshold_met)
+
+ # If any boxes made the threshold, perform NMS.
+ def perform_nms():
+ scores = single_class[...,1]
+
+ # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`.
+ xmin = tf.expand_dims(single_class[...,-4], axis=-1)
+ ymin = tf.expand_dims(single_class[...,-3], axis=-1)
+ xmax = tf.expand_dims(single_class[...,-2], axis=-1)
+ ymax = tf.expand_dims(single_class[...,-1], axis=-1)
+ boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1)
+
+ maxima_indices = tf.image.non_max_suppression(boxes=boxes,
+ scores=scores,
+ max_output_size=self.tf_nms_max_output_size,
+ iou_threshold=self.iou_threshold,
+ name='non_maximum_suppresion')
+ maxima = tf.gather(params=single_class,
+ indices=maxima_indices,
+ axis=0)
+ return maxima
+
+ def no_confident_predictions():
+ return tf.constant(value=0.0, shape=(1,6))
+
+ single_class_nms = tf.cond(tf.equal(tf.size(single_class), 0), no_confident_predictions, perform_nms)
+
+ # Make sure `single_class` is exactly `self.nms_max_output_size` elements long.
+ padded_single_class = tf.pad(tensor=single_class_nms,
+ paddings=[[0, self.tf_nms_max_output_size - tf.shape(single_class_nms)[0]], [0, 0]],
+ mode='CONSTANT',
+ constant_values=0.0)
+
+ return padded_single_class
+
+ # Iterate `filter_single_class()` over all class indices.
+ filtered_single_classes = tf.map_fn(fn=lambda i: filter_single_class(i),
+ elems=tf.range(1,n_classes),
+ dtype=tf.float32,
+ parallel_iterations=128,
+ back_prop=False,
+ swap_memory=False,
+ infer_shape=True,
+ name='loop_over_classes')
+
+ # Concatenate the filtered results for all individual classes to one tensor.
+ filtered_predictions = tf.reshape(tensor=filtered_single_classes, shape=(-1,6))
+
+ # Perform top-k filtering for this batch item or pad it in case there are
+ # fewer than `self.top_k` boxes left at this point. Either way, produce a
+ # tensor of length `self.top_k`. By the time we return the final results tensor
+ # for the whole batch, all batch items must have the same number of predicted
+            # boxes so that the tensor dimensions are homogeneous. If fewer than `self.top_k`
+ # predictions are left after the filtering process above, we pad the missing
+ # predictions with zeros as dummy entries.
+ def top_k():
+ return tf.gather(params=filtered_predictions,
+ indices=tf.nn.top_k(filtered_predictions[:, 1], k=self.tf_top_k, sorted=True).indices,
+ axis=0)
+ def pad_and_top_k():
+ padded_predictions = tf.pad(tensor=filtered_predictions,
+ paddings=[[0, self.tf_top_k - tf.shape(filtered_predictions)[0]], [0, 0]],
+ mode='CONSTANT',
+ constant_values=0.0)
+ return tf.gather(params=padded_predictions,
+ indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices,
+ axis=0)
+
+ top_k_boxes = tf.cond(tf.greater_equal(tf.shape(filtered_predictions)[0], self.tf_top_k), top_k, pad_and_top_k)
+
+ return top_k_boxes
+
+ # Iterate `filter_predictions()` over all batch items.
+ output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x),
+ elems=y_pred,
+ dtype=None,
+ parallel_iterations=128,
+ back_prop=False,
+ swap_memory=False,
+ infer_shape=True,
+ name='loop_over_batch')
+
+ return output_tensor
+
+ def compute_output_shape(self, input_shape):
+ batch_size, n_boxes, last_axis = input_shape
+ return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates)
+
+ def get_config(self):
+ config = {
+ 'confidence_thresh': self.confidence_thresh,
+ 'iou_threshold': self.iou_threshold,
+ 'top_k': self.top_k,
+ 'nms_max_output_size': self.nms_max_output_size,
+ 'coords': self.coords,
+ 'normalize_coords': self.normalize_coords,
+ 'img_height': self.img_height,
+ 'img_width': self.img_width,
+ }
+ base_config = super(DecodeDetections, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
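+
+# A minimal usage sketch (values are illustrative): the layer is typically appended
+# to the raw SSD predictions when building an inference-time model, e.g.
+#
+#   decoded_predictions = DecodeDetections(confidence_thresh=0.5,
+#                                          iou_threshold=0.45,
+#                                          top_k=200,
+#                                          normalize_coords=True,
+#                                          img_height=300,
+#                                          img_width=300)(raw_predictions)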
diff --git a/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_DecodeDetectionsFast.py b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_DecodeDetectionsFast.py
new file mode 100644
index 0000000..f8ab221
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_DecodeDetectionsFast.py
@@ -0,0 +1,266 @@
+'''
+A custom Keras layer to decode the raw SSD prediction output. This is a modified
+and more efficient version of the `DetectionOutput` layer type in the original Caffe
+implementation of SSD. For a faithful replication of the original layer, please
+refer to the `DecodeDetections` layer.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+import tensorflow as tf
+import keras.backend as K
+from keras.engine.topology import InputSpec
+from keras.engine.topology import Layer
+
+class DecodeDetectionsFast(Layer):
+ '''
+ A Keras layer to decode the raw SSD prediction output.
+
+ Input shape:
+ 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`.
+
+ Output shape:
+ 3D tensor of shape `(batch_size, top_k, 6)`.
+ '''
+
+ def __init__(self,
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ nms_max_output_size=400,
+ coords='centroids',
+ normalize_coords=True,
+ img_height=None,
+ img_width=None,
+ **kwargs):
+ '''
+ All default argument values follow the Caffe implementation.
+
+ Arguments:
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum
+ suppression.
+ coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids'
+ i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are
+ currently not supported.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ '''
+ if K.backend() != 'tensorflow':
+ raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend()))
+
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ if coords != 'centroids':
+ raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.")
+
+ # We need these members for the config.
+ self.confidence_thresh = confidence_thresh
+ self.iou_threshold = iou_threshold
+ self.top_k = top_k
+ self.normalize_coords = normalize_coords
+ self.img_height = img_height
+ self.img_width = img_width
+ self.coords = coords
+ self.nms_max_output_size = nms_max_output_size
+
+ # We need these members for TensorFlow.
+ self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh')
+ self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold')
+ self.tf_top_k = tf.constant(self.top_k, name='top_k')
+ self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords')
+ self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height')
+ self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width')
+ self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size')
+
+ super(DecodeDetectionsFast, self).__init__(**kwargs)
+
+ def build(self, input_shape):
+ self.input_spec = [InputSpec(shape=input_shape)]
+ super(DecodeDetectionsFast, self).build(input_shape)
+
+ def call(self, y_pred, mask=None):
+ '''
+ Returns:
+ 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded
+ to always yield `top_k` predictions per batch item. The last axis contains
+ the coordinates for each predicted box in the format
+ `[class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+
+ #####################################################################################
+ # 1. Convert the box coordinates from predicted anchor box offsets to predicted
+ # absolute coordinates
+ #####################################################################################
+
+ # Extract the predicted class IDs as the indices of the highest confidence values.
+ class_ids = tf.expand_dims(tf.to_float(tf.argmax(y_pred[...,:-12], axis=-1)), axis=-1)
+ # Extract the confidences of the maximal classes.
+ confidences = tf.reduce_max(y_pred[...,:-12], axis=-1, keep_dims=True)
+
+ # Convert anchor box offsets to image offsets.
+ cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor
+ cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor
+ w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor
+ h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor
+
+ # Convert 'centroids' to 'corners'.
+ xmin = cx - 0.5 * w
+ ymin = cy - 0.5 * h
+ xmax = cx + 0.5 * w
+ ymax = cy + 0.5 * h
+
+ # If the model predicts box coordinates relative to the image dimensions and they are supposed
+ # to be converted back to absolute coordinates, do that.
+ def normalized_coords():
+ xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1)
+ ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1)
+ xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1)
+ ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1)
+ return xmin1, ymin1, xmax1, ymax1
+ def non_normalized_coords():
+ return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1)
+
+ xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords)
+
+ # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor.
+ y_pred = tf.concat(values=[class_ids, confidences, xmin, ymin, xmax, ymax], axis=-1)
+
+ #####################################################################################
+ # 2. Perform confidence thresholding, non-maximum suppression, and top-k filtering.
+ #####################################################################################
+
+ batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32
+ n_boxes = tf.shape(y_pred)[1]
+ n_classes = y_pred.shape[2] - 4
+ class_indices = tf.range(1, n_classes)
+
+ # Create a function that filters the predictions for the given batch item. Specifically, it performs:
+ # - confidence thresholding
+ # - non-maximum suppression (NMS)
+ # - top-k filtering
+ def filter_predictions(batch_item):
+
+ # Keep only the non-background boxes.
+ positive_boxes = tf.not_equal(batch_item[...,0], 0.0)
+ predictions = tf.boolean_mask(tensor=batch_item,
+ mask=positive_boxes)
+
+ def perform_confidence_thresholding():
+ # Apply confidence thresholding.
+ threshold_met = predictions[:,1] > self.tf_confidence_thresh
+ return tf.boolean_mask(tensor=predictions,
+ mask=threshold_met)
+ def no_positive_boxes():
+ return tf.constant(value=0.0, shape=(1,6))
+
+ # If there are any positive predictions, perform confidence thresholding.
+ predictions_conf_thresh = tf.cond(tf.equal(tf.size(predictions), 0), no_positive_boxes, perform_confidence_thresholding)
+
+ def perform_nms():
+ scores = predictions_conf_thresh[...,1]
+
+ # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`.
+ xmin = tf.expand_dims(predictions_conf_thresh[...,-4], axis=-1)
+ ymin = tf.expand_dims(predictions_conf_thresh[...,-3], axis=-1)
+ xmax = tf.expand_dims(predictions_conf_thresh[...,-2], axis=-1)
+ ymax = tf.expand_dims(predictions_conf_thresh[...,-1], axis=-1)
+ boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1)
+
+ maxima_indices = tf.image.non_max_suppression(boxes=boxes,
+ scores=scores,
+ max_output_size=self.tf_nms_max_output_size,
+ iou_threshold=self.iou_threshold,
+ name='non_maximum_suppresion')
+ maxima = tf.gather(params=predictions_conf_thresh,
+ indices=maxima_indices,
+ axis=0)
+ return maxima
+ def no_confident_predictions():
+ return tf.constant(value=0.0, shape=(1,6))
+
+ # If any boxes made the threshold, perform NMS.
+ predictions_nms = tf.cond(tf.equal(tf.size(predictions_conf_thresh), 0), no_confident_predictions, perform_nms)
+
+ # Perform top-k filtering for this batch item or pad it in case there are
+ # fewer than `self.top_k` boxes left at this point. Either way, produce a
+ # tensor of length `self.top_k`. By the time we return the final results tensor
+ # for the whole batch, all batch items must have the same number of predicted
+            # boxes so that the tensor dimensions are homogeneous. If fewer than `self.top_k`
+ # predictions are left after the filtering process above, we pad the missing
+ # predictions with zeros as dummy entries.
+ def top_k():
+ return tf.gather(params=predictions_nms,
+ indices=tf.nn.top_k(predictions_nms[:, 1], k=self.tf_top_k, sorted=True).indices,
+ axis=0)
+ def pad_and_top_k():
+ padded_predictions = tf.pad(tensor=predictions_nms,
+ paddings=[[0, self.tf_top_k - tf.shape(predictions_nms)[0]], [0, 0]],
+ mode='CONSTANT',
+ constant_values=0.0)
+ return tf.gather(params=padded_predictions,
+ indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices,
+ axis=0)
+
+ top_k_boxes = tf.cond(tf.greater_equal(tf.shape(predictions_nms)[0], self.tf_top_k), top_k, pad_and_top_k)
+
+ return top_k_boxes
+
+ # Iterate `filter_predictions()` over all batch items.
+ output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x),
+ elems=y_pred,
+ dtype=None,
+ parallel_iterations=128,
+ back_prop=False,
+ swap_memory=False,
+ infer_shape=True,
+ name='loop_over_batch')
+
+ return output_tensor
+
+ def compute_output_shape(self, input_shape):
+ batch_size, n_boxes, last_axis = input_shape
+ return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates)
+
+ def get_config(self):
+ config = {
+ 'confidence_thresh': self.confidence_thresh,
+ 'iou_threshold': self.iou_threshold,
+ 'top_k': self.top_k,
+ 'nms_max_output_size': self.nms_max_output_size,
+ 'coords': self.coords,
+ 'normalize_coords': self.normalize_coords,
+ 'img_height': self.img_height,
+ 'img_width': self.img_width,
+ }
+ base_config = super(DecodeDetectionsFast, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
diff --git a/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_L2Normalization.py b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_L2Normalization.py
new file mode 100644
index 0000000..e2c71bf
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/keras_layers/keras_layer_L2Normalization.py
@@ -0,0 +1,70 @@
+'''
+A custom Keras layer to perform L2-normalization.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+import keras.backend as K
+from keras.engine.topology import InputSpec
+from keras.engine.topology import Layer
+
+class L2Normalization(Layer):
+ '''
+ Performs L2 normalization on the input tensor with a learnable scaling parameter
+ as described in the paper "Parsenet: Looking Wider to See Better" (see references)
+ and as used in the original SSD model.
+
+ Arguments:
+ gamma_init (int): The initial scaling parameter. Defaults to 20 following the
+ SSD paper.
+
+ Input shape:
+ 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
+ or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
+
+ Returns:
+ The scaled tensor. Same shape as the input tensor.
+
+ References:
+ http://cs.unc.edu/~wliu/papers/parsenet.pdf
+ '''
+
+ def __init__(self, gamma_init=20, **kwargs):
+ if K.image_dim_ordering() == 'tf':
+ self.axis = 3
+ else:
+ self.axis = 1
+ self.gamma_init = gamma_init
+ super(L2Normalization, self).__init__(**kwargs)
+
+ def build(self, input_shape):
+ self.input_spec = [InputSpec(shape=input_shape)]
+ gamma = self.gamma_init * np.ones((input_shape[self.axis],))
+ self.gamma = K.variable(gamma, name='{}_gamma'.format(self.name))
+ self.trainable_weights = [self.gamma]
+ super(L2Normalization, self).build(input_shape)
+
+ def call(self, x, mask=None):
+ output = K.l2_normalize(x, self.axis)
+ return output * self.gamma
+
+ def get_config(self):
+ config = {
+ 'gamma_init': self.gamma_init
+ }
+ base_config = super(L2Normalization, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
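+
+
+# A minimal usage sketch (illustrative only; it assumes the legacy Keras API and the
+# TensorFlow channel-last dimension ordering this file is written against). The layer
+# rescales each channel of a feature map by a learnable per-channel factor initialised
+# to `gamma_init`, as SSD does for its `conv4_3` feature map. The 38x38x512 shape below
+# is just an example value.
+if __name__ == '__main__':
+    from keras.layers import Input
+    from keras.models import Model
+
+    feature_map = Input(shape=(38, 38, 512))   # e.g. the conv4_3 feature map of SSD300
+    normalized = L2Normalization(gamma_init=20, name='conv4_3_norm')(feature_map)
+    model = Model(inputs=feature_map, outputs=normalized)
+    model.summary()                            # the output shape equals the input shape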
diff --git a/engine/object_detection_branch/single_shot_detector/keras_loss_function/__init__.py b/engine/object_detection_branch/single_shot_detector/keras_loss_function/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/keras_loss_function/keras_ssd_loss.py b/engine/object_detection_branch/single_shot_detector/keras_loss_function/keras_ssd_loss.py
new file mode 100644
index 0000000..83567f5
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/keras_loss_function/keras_ssd_loss.py
@@ -0,0 +1,211 @@
+'''
+The Keras-compatible loss function for the SSD model. Currently supports TensorFlow only.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import tensorflow as tf
+
+class SSDLoss:
+ '''
+ The SSD loss, see https://arxiv.org/abs/1512.02325.
+ '''
+
+ def __init__(self,
+ neg_pos_ratio=3,
+ n_neg_min=0,
+ alpha=1.0):
+ '''
+ Arguments:
+ neg_pos_ratio (int, optional): The maximum ratio of negative (i.e. background)
+ to positive ground truth boxes to include in the loss computation.
+ There are no actual background ground truth boxes of course, but `y_true`
+ contains anchor boxes labeled with the background class. Since
+ the number of background boxes in `y_true` will usually exceed
+ the number of positive boxes by far, it is necessary to balance
+ their influence on the loss. Defaults to 3 following the paper.
+ n_neg_min (int, optional): The minimum number of negative ground truth boxes to
+ enter the loss computation *per batch*. This argument can be used to make
+ sure that the model learns from a minimum number of negatives in batches
+ in which there are very few, or even none at all, positive ground truth
+ boxes. It defaults to 0 and if used, it should be set to a value that
+ stands in reasonable proportion to the batch size used for training.
+ alpha (float, optional): A factor to weight the localization loss in the
+ computation of the total loss. Defaults to 1.0 following the paper.
+ '''
+ self.neg_pos_ratio = neg_pos_ratio
+ self.n_neg_min = n_neg_min
+ self.alpha = alpha
+
+ def smooth_L1_loss(self, y_true, y_pred):
+ '''
+ Compute smooth L1 loss, see references.
+
+ Arguments:
+ y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data.
+ In this context, the expected tensor has shape `(batch_size, #boxes, 4)` and
+ contains the ground truth bounding box coordinates, where the last dimension
+ contains `(xmin, xmax, ymin, ymax)`.
+ y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing
+ the predicted data, in this context the predicted bounding box coordinates.
+
+ Returns:
+            The smooth L1 loss, a TensorFlow tensor of rank n-1. In this context, a 2D tensor
+ of shape (batch, n_boxes_total).
+
+ References:
+ https://arxiv.org/abs/1504.08083
+ '''
+ absolute_loss = tf.abs(y_true - y_pred)
+ square_loss = 0.5 * (y_true - y_pred)**2
+ l1_loss = tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5)
+ return tf.reduce_sum(l1_loss, axis=-1)
+
+ def log_loss(self, y_true, y_pred):
+ '''
+ Compute the softmax log loss.
+
+ Arguments:
+ y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data.
+ In this context, the expected tensor has shape (batch_size, #boxes, #classes)
+ and contains the ground truth bounding box categories.
+ y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing
+ the predicted data, in this context the predicted bounding box categories.
+
+ Returns:
+            The softmax log loss, a TensorFlow tensor of rank n-1. In this context, a 2D tensor
+ of shape (batch, n_boxes_total).
+ '''
+ # Make sure that `y_pred` doesn't contain any zeros (which would break the log function)
+ y_pred = tf.maximum(y_pred, 1e-15)
+ # Compute the log loss
+ log_loss = -tf.reduce_sum(y_true * tf.log(y_pred), axis=-1)
+ return log_loss
+
+ def compute_loss(self, y_true, y_pred):
+ '''
+ Compute the loss of the SSD model prediction against the ground truth.
+
+ Arguments:
+ y_true (array): A Numpy array of shape `(batch_size, #boxes, #classes + 12)`,
+ where `#boxes` is the total number of boxes that the model predicts
+ per image. Be careful to make sure that the index of each given
+ box in `y_true` is the same as the index for the corresponding
+ box in `y_pred`. The last axis must have length `#classes + 12` and contain
+ `[classes one-hot encoded, 4 ground truth box coordinate offsets, 8 arbitrary entries]`
+ in this order, including the background class. The last eight entries of the
+ last axis are not used by this function and therefore their contents are
+ irrelevant, they only exist so that `y_true` has the same shape as `y_pred`,
+ where the last four entries of the last axis contain the anchor box
+ coordinates, which are needed during inference. Important: Boxes that
+ you want the cost function to ignore need to have a one-hot
+ class vector of all zeros.
+ y_pred (Keras tensor): The model prediction. The shape is identical
+ to that of `y_true`, i.e. `(batch_size, #boxes, #classes + 12)`.
+ The last axis must contain entries in the format
+ `[classes one-hot encoded, 4 predicted box coordinate offsets, 8 arbitrary entries]`.
+
+ Returns:
+ A scalar, the total multitask loss for classification and localization.
+ '''
+ self.neg_pos_ratio = tf.constant(self.neg_pos_ratio)
+ self.n_neg_min = tf.constant(self.n_neg_min)
+ self.alpha = tf.constant(self.alpha)
+
+ batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32
+ n_boxes = tf.shape(y_pred)[1] # Output dtype: tf.int32, note that `n_boxes` in this context denotes the total number of boxes per image, not the number of boxes per cell.
+
+ # 1: Compute the losses for class and box predictions for every box.
+
+ classification_loss = tf.to_float(self.log_loss(y_true[:,:,:-12], y_pred[:,:,:-12])) # Output shape: (batch_size, n_boxes)
+ localization_loss = tf.to_float(self.smooth_L1_loss(y_true[:,:,-12:-8], y_pred[:,:,-12:-8])) # Output shape: (batch_size, n_boxes)
+
+ # 2: Compute the classification losses for the positive and negative targets.
+
+ # Create masks for the positive and negative ground truth classes.
+ negatives = y_true[:,:,0] # Tensor of shape (batch_size, n_boxes)
+ positives = tf.to_float(tf.reduce_max(y_true[:,:,1:-12], axis=-1)) # Tensor of shape (batch_size, n_boxes)
+
+ # Count the number of positive boxes (classes 1 to n) in y_true across the whole batch.
+ n_positive = tf.reduce_sum(positives)
+
+ # Now mask all negative boxes and sum up the losses for the positive boxes PER batch item
+ # (Keras loss functions must output one scalar loss value PER batch item, rather than just
+ # one scalar for the entire batch, that's why we're not summing across all axes).
+ pos_class_loss = tf.reduce_sum(classification_loss * positives, axis=-1) # Tensor of shape (batch_size,)
+
+ # Compute the classification loss for the negative default boxes (if there are any).
+
+ # First, compute the classification loss for all negative boxes.
+ neg_class_loss_all = classification_loss * negatives # Tensor of shape (batch_size, n_boxes)
+ n_neg_losses = tf.count_nonzero(neg_class_loss_all, dtype=tf.int32) # The number of non-zero loss entries in `neg_class_loss_all`
+ # What's the point of `n_neg_losses`? For the next step, which will be to compute which negative boxes enter the classification
+ # loss, we don't just want to know how many negative ground truth boxes there are, but for how many of those there actually is
+        # a positive (i.e. non-zero) loss. This is necessary because `tf.nn.top_k()` in the function below will pick the top k boxes with
+        # the highest losses no matter what, even if it receives a vector where all losses are zero. In the unlikely event that all negative
+        # classification losses ARE actually zero though, this behavior might lead to `tf.nn.top_k()` returning the indices of positive
+        # boxes, leading to an incorrect negative classification loss computation, and hence an incorrect overall loss computation.
+        # We therefore need to make sure that `n_negative_keep`, which assumes the role of the `k` argument in `tf.nn.top_k()`,
+ # is at most the number of negative boxes for which there is a positive classification loss.
+
+ # Compute the number of negative examples we want to account for in the loss.
+        # We'll keep at most `self.neg_pos_ratio` times the number of positives in `y_true`, but at least `self.n_neg_min` (unless `n_neg_losses` is smaller).
+ n_negative_keep = tf.minimum(tf.maximum(self.neg_pos_ratio * tf.to_int32(n_positive), self.n_neg_min), n_neg_losses)
+
+ # In the unlikely case when either (1) there are no negative ground truth boxes at all
+ # or (2) the classification loss for all negative boxes is zero, return zero as the `neg_class_loss`.
+ def f1():
+ return tf.zeros([batch_size])
+ # Otherwise compute the negative loss.
+ def f2():
+ # Now we'll identify the top-k (where k == `n_negative_keep`) boxes with the highest confidence loss that
+ # belong to the background class in the ground truth data. Note that this doesn't necessarily mean that the model
+ # predicted the wrong class for those boxes, it just means that the loss for those boxes is the highest.
+
+ # To do this, we reshape `neg_class_loss_all` to 1D...
+ neg_class_loss_all_1D = tf.reshape(neg_class_loss_all, [-1]) # Tensor of shape (batch_size * n_boxes,)
+ # ...and then we get the indices for the `n_negative_keep` boxes with the highest loss out of those...
+ values, indices = tf.nn.top_k(neg_class_loss_all_1D,
+ k=n_negative_keep,
+ sorted=False) # We don't need them sorted.
+ # ...and with these indices we'll create a mask...
+ negatives_keep = tf.scatter_nd(indices=tf.expand_dims(indices, axis=1),
+ updates=tf.ones_like(indices, dtype=tf.int32),
+ shape=tf.shape(neg_class_loss_all_1D)) # Tensor of shape (batch_size * n_boxes,)
+ negatives_keep = tf.to_float(tf.reshape(negatives_keep, [batch_size, n_boxes])) # Tensor of shape (batch_size, n_boxes)
+ # ...and use it to keep only those boxes and mask all other classification losses
+ neg_class_loss = tf.reduce_sum(classification_loss * negatives_keep, axis=-1) # Tensor of shape (batch_size,)
+ return neg_class_loss
+
+ neg_class_loss = tf.cond(tf.equal(n_neg_losses, tf.constant(0)), f1, f2)
+
+ class_loss = pos_class_loss + neg_class_loss # Tensor of shape (batch_size,)
+
+ # 3: Compute the localization loss for the positive targets.
+ # We don't compute a localization loss for negative predicted boxes (obviously: there are no ground truth boxes they would correspond to).
+
+ loc_loss = tf.reduce_sum(localization_loss * positives, axis=-1) # Tensor of shape (batch_size,)
+
+ # 4: Compute the total loss.
+
+ total_loss = (class_loss + self.alpha * loc_loss) / tf.maximum(1.0, n_positive) # In case `n_positive == 0`
+        # Keras divides the loss by the batch size, which is not what we want in our case
+ # because the relevant criterion to average our loss over is the number of positive boxes in the batch
+ # (by which we're dividing in the line above), not the batch size. So in order to revert Keras' averaging
+ # over the batch size, we'll have to multiply by it.
+ total_loss = total_loss * tf.to_float(batch_size)
+
+ return total_loss
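+
+
+# A minimal, self-contained sanity check (an illustrative assumption, not part of the
+# original module): it evaluates the loss on random tensors shaped like the SSD
+# prediction tensor, i.e. `(batch_size, n_boxes, n_classes + 12)`. In a real training
+# script the loss would typically be wired up via
+# `model.compile(optimizer=..., loss=SSDLoss().compute_loss)` on a model built in
+# 'training' mode; all sizes below are toy values.
+if __name__ == '__main__':
+    import numpy as np
+
+    batch_size, n_boxes, n_classes = 2, 8, 4     # n_classes includes the background class
+    y_true = np.zeros((batch_size, n_boxes, n_classes + 12), dtype=np.float32)
+    y_true[:, :, 0] = 1.0                        # label every box as background...
+    y_true[:, 0, 0] = 0.0
+    y_true[:, 0, 1] = 1.0                        # ...except box 0, which is a positive of class 1
+    y_pred = np.random.uniform(0.0, 1.0, size=y_true.shape).astype(np.float32)
+
+    ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)
+    loss = ssd_loss.compute_loss(tf.constant(y_true), tf.constant(y_pred))
+    with tf.Session() as sess:
+        print('Loss per batch item:', sess.run(loss))   # a vector of shape (batch_size,)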
diff --git a/engine/object_detection_branch/single_shot_detector/misc_utils/__init__.py b/engine/object_detection_branch/single_shot_detector/misc_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/misc_utils/tensor_sampling_utils.py b/engine/object_detection_branch/single_shot_detector/misc_utils/tensor_sampling_utils.py
new file mode 100644
index 0000000..a27ce1d
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/misc_utils/tensor_sampling_utils.py
@@ -0,0 +1,177 @@
+'''
+Utilities that are useful to sub- or up-sample weights tensors.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import numpy as np
+
+def sample_tensors(weights_list, sampling_instructions, axes=None, init=None, mean=0.0, stddev=0.005):
+ '''
+ Can sub-sample and/or up-sample individual dimensions of the tensors in the given list
+ of input tensors.
+
+ It is possible to sub-sample some dimensions and up-sample other dimensions at the same time.
+
+ The tensors in the list will be sampled consistently, i.e. for any given dimension that
+ corresponds among all tensors in the list, the same elements will be picked for every tensor
+ along that dimension.
+
+ For dimensions that are being sub-sampled, you can either provide a list of the indices
+ that should be picked, or you can provide the number of elements to be sub-sampled, in which
+ case the elements will be chosen at random.
+
+    For dimensions that are being up-sampled, "filler" elements will be inserted at random
+ positions along the respective dimension. These filler elements will be initialized either
+ with zero or from a normal distribution with selectable mean and standard deviation.
+
+ Arguments:
+ weights_list (list): A list of Numpy arrays. Each array represents one of the tensors
+ to be sampled. The tensor with the greatest number of dimensions must be the first
+ element in the list. For example, in the case of the weights of a 2D convolutional
+ layer, the kernel must be the first element in the list and the bias the second,
+ not the other way around. For all tensors in the list after the first tensor, the
+            lengths of each of their axes must be identical to the length of some axis of the
+ first tensor.
+ sampling_instructions (list): A list that contains the sampling instructions for each
+ dimension of the first tensor. If the first tensor has `n` dimensions, then this
+ must be a list of length `n`. That means, sampling instructions for every dimension
+ of the first tensor must still be given even if not all dimensions should be changed.
+ The elements of this list can be either lists of integers or integers. If the sampling
+ instruction for a given dimension is a list of integers, then these integers represent
+ the indices of the elements of that dimension that will be sub-sampled. If the sampling
+ instruction for a given dimension is an integer, then that number of elements will be
+ sampled along said dimension. If the integer is greater than the number of elements
+ of the input tensors in that dimension, that dimension will be up-sampled. If the integer
+ is smaller than the number of elements of the input tensors in that dimension, that
+ dimension will be sub-sampled. If the integer is equal to the number of elements
+ of the input tensors in that dimension, that dimension will remain the same.
+ axes (list, optional): Only relevant if `weights_list` contains more than one tensor.
+ This list contains a list for each additional tensor in `weights_list` beyond the first.
+ Each of these lists contains integers that determine to which axes of the first tensor
+ the axes of the respective tensor correspond. For example, let the first tensor be a
+ 4D tensor and the second tensor in the list be a 2D tensor. If the first element of
+            `axes` is the list `[2,3]`, then that means that the two axes of the second tensor
+ correspond to the last two axes of the first tensor, in the same order. The point of
+ this list is for the program to know, if a given dimension of the first tensor is to
+ be sub- or up-sampled, which dimensions of the other tensors in the list must be
+ sub- or up-sampled accordingly.
+ init (list, optional): Only relevant for up-sampling. Must be `None` or a list of strings
+ that determines for each tensor in `weights_list` how the newly inserted values should
+ be initialized. The possible values are 'gaussian' for initialization from a normal
+ distribution with the selected mean and standard deviation (see the following two arguments),
+ or 'zeros' for zero-initialization. If `None`, all initializations default to
+ 'gaussian'.
+ mean (float, optional): Only relevant for up-sampling. The mean of the values that will
+ be inserted into the tensors at random in the case of up-sampling.
+ stddev (float, optional): Only relevant for up-sampling. The standard deviation of the
+ values that will be inserted into the tensors at random in the case of up-sampling.
+
+ Returns:
+ A list containing the sampled tensors in the same order in which they were given.
+ '''
+
+ first_tensor = weights_list[0]
+
+ if (not isinstance(sampling_instructions, (list, tuple))) or (len(sampling_instructions) != first_tensor.ndim):
+ raise ValueError("The sampling instructions must be a list whose length is the number of dimensions of the first tensor in `weights_list`.")
+
+ if (not init is None) and len(init) != len(weights_list):
+ raise ValueError("`init` must either be `None` or a list of strings that has the same length as `weights_list`.")
+
+ up_sample = [] # Store the dimensions along which we need to up-sample.
+ out_shape = [] # Store the shape of the output tensor here.
+ # Store two stages of the new (sub-sampled and/or up-sampled) weights tensors in the following two lists.
+ subsampled_weights_list = [] # Tensors after sub-sampling, but before up-sampling (if any).
+ upsampled_weights_list = [] # Sub-sampled tensors after up-sampling (if any), i.e. final output tensors.
+
+ # Create the slicing arrays from the sampling instructions.
+ sampling_slices = []
+ for i, sampling_inst in enumerate(sampling_instructions):
+ if isinstance(sampling_inst, (list, tuple)):
+ amax = np.amax(np.array(sampling_inst))
+ if amax >= first_tensor.shape[i]:
+                raise ValueError("The sample instructions for dimension {} contain index {}, which is outside the valid index range of that dimension.".format(i, amax))
+ sampling_slices.append(np.array(sampling_inst))
+ out_shape.append(len(sampling_inst))
+ elif isinstance(sampling_inst, int):
+ out_shape.append(sampling_inst)
+ if sampling_inst == first_tensor.shape[i]:
+ # Nothing to sample here, we're keeping the original number of elements along this axis.
+ sampling_slice = np.arange(sampling_inst)
+ sampling_slices.append(sampling_slice)
+ elif sampling_inst < first_tensor.shape[i]:
+                # We want to SUB-sample this dimension. Randomly pick `sampling_inst` many elements from it.
+ sampling_slice1 = np.array([0]) # We will always sample class 0, the background class.
+ # Sample the rest of the classes.
+ sampling_slice2 = np.sort(np.random.choice(np.arange(1, first_tensor.shape[i]), sampling_inst - 1, replace=False))
+ sampling_slice = np.concatenate([sampling_slice1, sampling_slice2])
+ sampling_slices.append(sampling_slice)
+ else:
+ # We want to UP-sample. Pick all elements from this dimension.
+ sampling_slice = np.arange(first_tensor.shape[i])
+ sampling_slices.append(sampling_slice)
+ up_sample.append(i)
+ else:
+ raise ValueError("Each element of the sampling instructions must be either an integer or a list/tuple of integers, but received `{}`".format(type(sampling_inst)))
+
+ # Process the first tensor.
+ subsampled_first_tensor = np.copy(first_tensor[np.ix_(*sampling_slices)])
+ subsampled_weights_list.append(subsampled_first_tensor)
+
+ # Process the other tensors.
+ if len(weights_list) > 1:
+ for j in range(1, len(weights_list)):
+ this_sampling_slices = [sampling_slices[i] for i in axes[j-1]] # Get the sampling slices for this tensor.
+ subsampled_weights_list.append(np.copy(weights_list[j][np.ix_(*this_sampling_slices)]))
+
+ if up_sample:
+ # Take care of the dimensions that are to be up-sampled.
+
+ out_shape = np.array(out_shape)
+
+ # Process the first tensor.
+ if init is None or init[0] == 'gaussian':
+ upsampled_first_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape)
+ elif init[0] == 'zeros':
+ upsampled_first_tensor = np.zeros(out_shape)
+ else:
+ raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[0]))
+ # Pick the indices of the elements in `upsampled_first_tensor` that should be occupied by `subsampled_first_tensor`.
+ up_sample_slices = [np.arange(k) for k in subsampled_first_tensor.shape]
+ for i in up_sample:
+ # Randomly select across which indices of this dimension to scatter the elements of `new_weights_tensor` in this dimension.
+ up_sample_slice1 = np.array([0])
+ up_sample_slice2 = np.sort(np.random.choice(np.arange(1, upsampled_first_tensor.shape[i]), subsampled_first_tensor.shape[i] - 1, replace=False))
+ up_sample_slices[i] = np.concatenate([up_sample_slice1, up_sample_slice2])
+ upsampled_first_tensor[np.ix_(*up_sample_slices)] = subsampled_first_tensor
+ upsampled_weights_list.append(upsampled_first_tensor)
+
+ # Process the other tensors
+ if len(weights_list) > 1:
+ for j in range(1, len(weights_list)):
+ if init is None or init[j] == 'gaussian':
+ upsampled_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape[axes[j-1]])
+ elif init[j] == 'zeros':
+ upsampled_tensor = np.zeros(out_shape[axes[j-1]])
+ else:
+ raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[j]))
+ this_up_sample_slices = [up_sample_slices[i] for i in axes[j-1]] # Get the up-sampling slices for this tensor.
+ upsampled_tensor[np.ix_(*this_up_sample_slices)] = subsampled_weights_list[j]
+ upsampled_weights_list.append(upsampled_tensor)
+
+ return upsampled_weights_list
+ else:
+ return subsampled_weights_list
diff --git a/engine/object_detection_branch/single_shot_detector/models/__init__.py b/engine/object_detection_branch/single_shot_detector/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/models/keras_ssd300.py b/engine/object_detection_branch/single_shot_detector/models/keras_ssd300.py
new file mode 100644
index 0000000..a63f23f
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/models/keras_ssd300.py
@@ -0,0 +1,461 @@
+'''
+A Keras port of the original Caffe SSD300 network.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+
+import numpy as np
+
+import keras.backend as K
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_DecodeDetections import DecodeDetections
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_L2Normalization import L2Normalization
+from keras.layers import Input, Lambda, Activation, Conv2D, MaxPooling2D, ZeroPadding2D, Reshape, Concatenate
+from keras.models import Model
+from keras.regularizers import l2
+
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_DecodeDetectionsFast import \
+ DecodeDetectionsFast
+
+
+def ssd_300(image_size,
+ n_classes,
+ mode='training',
+ l2_regularization=0.0005,
+ min_scale=None,
+ max_scale=None,
+ scales=None,
+ aspect_ratios_global=None,
+ aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
+ [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
+ [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
+ [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
+ [1.0, 2.0, 0.5],
+ [1.0, 2.0, 0.5]],
+ two_boxes_for_ar1=True,
+ steps=[8, 16, 32, 64, 100, 300],
+ offsets=None,
+ clip_boxes=False,
+ variances=[0.1, 0.1, 0.2, 0.2],
+ coords='centroids',
+ normalize_coords=True,
+ subtract_mean=[123, 117, 104],
+ divide_by_stddev=None,
+ swap_channels=[2, 1, 0],
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ nms_max_output_size=400,
+ return_predictor_sizes=False):
+ '''
+ Build a Keras model with SSD300 architecture, see references.
+
+ The base network is a reduced atrous VGG-16, extended by the SSD architecture,
+ as described in the paper.
+
+ Most of the arguments that this function takes are only needed for the anchor
+ box layers. In case you're training the network, the parameters passed here must
+ be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading
+ trained weights, the parameters passed here must be the same as the ones used
+ to produce the trained weights.
+
+ Some of these arguments are explained in more detail in the documentation of the
+ `SSDBoxEncoder` class.
+
+ Note: Requires Keras v2.0 or later. Currently works only with the
+ TensorFlow backend (v1.0 or later).
+
+ Arguments:
+ image_size (tuple): The input image size in the format `(height, width, channels)`.
+ n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
+ mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
+ the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
+ the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
+            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
+ 'inference' follows the exact procedure of the original Caffe implementation, while
+ 'inference_fast' uses a faster prediction decoding procedure.
+ l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers.
+ Set to zero to deactivate L2-regularization.
+ min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images.
+ max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images. All scaling factors between the smallest and the
+ largest will be linearly interpolated. Note that the second to last of the linearly interpolated
+ scaling factors will actually be the scaling factor for the last predictor layer, while the last
+ scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
+ if `two_boxes_for_ar1` is `True`.
+ scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
+ This list must be one element longer than the number of predictor layers. The first `k` elements are the
+ scaling factors for the `k` predictor layers, while the last element is used for the second box
+ for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
+ last scaling factor must be passed either way, even if it is not being used. If a list is passed,
+ this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
+ aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
+ generated. This list is valid for all prediction layers.
+ aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
+ This allows you to set the aspect ratios for each predictor layer individually, which is the case for the
+ original SSD300 implementation. If a list is passed, it overrides `aspect_ratios_global`.
+ two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
+ If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
+ using the scaling factor for the respective layer, the second one will be generated using
+            the geometric mean of said scaling factor and the next bigger scaling factor.
+ steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
+ pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
+ the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
+ If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
+ If no steps are provided, then they will be computed such that the anchor box center points will form an
+ equidistant grid within the image dimensions.
+ offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either floats or tuples of two floats. These numbers represent for each predictor layer how many
+            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
+ as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
+ of the step size specified in the `steps` argument. If the list contains floats, then that value will
+ be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
+ `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
+ clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
+ variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
+ its respective variance value.
+ coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
+ of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
+ and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
+ i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
+ subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
+ of any shape that is broadcast-compatible with the image shape. The elements of this array will be
+ subtracted from the image pixel intensity values. For example, pass a list of three integers
+ to perform per-channel mean normalization for color images.
+ divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
+ floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
+ intensity values will be divided by the elements of this array. For example, pass a list
+ of three integers to perform per-channel standard deviation normalization for color images.
+ swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
+ image channels should be swapped.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box's confidence score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
+ return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
+ a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
+ you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
+ to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the
+            spatial dimensions of the predictor layers); for inference you don't need them.
+
+ Returns:
+ model: The Keras SSD300 model.
+ predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
+ of the output tensor shape for each convolutional predictor layer. During
+ training, the generator function needs this in order to transform
+ the ground truth labels into tensors of identical structure as the
+ output tensors of the model, which is in turn needed for the cost
+ function.
+
+ References:
+ https://arxiv.org/abs/1512.02325v5
+ '''
+
+ n_predictor_layers = 6 # The number of predictor conv layers in the network is 6 for the original SSD300.
+ n_classes += 1 # Account for the background class.
+ l2_reg = l2_regularization # Make the internal name shorter.
+ img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
+
+ ############################################################################
+ # Get a few exceptions out of the way.
+ ############################################################################
+
+ if aspect_ratios_global is None and aspect_ratios_per_layer is None:
+ raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
+ if aspect_ratios_per_layer:
+ if len(aspect_ratios_per_layer) != n_predictor_layers:
+ raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+ if scales:
+ if len(scales) != n_predictor_layers+1:
+ raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(n_predictor_layers+1, len(scales)))
+ else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
+ scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
+
+ if len(variances) != 4:
+        raise ValueError("4 variance values must be passed, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+ if (not (steps is None)) and (len(steps) != n_predictor_layers):
+        raise ValueError("You must provide exactly one step value per predictor layer.")
+
+ if (not (offsets is None)) and (len(offsets) != n_predictor_layers):
+        raise ValueError("You must provide exactly one offset value per predictor layer.")
+
+ ############################################################################
+ # Compute the anchor box parameters.
+ ############################################################################
+
+ # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
+ if aspect_ratios_per_layer:
+ aspect_ratios = aspect_ratios_per_layer
+ else:
+ aspect_ratios = [aspect_ratios_global] * n_predictor_layers
+
+ # Compute the number of boxes to be predicted per cell for each predictor layer.
+ # We need this so that we know how many channels the predictor layers need to have.
+ if aspect_ratios_per_layer:
+ n_boxes = []
+ for ar in aspect_ratios_per_layer:
+ if (1 in ar) & two_boxes_for_ar1:
+ n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1
+ else:
+ n_boxes.append(len(ar))
+ else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
+ if (1 in aspect_ratios_global) & two_boxes_for_ar1:
+ n_boxes = len(aspect_ratios_global) + 1
+ else:
+ n_boxes = len(aspect_ratios_global)
+ n_boxes = [n_boxes] * n_predictor_layers
+
+ if steps is None:
+ steps = [None] * n_predictor_layers
+ if offsets is None:
+ offsets = [None] * n_predictor_layers
+
+ ############################################################################
+ # Define functions for the Lambda layers below.
+ ############################################################################
+
+ def identity_layer(tensor):
+ return tensor
+
+ def input_mean_normalization(tensor):
+ return tensor - np.array(subtract_mean)
+
+ def input_stddev_normalization(tensor):
+ return tensor / np.array(divide_by_stddev)
+
+ def input_channel_swap(tensor):
+ if len(swap_channels) == 3:
+ return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1)
+ elif len(swap_channels) == 4:
+ return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1)
+
+ ############################################################################
+ # Build the network.
+ ############################################################################
+
+ x = Input(shape=(img_height, img_width, img_channels))
+
+ # The following identity layer is only needed so that the subsequent lambda layers can be optional.
+ x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x)
+ if not (subtract_mean is None):
+ x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1)
+ if not (divide_by_stddev is None):
+ x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1)
+ if swap_channels:
+ x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1)
+
+ conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_1')(x1)
+ conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_2')(conv1_1)
+ pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(conv1_2)
+
+ conv2_1 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_1')(pool1)
+ conv2_2 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_2')(conv2_1)
+ pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(conv2_2)
+
+ conv3_1 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_1')(pool2)
+ conv3_2 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_2')(conv3_1)
+ conv3_3 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_3')(conv3_2)
+ pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(conv3_3)
+
+ conv4_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_1')(pool3)
+ conv4_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_2')(conv4_1)
+ conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3')(conv4_2)
+ pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(conv4_3)
+
+ conv5_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_1')(pool4)
+ conv5_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_2')(conv5_1)
+ conv5_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_3')(conv5_2)
+ pool5 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same', name='pool5')(conv5_3)
+
+ fc6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc6')(pool5)
+
+ fc7 = Conv2D(1024, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7')(fc6)
+
+ conv6_1 = Conv2D(256, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_1')(fc7)
+ conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(conv6_1)
+ conv6_2 = Conv2D(512, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2')(conv6_1)
+
+ conv7_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_1')(conv6_2)
+ conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(conv7_1)
+ conv7_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2')(conv7_1)
+
+ conv8_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_1')(conv7_2)
+ conv8_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2')(conv8_1)
+
+ conv9_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_1')(conv8_2)
+ conv9_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2')(conv9_1)
+
+ # Feed conv4_3 into the L2 normalization layer
+ conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)
+
+ ### Build the convolutional predictor layers on top of the base network
+
+    # We predict `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
+ # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
+ conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm)
+ fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(fc7)
+ conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(conv6_2)
+ conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(conv7_2)
+ conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(conv8_2)
+ conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(conv9_2)
+ # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
+ # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
+ conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm)
+ fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7)
+ conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2)
+ conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2)
+ conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2)
+ conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2)
+
+ ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)
+
+ # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
+ conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
+ fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='fc7_mbox_priorbox')(fc7_mbox_loc)
+ conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
+ conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
+ conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5], aspect_ratios=aspect_ratios[4],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4], this_offsets=offsets[4], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
+ conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6], aspect_ratios=aspect_ratios[5],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[5], this_offsets=offsets[5], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)
+
+ ### Reshape
+
+ # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
+ # We want the classes isolated in the last axis to perform softmax on them
+ conv4_3_norm_mbox_conf_reshape = Reshape((-1, n_classes), name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf)
+ fc7_mbox_conf_reshape = Reshape((-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
+ conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
+ conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
+ conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
+ conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)
+ # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
+ # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
+ conv4_3_norm_mbox_loc_reshape = Reshape((-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
+ fc7_mbox_loc_reshape = Reshape((-1, 4), name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
+ conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
+ conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
+ conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
+ conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)
+ # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
+ conv4_3_norm_mbox_priorbox_reshape = Reshape((-1, 8), name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox)
+ fc7_mbox_priorbox_reshape = Reshape((-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
+ conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
+ conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
+ conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
+ conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)
+
+ ### Concatenate the predictions from the different layers
+
+ # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
+ # so we want to concatenate along axis 1, the number of boxes per layer
+ # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
+ mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape,
+ fc7_mbox_conf_reshape,
+ conv6_2_mbox_conf_reshape,
+ conv7_2_mbox_conf_reshape,
+ conv8_2_mbox_conf_reshape,
+ conv9_2_mbox_conf_reshape])
+
+ # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
+ mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape,
+ fc7_mbox_loc_reshape,
+ conv6_2_mbox_loc_reshape,
+ conv7_2_mbox_loc_reshape,
+ conv8_2_mbox_loc_reshape,
+ conv9_2_mbox_loc_reshape])
+
+ # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
+ mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape,
+ fc7_mbox_priorbox_reshape,
+ conv6_2_mbox_priorbox_reshape,
+ conv7_2_mbox_priorbox_reshape,
+ conv8_2_mbox_priorbox_reshape,
+ conv9_2_mbox_priorbox_reshape])
+
+ # The box coordinate predictions will go into the loss function just the way they are,
+ # but for the class predictions, we'll apply a softmax activation layer first
+ mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf)
+
+ # Concatenate the class and box predictions and the anchors to one large predictions vector
+ # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
+ predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox])
+
+ if mode == 'training':
+ model = Model(inputs=x, outputs=predictions)
+ elif mode == 'inference':
+ decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
+ iou_threshold=iou_threshold,
+ top_k=top_k,
+ nms_max_output_size=nms_max_output_size,
+ coords=coords,
+ normalize_coords=normalize_coords,
+ img_height=img_height,
+ img_width=img_width,
+ name='decoded_predictions')(predictions)
+ model = Model(inputs=x, outputs=decoded_predictions)
+ elif mode == 'inference_fast':
+ decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh,
+ iou_threshold=iou_threshold,
+ top_k=top_k,
+ nms_max_output_size=nms_max_output_size,
+ coords=coords,
+ normalize_coords=normalize_coords,
+ img_height=img_height,
+ img_width=img_width,
+ name='decoded_predictions')(predictions)
+ model = Model(inputs=x, outputs=decoded_predictions)
+ else:
+ raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode))
+
+ if return_predictor_sizes:
+ predictor_sizes = np.array([conv4_3_norm_mbox_conf._keras_shape[1:3],
+ fc7_mbox_conf._keras_shape[1:3],
+ conv6_2_mbox_conf._keras_shape[1:3],
+ conv7_2_mbox_conf._keras_shape[1:3],
+ conv8_2_mbox_conf._keras_shape[1:3],
+ conv9_2_mbox_conf._keras_shape[1:3]])
+ return model, predictor_sizes
+ else:
+ return model
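+
+
+# A minimal construction sketch (an illustrative assumption; the 20 classes and the
+# scale values below are example settings in the spirit of Pascal VOC, not values
+# prescribed by this file): build an SSD300 in 'inference' mode, whose output per
+# image is a decoded `(top_k, 6)` tensor of `(class_ID, confidence, 4 box coordinates)`
+# rows, as noted in the decoding layers above.
+if __name__ == '__main__':
+    model = ssd_300(image_size=(300, 300, 3),
+                    n_classes=20,
+                    mode='inference',
+                    scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05],
+                    confidence_thresh=0.5,
+                    top_k=200)
+    model.summary()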
diff --git a/engine/object_detection_branch/single_shot_detector/models/keras_ssd512.py b/engine/object_detection_branch/single_shot_detector/models/keras_ssd512.py
new file mode 100644
index 0000000..3f69ac6
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/models/keras_ssd512.py
@@ -0,0 +1,477 @@
+'''
+A Keras port of the original Caffe SSD512 network.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+from keras.models import Model
+from keras.layers import Input, Lambda, Activation, Conv2D, MaxPooling2D, ZeroPadding2D, Reshape, Concatenate
+from keras.regularizers import l2
+import keras.backend as K
+
+from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
+from keras_layers.keras_layer_L2Normalization import L2Normalization
+from keras_layers.keras_layer_DecodeDetections import DecodeDetections
+from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast
+
+def ssd_512(image_size,
+ n_classes,
+ mode='training',
+ l2_regularization=0.0005,
+ min_scale=None,
+ max_scale=None,
+ scales=None,
+ aspect_ratios_global=None,
+ aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
+ [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
+ [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
+ [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
+ [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
+ [1.0, 2.0, 0.5],
+ [1.0, 2.0, 0.5]],
+ two_boxes_for_ar1=True,
+ steps=[8, 16, 32, 64, 128, 256, 512],
+ offsets=None,
+ clip_boxes=False,
+ variances=[0.1, 0.1, 0.2, 0.2],
+ coords='centroids',
+ normalize_coords=True,
+ subtract_mean=[123, 117, 104],
+ divide_by_stddev=None,
+ swap_channels=[2, 1, 0],
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ nms_max_output_size=400,
+ return_predictor_sizes=False):
+ '''
+    Build a Keras model with the SSD512 architecture; see references.
+
+ The base network is a reduced atrous VGG-16, extended by the SSD architecture,
+ as described in the paper.
+
+ Most of the arguments that this function takes are only needed for the anchor
+ box layers. In case you're training the network, the parameters passed here must
+ be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading
+ trained weights, the parameters passed here must be the same as the ones used
+ to produce the trained weights.
+
+ Some of these arguments are explained in more detail in the documentation of the
+ `SSDBoxEncoder` class.
+
+ Note: Requires Keras v2.0 or later. Currently works only with the
+ TensorFlow backend (v1.0 or later).
+
+ Arguments:
+ image_size (tuple): The input image size in the format `(height, width, channels)`.
+ n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
+ mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
+ the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
+ the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
+            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
+ 'inference' follows the exact procedure of the original Caffe implementation, while
+ 'inference_fast' uses a faster prediction decoding procedure.
+ l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers.
+ Set to zero to deactivate L2-regularization.
+ min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images.
+ max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images. All scaling factors between the smallest and the
+ largest will be linearly interpolated. Note that the second to last of the linearly interpolated
+ scaling factors will actually be the scaling factor for the last predictor layer, while the last
+ scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
+ if `two_boxes_for_ar1` is `True`.
+ scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
+ This list must be one element longer than the number of predictor layers. The first `k` elements are the
+ scaling factors for the `k` predictor layers, while the last element is used for the second box
+ for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
+ last scaling factor must be passed either way, even if it is not being used.
+ If a list is passed, this argument overrides `min_scale` and `max_scale`. All scaling factors
+ must be greater than zero.
+ aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
+            generated. This list is valid for all predictor layers.
+ aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
+ This allows you to set the aspect ratios for each predictor layer individually, which is the case for the
+ original SSD512 implementation. If a list is passed, it overrides `aspect_ratios_global`.
+ two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
+ If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
+ using the scaling factor for the respective layer, the second one will be generated using
+            the geometric mean of said scaling factor and the next bigger scaling factor.
+ steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
+ pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
+ the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
+ If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
+ If no steps are provided, then they will be computed such that the anchor box center points will form an
+ equidistant grid within the image dimensions.
+ offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either floats or tuples of two floats. These numbers represent for each predictor layer how many
+            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
+ as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
+ of the step size specified in the `steps` argument. If the list contains floats, then that value will
+ be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
+ `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
+ clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
+ variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
+ its respective variance value.
+ coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
+ of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
+ and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
+ i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
+ subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
+ of any shape that is broadcast-compatible with the image shape. The elements of this array will be
+ subtracted from the image pixel intensity values. For example, pass a list of three integers
+ to perform per-channel mean normalization for color images.
+ divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
+ floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
+ intensity values will be divided by the elements of this array. For example, pass a list
+ of three integers to perform per-channel standard deviation normalization for color images.
+ swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
+ image channels should be swapped.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box's confidence score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
+ return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
+ a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
+ you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
+ to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the
+            spatial dimensions of the predictor layers); for inference you don't need them.
+
+ Returns:
+ model: The Keras SSD512 model.
+ predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
+ of the output tensor shape for each convolutional predictor layer. During
+ training, the generator function needs this in order to transform
+ the ground truth labels into tensors of identical structure as the
+ output tensors of the model, which is in turn needed for the cost
+ function.
+
+ References:
+ https://arxiv.org/abs/1512.02325v5
+ '''
+
+ n_predictor_layers = 7 # The number of predictor conv layers in the network is 7 for the original SSD512
+ n_classes += 1 # Account for the background class.
+ l2_reg = l2_regularization # Make the internal name shorter.
+ img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
+
+ ############################################################################
+ # Get a few exceptions out of the way.
+ ############################################################################
+
+ if aspect_ratios_global is None and aspect_ratios_per_layer is None:
+ raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
+ if aspect_ratios_per_layer:
+ if len(aspect_ratios_per_layer) != n_predictor_layers:
+ raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+ if scales:
+ if len(scales) != n_predictor_layers+1:
+ raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(n_predictor_layers+1, len(scales)))
+ else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
+ scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
+
+ if len(variances) != 4:
+ raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+    if (steps is not None) and (len(steps) != n_predictor_layers):
+        raise ValueError("You must provide exactly one step value per predictor layer, i.e. {} values, but len(steps) == {}.".format(n_predictor_layers, len(steps)))
+
+    if (offsets is not None) and (len(offsets) != n_predictor_layers):
+        raise ValueError("You must provide exactly one offset value per predictor layer, i.e. {} values, but len(offsets) == {}.".format(n_predictor_layers, len(offsets)))
+
+ ############################################################################
+ # Compute the anchor box parameters.
+ ############################################################################
+
+ # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
+ if aspect_ratios_per_layer:
+ aspect_ratios = aspect_ratios_per_layer
+ else:
+ aspect_ratios = [aspect_ratios_global] * n_predictor_layers
+
+ # Compute the number of boxes to be predicted per cell for each predictor layer.
+ # We need this so that we know how many channels the predictor layers need to have.
+ if aspect_ratios_per_layer:
+ n_boxes = []
+ for ar in aspect_ratios_per_layer:
+            if (1 in ar) and two_boxes_for_ar1:
+ n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1
+ else:
+ n_boxes.append(len(ar))
+ else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
+        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
+ n_boxes = len(aspect_ratios_global) + 1
+ else:
+ n_boxes = len(aspect_ratios_global)
+ n_boxes = [n_boxes] * n_predictor_layers
+
+ if steps is None:
+ steps = [None] * n_predictor_layers
+ if offsets is None:
+ offsets = [None] * n_predictor_layers
+
+ ############################################################################
+ # Define functions for the Lambda layers below.
+ ############################################################################
+
+ def identity_layer(tensor):
+ return tensor
+
+ def input_mean_normalization(tensor):
+ return tensor - np.array(subtract_mean)
+
+ def input_stddev_normalization(tensor):
+ return tensor / np.array(divide_by_stddev)
+
+ def input_channel_swap(tensor):
+ if len(swap_channels) == 3:
+ return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1)
+ elif len(swap_channels) == 4:
+ return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1)
+
+ ############################################################################
+ # Build the network.
+ ############################################################################
+
+ x = Input(shape=(img_height, img_width, img_channels))
+
+ # The following identity layer is only needed so that the subsequent lambda layers can be optional.
+ x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x)
+    if subtract_mean is not None:
+ x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1)
+    if divide_by_stddev is not None:
+ x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1)
+ if swap_channels:
+ x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1)
+
+ conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_1')(x1)
+ conv1_2 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1_2')(conv1_1)
+ pool1 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool1')(conv1_2)
+
+ conv2_1 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_1')(pool1)
+ conv2_2 = Conv2D(128, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2_2')(conv2_1)
+ pool2 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool2')(conv2_2)
+
+ conv3_1 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_1')(pool2)
+ conv3_2 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_2')(conv3_1)
+ conv3_3 = Conv2D(256, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3_3')(conv3_2)
+ pool3 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool3')(conv3_3)
+
+ conv4_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_1')(pool3)
+ conv4_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_2')(conv4_1)
+ conv4_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3')(conv4_2)
+ pool4 = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same', name='pool4')(conv4_3)
+
+ conv5_1 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_1')(pool4)
+ conv5_2 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_2')(conv5_1)
+ conv5_3 = Conv2D(512, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5_3')(conv5_2)
+ pool5 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same', name='pool5')(conv5_3)
+
+ fc6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc6')(pool5)
+
+ fc7 = Conv2D(1024, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7')(fc6)
+
+ conv6_1 = Conv2D(256, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_1')(fc7)
+ conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(conv6_1)
+ conv6_2 = Conv2D(512, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2')(conv6_1)
+
+ conv7_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_1')(conv6_2)
+ conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(conv7_1)
+ conv7_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2')(conv7_1)
+
+ conv8_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_1')(conv7_2)
+ conv8_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv8_padding')(conv8_1)
+ conv8_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2')(conv8_1)
+
+ conv9_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_1')(conv8_2)
+ conv9_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv9_padding')(conv9_1)
+ conv9_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2')(conv9_1)
+
+ conv10_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_1')(conv9_2)
+ conv10_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv10_padding')(conv10_1)
+ conv10_2 = Conv2D(256, (4, 4), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2')(conv10_1)
+
+ # Feed conv4_3 into the L2 normalization layer
+ conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)
+
+ ### Build the convolutional predictor layers on top of the base network
+
+    # We predict `n_classes` confidence values for each box, hence the confidence predictors have depth `n_boxes * n_classes`
+ # Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
+ conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm)
+ fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(fc7)
+ conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(conv6_2)
+ conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(conv7_2)
+ conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(conv8_2)
+ conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(conv9_2)
+ conv10_2_mbox_conf = Conv2D(n_boxes[6] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2_mbox_conf')(conv10_2)
+ # We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
+ # Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
+ conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm)
+ fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7)
+ conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2)
+ conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2)
+ conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2)
+ conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2)
+ conv10_2_mbox_loc = Conv2D(n_boxes[6] * 4, (3, 3), padding='same', kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv10_2_mbox_loc')(conv10_2)
+
+ ### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)
+
+ # Output shape of anchors: `(batch, height, width, n_boxes, 8)`
+ conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
+ fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='fc7_mbox_priorbox')(fc7_mbox_loc)
+ conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
+ conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
+ conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5], aspect_ratios=aspect_ratios[4],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4], this_offsets=offsets[4], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
+ conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6], aspect_ratios=aspect_ratios[5],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[5], this_offsets=offsets[5], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)
+ conv10_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[6], next_scale=scales[7], aspect_ratios=aspect_ratios[6],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[6], this_offsets=offsets[6], clip_boxes=clip_boxes,
+ variances=variances, coords=coords, normalize_coords=normalize_coords, name='conv10_2_mbox_priorbox')(conv10_2_mbox_loc)
+
+ ### Reshape
+
+ # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
+ # We want the classes isolated in the last axis to perform softmax on them
+ conv4_3_norm_mbox_conf_reshape = Reshape((-1, n_classes), name='conv4_3_norm_mbox_conf_reshape')(conv4_3_norm_mbox_conf)
+ fc7_mbox_conf_reshape = Reshape((-1, n_classes), name='fc7_mbox_conf_reshape')(fc7_mbox_conf)
+ conv6_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv6_2_mbox_conf_reshape')(conv6_2_mbox_conf)
+ conv7_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv7_2_mbox_conf_reshape')(conv7_2_mbox_conf)
+ conv8_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv8_2_mbox_conf_reshape')(conv8_2_mbox_conf)
+ conv9_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv9_2_mbox_conf_reshape')(conv9_2_mbox_conf)
+ conv10_2_mbox_conf_reshape = Reshape((-1, n_classes), name='conv10_2_mbox_conf_reshape')(conv10_2_mbox_conf)
+ # Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
+ # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
+ conv4_3_norm_mbox_loc_reshape = Reshape((-1, 4), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
+ fc7_mbox_loc_reshape = Reshape((-1, 4), name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
+ conv6_2_mbox_loc_reshape = Reshape((-1, 4), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
+ conv7_2_mbox_loc_reshape = Reshape((-1, 4), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
+ conv8_2_mbox_loc_reshape = Reshape((-1, 4), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
+ conv9_2_mbox_loc_reshape = Reshape((-1, 4), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)
+ conv10_2_mbox_loc_reshape = Reshape((-1, 4), name='conv10_2_mbox_loc_reshape')(conv10_2_mbox_loc)
+ # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
+ conv4_3_norm_mbox_priorbox_reshape = Reshape((-1, 8), name='conv4_3_norm_mbox_priorbox_reshape')(conv4_3_norm_mbox_priorbox)
+ fc7_mbox_priorbox_reshape = Reshape((-1, 8), name='fc7_mbox_priorbox_reshape')(fc7_mbox_priorbox)
+ conv6_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv6_2_mbox_priorbox_reshape')(conv6_2_mbox_priorbox)
+ conv7_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv7_2_mbox_priorbox_reshape')(conv7_2_mbox_priorbox)
+ conv8_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv8_2_mbox_priorbox_reshape')(conv8_2_mbox_priorbox)
+ conv9_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv9_2_mbox_priorbox_reshape')(conv9_2_mbox_priorbox)
+ conv10_2_mbox_priorbox_reshape = Reshape((-1, 8), name='conv10_2_mbox_priorbox_reshape')(conv10_2_mbox_priorbox)
+
+ ### Concatenate the predictions from the different layers
+
+ # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
+ # so we want to concatenate along axis 1, the number of boxes per layer
+ # Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
+ mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape,
+ fc7_mbox_conf_reshape,
+ conv6_2_mbox_conf_reshape,
+ conv7_2_mbox_conf_reshape,
+ conv8_2_mbox_conf_reshape,
+ conv9_2_mbox_conf_reshape,
+ conv10_2_mbox_conf_reshape])
+
+ # Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
+ mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape,
+ fc7_mbox_loc_reshape,
+ conv6_2_mbox_loc_reshape,
+ conv7_2_mbox_loc_reshape,
+ conv8_2_mbox_loc_reshape,
+ conv9_2_mbox_loc_reshape,
+ conv10_2_mbox_loc_reshape])
+
+ # Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
+ mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape,
+ fc7_mbox_priorbox_reshape,
+ conv6_2_mbox_priorbox_reshape,
+ conv7_2_mbox_priorbox_reshape,
+ conv8_2_mbox_priorbox_reshape,
+ conv9_2_mbox_priorbox_reshape,
+ conv10_2_mbox_priorbox_reshape])
+
+ # The box coordinate predictions will go into the loss function just the way they are,
+ # but for the class predictions, we'll apply a softmax activation layer first
+ mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf)
+
+ # Concatenate the class and box predictions and the anchors to one large predictions vector
+ # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
+ predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox])
+
+ if mode == 'training':
+ model = Model(inputs=x, outputs=predictions)
+ elif mode == 'inference':
+ decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
+ iou_threshold=iou_threshold,
+ top_k=top_k,
+ nms_max_output_size=nms_max_output_size,
+ coords=coords,
+ normalize_coords=normalize_coords,
+ img_height=img_height,
+ img_width=img_width,
+ name='decoded_predictions')(predictions)
+ model = Model(inputs=x, outputs=decoded_predictions)
+ elif mode == 'inference_fast':
+ decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh,
+ iou_threshold=iou_threshold,
+ top_k=top_k,
+ nms_max_output_size=nms_max_output_size,
+ coords=coords,
+ normalize_coords=normalize_coords,
+ img_height=img_height,
+ img_width=img_width,
+ name='decoded_predictions')(predictions)
+ model = Model(inputs=x, outputs=decoded_predictions)
+ else:
+ raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode))
+
+ if return_predictor_sizes:
+ predictor_sizes = np.array([conv4_3_norm_mbox_conf._keras_shape[1:3],
+ fc7_mbox_conf._keras_shape[1:3],
+ conv6_2_mbox_conf._keras_shape[1:3],
+ conv7_2_mbox_conf._keras_shape[1:3],
+ conv8_2_mbox_conf._keras_shape[1:3],
+ conv9_2_mbox_conf._keras_shape[1:3],
+ conv10_2_mbox_conf._keras_shape[1:3]])
+ return model, predictor_sizes
+ else:
+ return model
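
> Editor's note: a minimal usage sketch for `ssd_512`, not taken from the repository's own scripts. It assumes the `keras_layers` package is importable from where the script runs; the scale list is a placeholder chosen only to satisfy the "n_predictor_layers + 1 = 8 elements" requirement in the docstring, and the weights path is hypothetical and must match whatever checkpoint is actually loaded.

```python
from engine.object_detection_branch.single_shot_detector.models.keras_ssd512 import ssd_512

# Build the SSD512 graph in 'inference' mode with the per-layer anchor
# configuration left at the defaults defined in the function signature above.
model = ssd_512(image_size=(512, 512, 3),
                n_classes=80,                  # e.g. MS COCO; the background class is added internally
                mode='inference',
                scales=[0.04, 0.1, 0.26, 0.42, 0.58, 0.74, 0.9, 1.06],  # placeholder: 7 predictor layers + 1
                confidence_thresh=0.5,
                iou_threshold=0.45,
                top_k=200,
                normalize_coords=True)

# model.load_weights('path/to/ssd512_weights.h5')  # placeholder path; must match the anchor setup above
```

As the docstring stresses, the anchor-box arguments passed here must be identical to those used when the loaded weights were produced, otherwise the decoded detections will be meaningless.
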
diff --git a/engine/object_detection_branch/single_shot_detector/models/keras_ssd7.py b/engine/object_detection_branch/single_shot_detector/models/keras_ssd7.py
new file mode 100644
index 0000000..5409599
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/models/keras_ssd7.py
@@ -0,0 +1,430 @@
+'''
+A small 7-layer Keras model with SSD architecture. Also serves as a template to build arbitrary network architectures.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+from keras.models import Model
+from keras.layers import Input, Lambda, Conv2D, MaxPooling2D, BatchNormalization, ELU, Reshape, Concatenate, Activation
+from keras.regularizers import l2
+import keras.backend as K
+
+from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
+from keras_layers.keras_layer_DecodeDetections import DecodeDetections
+from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast
+
+def build_model(image_size,
+ n_classes,
+ mode='training',
+ l2_regularization=0.0,
+ min_scale=0.1,
+ max_scale=0.9,
+ scales=None,
+ aspect_ratios_global=[0.5, 1.0, 2.0],
+ aspect_ratios_per_layer=None,
+ two_boxes_for_ar1=True,
+ steps=None,
+ offsets=None,
+ clip_boxes=False,
+ variances=[1.0, 1.0, 1.0, 1.0],
+ coords='centroids',
+ normalize_coords=False,
+ subtract_mean=None,
+ divide_by_stddev=None,
+ swap_channels=False,
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ nms_max_output_size=400,
+ return_predictor_sizes=False):
+ '''
+    Build a Keras model with the SSD architecture; see references.
+
+ The model consists of convolutional feature layers and a number of convolutional
+ predictor layers that take their input from different feature layers.
+ The model is fully convolutional.
+
+ The implementation found here is a smaller version of the original architecture
+ used in the paper (where the base network consists of a modified VGG-16 extended
+ by a few convolutional feature layers), but of course it could easily be changed to
+ an arbitrarily large SSD architecture by following the general design pattern used here.
+ This implementation has 7 convolutional layers and 4 convolutional predictor
+ layers that take their input from layers 4, 5, 6, and 7, respectively.
+
+ Most of the arguments that this function takes are only needed for the anchor
+ box layers. In case you're training the network, the parameters passed here must
+ be the same as the ones used to set up `SSDBoxEncoder`. In case you're loading
+ trained weights, the parameters passed here must be the same as the ones used
+ to produce the trained weights.
+
+ Some of these arguments are explained in more detail in the documentation of the
+ `SSDBoxEncoder` class.
+
+ Note: Requires Keras v2.0 or later. Training currently works only with the
+ TensorFlow backend (v1.0 or later).
+
+ Arguments:
+ image_size (tuple): The input image size in the format `(height, width, channels)`.
+ n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
+ mode (str, optional): One of 'training', 'inference' and 'inference_fast'. In 'training' mode,
+ the model outputs the raw prediction tensor, while in 'inference' and 'inference_fast' modes,
+ the raw predictions are decoded into absolute coordinates and filtered via confidence thresholding,
+            non-maximum suppression, and top-k filtering. The difference between the latter two modes is that
+ 'inference' follows the exact procedure of the original Caffe implementation, while
+ 'inference_fast' uses a faster prediction decoding procedure.
+ l2_regularization (float, optional): The L2-regularization rate. Applies to all convolutional layers.
+ min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images.
+ max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images. All scaling factors between the smallest and the
+ largest will be linearly interpolated. Note that the second to last of the linearly interpolated
+ scaling factors will actually be the scaling factor for the last predictor layer, while the last
+ scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
+ if `two_boxes_for_ar1` is `True`.
+ scales (list, optional): A list of floats containing scaling factors per convolutional predictor layer.
+ This list must be one element longer than the number of predictor layers. The first `k` elements are the
+ scaling factors for the `k` predictor layers, while the last element is used for the second box
+ for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
+ last scaling factor must be passed either way, even if it is not being used. If a list is passed,
+ this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
+ aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
+ generated. This list is valid for all predictor layers. The original implementation uses more aspect ratios
+ for some predictor layers and fewer for others. If you want to do that, too, then use the next argument instead.
+ aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each predictor layer.
+ This allows you to set the aspect ratios for each predictor layer individually. If a list is passed,
+ it overrides `aspect_ratios_global`.
+ two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratio lists that contain 1. Will be ignored otherwise.
+ If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
+ using the scaling factor for the respective layer, the second one will be generated using
+            the geometric mean of said scaling factor and the next bigger scaling factor.
+ steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
+ pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
+ the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
+ If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
+ If no steps are provided, then they will be computed such that the anchor box center points will form an
+ equidistant grid within the image dimensions.
+ offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either floats or tuples of two floats. These numbers represent for each predictor layer how many
+            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
+ as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
+ of the step size specified in the `steps` argument. If the list contains floats, then that value will
+ be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
+ `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size,
+ which is also the recommended setting.
+ clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
+ variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
+ its respective variance value.
+ coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
+ of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
+ and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model is supposed to use relative instead of absolute coordinates,
+ i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates.
+ subtract_mean (array-like, optional): `None` or an array-like object of integers or floating point values
+ of any shape that is broadcast-compatible with the image shape. The elements of this array will be
+ subtracted from the image pixel intensity values. For example, pass a list of three integers
+ to perform per-channel mean normalization for color images.
+ divide_by_stddev (array-like, optional): `None` or an array-like object of non-zero integers or
+ floating point values of any shape that is broadcast-compatible with the image shape. The image pixel
+ intensity values will be divided by the elements of this array. For example, pass a list
+ of three integers to perform per-channel standard deviation normalization for color images.
+ swap_channels (list, optional): Either `False` or a list of integers representing the desired order in which the input
+ image channels should be swapped.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes that have a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box's confidence score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ nms_max_output_size (int, optional): The maximal number of predictions that will be left over after the NMS stage.
+ return_predictor_sizes (bool, optional): If `True`, this function not only returns the model, but also
+ a list containing the spatial dimensions of the predictor layers. This isn't strictly necessary since
+ you can always get their sizes easily via the Keras API, but it's convenient and less error-prone
+ to get them this way. They are only relevant for training anyway (SSDBoxEncoder needs to know the
+            spatial dimensions of the predictor layers); for inference you don't need them.
+
+ Returns:
+ model: The Keras SSD model.
+ predictor_sizes (optional): A Numpy array containing the `(height, width)` portion
+ of the output tensor shape for each convolutional predictor layer. During
+ training, the generator function needs this in order to transform
+ the ground truth labels into tensors of identical structure as the
+ output tensors of the model, which is in turn needed for the cost
+ function.
+
+ References:
+ https://arxiv.org/abs/1512.02325v5
+ '''
+
+ n_predictor_layers = 4 # The number of predictor conv layers in the network
+ n_classes += 1 # Account for the background class.
+ l2_reg = l2_regularization # Make the internal name shorter.
+ img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]
+
+ ############################################################################
+ # Get a few exceptions out of the way.
+ ############################################################################
+
+ if aspect_ratios_global is None and aspect_ratios_per_layer is None:
+ raise ValueError("`aspect_ratios_global` and `aspect_ratios_per_layer` cannot both be None. At least one needs to be specified.")
+ if aspect_ratios_per_layer:
+ if len(aspect_ratios_per_layer) != n_predictor_layers:
+ raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == {}, but len(aspect_ratios_per_layer) == {}.".format(n_predictor_layers, len(aspect_ratios_per_layer)))
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+ if scales:
+ if len(scales) != n_predictor_layers+1:
+ raise ValueError("It must be either scales is None or len(scales) == {}, but len(scales) == {}.".format(n_predictor_layers+1, len(scales)))
+ else: # If no explicit list of scaling factors was passed, compute the list of scaling factors from `min_scale` and `max_scale`
+ scales = np.linspace(min_scale, max_scale, n_predictor_layers+1)
+
+ if len(variances) != 4: # We need one variance value for each of the four box coordinates
+ raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+    if (steps is not None) and (len(steps) != n_predictor_layers):
+        raise ValueError("You must provide exactly one step value per predictor layer, i.e. {} values, but len(steps) == {}.".format(n_predictor_layers, len(steps)))
+
+    if (offsets is not None) and (len(offsets) != n_predictor_layers):
+        raise ValueError("You must provide exactly one offset value per predictor layer, i.e. {} values, but len(offsets) == {}.".format(n_predictor_layers, len(offsets)))
+
+ ############################################################################
+ # Compute the anchor box parameters.
+ ############################################################################
+
+ # Set the aspect ratios for each predictor layer. These are only needed for the anchor box layers.
+ if aspect_ratios_per_layer:
+ aspect_ratios = aspect_ratios_per_layer
+ else:
+ aspect_ratios = [aspect_ratios_global] * n_predictor_layers
+
+ # Compute the number of boxes to be predicted per cell for each predictor layer.
+ # We need this so that we know how many channels the predictor layers need to have.
+ if aspect_ratios_per_layer:
+ n_boxes = []
+ for ar in aspect_ratios_per_layer:
+            if (1 in ar) and two_boxes_for_ar1:
+ n_boxes.append(len(ar) + 1) # +1 for the second box for aspect ratio 1
+ else:
+ n_boxes.append(len(ar))
+ else: # If only a global aspect ratio list was passed, then the number of boxes is the same for each predictor layer
+        if (1 in aspect_ratios_global) and two_boxes_for_ar1:
+ n_boxes = len(aspect_ratios_global) + 1
+ else:
+ n_boxes = len(aspect_ratios_global)
+ n_boxes = [n_boxes] * n_predictor_layers
+
+ if steps is None:
+ steps = [None] * n_predictor_layers
+ if offsets is None:
+ offsets = [None] * n_predictor_layers
+
+ ############################################################################
+ # Define functions for the Lambda layers below.
+ ############################################################################
+
+ def identity_layer(tensor):
+ return tensor
+
+ def input_mean_normalization(tensor):
+ return tensor - np.array(subtract_mean)
+
+ def input_stddev_normalization(tensor):
+ return tensor / np.array(divide_by_stddev)
+
+ def input_channel_swap(tensor):
+ if len(swap_channels) == 3:
+ return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]]], axis=-1)
+ elif len(swap_channels) == 4:
+ return K.stack([tensor[...,swap_channels[0]], tensor[...,swap_channels[1]], tensor[...,swap_channels[2]], tensor[...,swap_channels[3]]], axis=-1)
+
+ ############################################################################
+ # Build the network.
+ ############################################################################
+
+ x = Input(shape=(img_height, img_width, img_channels))
+
+ # The following identity layer is only needed so that the subsequent lambda layers can be optional.
+ x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x)
+    if subtract_mean is not None:
+ x1 = Lambda(input_mean_normalization, output_shape=(img_height, img_width, img_channels), name='input_mean_normalization')(x1)
+    if divide_by_stddev is not None:
+ x1 = Lambda(input_stddev_normalization, output_shape=(img_height, img_width, img_channels), name='input_stddev_normalization')(x1)
+ if swap_channels:
+ x1 = Lambda(input_channel_swap, output_shape=(img_height, img_width, img_channels), name='input_channel_swap')(x1)
+
+ conv1 = Conv2D(32, (5, 5), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv1')(x1)
+    conv1 = BatchNormalization(axis=3, momentum=0.99, name='bn1')(conv1) # TensorFlow uses the channels-last data format (batch, height, width, channels), hence axis = 3
+ conv1 = ELU(name='elu1')(conv1)
+ pool1 = MaxPooling2D(pool_size=(2, 2), name='pool1')(conv1)
+
+ conv2 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv2')(pool1)
+ conv2 = BatchNormalization(axis=3, momentum=0.99, name='bn2')(conv2)
+ conv2 = ELU(name='elu2')(conv2)
+ pool2 = MaxPooling2D(pool_size=(2, 2), name='pool2')(conv2)
+
+ conv3 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv3')(pool2)
+ conv3 = BatchNormalization(axis=3, momentum=0.99, name='bn3')(conv3)
+ conv3 = ELU(name='elu3')(conv3)
+ pool3 = MaxPooling2D(pool_size=(2, 2), name='pool3')(conv3)
+
+ conv4 = Conv2D(64, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv4')(pool3)
+ conv4 = BatchNormalization(axis=3, momentum=0.99, name='bn4')(conv4)
+ conv4 = ELU(name='elu4')(conv4)
+ pool4 = MaxPooling2D(pool_size=(2, 2), name='pool4')(conv4)
+
+ conv5 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv5')(pool4)
+ conv5 = BatchNormalization(axis=3, momentum=0.99, name='bn5')(conv5)
+ conv5 = ELU(name='elu5')(conv5)
+ pool5 = MaxPooling2D(pool_size=(2, 2), name='pool5')(conv5)
+
+ conv6 = Conv2D(48, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv6')(pool5)
+ conv6 = BatchNormalization(axis=3, momentum=0.99, name='bn6')(conv6)
+ conv6 = ELU(name='elu6')(conv6)
+ pool6 = MaxPooling2D(pool_size=(2, 2), name='pool6')(conv6)
+
+ conv7 = Conv2D(32, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='conv7')(pool6)
+ conv7 = BatchNormalization(axis=3, momentum=0.99, name='bn7')(conv7)
+ conv7 = ELU(name='elu7')(conv7)
+
+ # The next part is to add the convolutional predictor layers on top of the base network
+ # that we defined above. Note that I use the term "base network" differently than the paper does.
+ # To me, the base network is everything that is not convolutional predictor layers or anchor
+ # box layers. In this case we'll have four predictor layers, but of course you could
+ # easily rewrite this into an arbitrarily deep base network and add an arbitrary number of
+ # predictor layers on top of the base network by simply following the pattern shown here.
+
+ # Build the convolutional predictor layers on top of conv layers 4, 5, 6, and 7.
+ # We build two predictor layers on top of each of these layers: One for class prediction (classification), one for box coordinate prediction (localization)
+    # We predict `n_classes` confidence values for each box, hence the `classes` predictors have depth `n_boxes * n_classes`
+ # We predict 4 box coordinates for each box, hence the `boxes` predictors have depth `n_boxes * 4`
+ # Output shape of `classes`: `(batch, height, width, n_boxes * n_classes)`
+ classes4 = Conv2D(n_boxes[0] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes4')(conv4)
+ classes5 = Conv2D(n_boxes[1] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes5')(conv5)
+ classes6 = Conv2D(n_boxes[2] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes6')(conv6)
+ classes7 = Conv2D(n_boxes[3] * n_classes, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='classes7')(conv7)
+ # Output shape of `boxes`: `(batch, height, width, n_boxes * 4)`
+ boxes4 = Conv2D(n_boxes[0] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes4')(conv4)
+ boxes5 = Conv2D(n_boxes[1] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes5')(conv5)
+ boxes6 = Conv2D(n_boxes[2] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes6')(conv6)
+ boxes7 = Conv2D(n_boxes[3] * 4, (3, 3), strides=(1, 1), padding="same", kernel_initializer='he_normal', kernel_regularizer=l2(l2_reg), name='boxes7')(conv7)
+
+ # Generate the anchor boxes
+ # Output shape of `anchors`: `(batch, height, width, n_boxes, 8)`
+ anchors4 = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], aspect_ratios=aspect_ratios[0],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], this_offsets=offsets[0],
+ clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors4')(boxes4)
+ anchors5 = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], aspect_ratios=aspect_ratios[1],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], this_offsets=offsets[1],
+ clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors5')(boxes5)
+ anchors6 = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], aspect_ratios=aspect_ratios[2],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], this_offsets=offsets[2],
+ clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors6')(boxes6)
+ anchors7 = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], aspect_ratios=aspect_ratios[3],
+ two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], this_offsets=offsets[3],
+ clip_boxes=clip_boxes, variances=variances, coords=coords, normalize_coords=normalize_coords, name='anchors7')(boxes7)
+
+ # Reshape the class predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, n_classes)`
+ # We want the classes isolated in the last axis to perform softmax on them
+ classes4_reshaped = Reshape((-1, n_classes), name='classes4_reshape')(classes4)
+ classes5_reshaped = Reshape((-1, n_classes), name='classes5_reshape')(classes5)
+ classes6_reshaped = Reshape((-1, n_classes), name='classes6_reshape')(classes6)
+ classes7_reshaped = Reshape((-1, n_classes), name='classes7_reshape')(classes7)
+ # Reshape the box coordinate predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 4)`
+ # We want the four box coordinates isolated in the last axis to compute the smooth L1 loss
+ boxes4_reshaped = Reshape((-1, 4), name='boxes4_reshape')(boxes4)
+ boxes5_reshaped = Reshape((-1, 4), name='boxes5_reshape')(boxes5)
+ boxes6_reshaped = Reshape((-1, 4), name='boxes6_reshape')(boxes6)
+ boxes7_reshaped = Reshape((-1, 4), name='boxes7_reshape')(boxes7)
+ # Reshape the anchor box tensors, yielding 3D tensors of shape `(batch, height * width * n_boxes, 8)`
+ anchors4_reshaped = Reshape((-1, 8), name='anchors4_reshape')(anchors4)
+ anchors5_reshaped = Reshape((-1, 8), name='anchors5_reshape')(anchors5)
+ anchors6_reshaped = Reshape((-1, 8), name='anchors6_reshape')(anchors6)
+ anchors7_reshaped = Reshape((-1, 8), name='anchors7_reshape')(anchors7)
+
+    # Concatenate the predictions from the different layers and the associated anchor box tensors
+ # Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
+ # so we want to concatenate along axis 1
+ # Output shape of `classes_concat`: (batch, n_boxes_total, n_classes)
+ classes_concat = Concatenate(axis=1, name='classes_concat')([classes4_reshaped,
+ classes5_reshaped,
+ classes6_reshaped,
+ classes7_reshaped])
+
+ # Output shape of `boxes_concat`: (batch, n_boxes_total, 4)
+ boxes_concat = Concatenate(axis=1, name='boxes_concat')([boxes4_reshaped,
+ boxes5_reshaped,
+ boxes6_reshaped,
+ boxes7_reshaped])
+
+ # Output shape of `anchors_concat`: (batch, n_boxes_total, 8)
+ anchors_concat = Concatenate(axis=1, name='anchors_concat')([anchors4_reshaped,
+ anchors5_reshaped,
+ anchors6_reshaped,
+ anchors7_reshaped])
+
+ # The box coordinate predictions will go into the loss function just the way they are,
+ # but for the class predictions, we'll apply a softmax activation layer first
+ classes_softmax = Activation('softmax', name='classes_softmax')(classes_concat)
+
+ # Concatenate the class and box coordinate predictions and the anchors to one large predictions tensor
+ # Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
+ predictions = Concatenate(axis=2, name='predictions')([classes_softmax, boxes_concat, anchors_concat])
+
+ if mode == 'training':
+ model = Model(inputs=x, outputs=predictions)
+ elif mode == 'inference':
+ decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
+ iou_threshold=iou_threshold,
+ top_k=top_k,
+ nms_max_output_size=nms_max_output_size,
+ coords=coords,
+ normalize_coords=normalize_coords,
+ img_height=img_height,
+ img_width=img_width,
+ name='decoded_predictions')(predictions)
+ model = Model(inputs=x, outputs=decoded_predictions)
+ elif mode == 'inference_fast':
+ decoded_predictions = DecodeDetectionsFast(confidence_thresh=confidence_thresh,
+ iou_threshold=iou_threshold,
+ top_k=top_k,
+ nms_max_output_size=nms_max_output_size,
+ coords=coords,
+ normalize_coords=normalize_coords,
+ img_height=img_height,
+ img_width=img_width,
+ name='decoded_predictions')(predictions)
+ model = Model(inputs=x, outputs=decoded_predictions)
+ else:
+ raise ValueError("`mode` must be one of 'training', 'inference' or 'inference_fast', but received '{}'.".format(mode))
+
+ if return_predictor_sizes:
+ # The spatial dimensions are the same for the `classes` and `boxes` predictor layers.
+ predictor_sizes = np.array([classes4._keras_shape[1:3],
+ classes5._keras_shape[1:3],
+ classes6._keras_shape[1:3],
+ classes7._keras_shape[1:3]])
+ return model, predictor_sizes
+ else:
+ return model
diff --git a/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/__init__.py b/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/matching_utils.py b/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/matching_utils.py
new file mode 100644
index 0000000..f1fcc90
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/matching_utils.py
@@ -0,0 +1,116 @@
+'''
+Utilities to match ground truth boxes to anchor boxes.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+import numpy as np
+
+def match_bipartite_greedy(weight_matrix):
+ '''
+ Returns a bipartite matching according to the given weight matrix.
+
+ The algorithm works as follows:
+
+ Let the first axis of `weight_matrix` represent ground truth boxes
+ and the second axis anchor boxes.
+ The ground truth box that has the greatest similarity with any
+ anchor box will be matched first, then out of the remaining ground
+ truth boxes, the ground truth box that has the greatest similarity
+ with any of the remaining anchor boxes will be matched second, and
+ so on. That is, the ground truth boxes will be matched in descending
+ order by maximum similarity with any of the respectively remaining
+ anchor boxes.
+ The runtime complexity is O(m^2 * n), where `m` is the number of
+ ground truth boxes and `n` is the number of anchor boxes.
+
+ Arguments:
+ weight_matrix (array): A 2D Numpy array that represents the weight matrix
+ for the matching process. If `(m,n)` is the shape of the weight matrix,
+ it must be `m <= n`. The weights can be integers or floating point
+ numbers. The matching process will maximize, i.e. larger weights are
+ preferred over smaller weights.
+
+ Returns:
+ A 1D Numpy array of length `weight_matrix.shape[0]` that represents
+ the matched index along the second axis of `weight_matrix` for each index
+ along the first axis.
+ '''
+
+ weight_matrix = np.copy(weight_matrix) # We'll modify this array.
+ num_ground_truth_boxes = weight_matrix.shape[0]
+ all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below.
+
+ # This 1D array will contain for each ground truth box the index of
+ # the matched anchor box.
+ matches = np.zeros(num_ground_truth_boxes, dtype=np.int)
+
+ # In each iteration of the loop below, exactly one ground truth box
+ # will be matched to one anchor box.
+ for _ in range(num_ground_truth_boxes):
+
+ # Find the maximal anchor-ground truth pair in two steps: First, reduce
+ # over the anchor boxes and then reduce over the ground truth boxes.
+ anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis.
+ overlaps = weight_matrix[all_gt_indices, anchor_indices]
+ ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis.
+ anchor_index = anchor_indices[ground_truth_index]
+ matches[ground_truth_index] = anchor_index # Set the match.
+
+ # Set the row of the matched ground truth box and the column of the matched
+ # anchor box to all zeros. This ensures that those boxes will not be matched again,
+ # because they will never be the best matches for any other boxes.
+ weight_matrix[ground_truth_index] = 0
+ weight_matrix[:,anchor_index] = 0
+
+ return matches
+
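+# A minimal worked example with hypothetical IoU values (rows are ground truth
+# boxes, columns are anchor boxes):
+#
+#     import numpy as np
+#     weights = np.array([[0.1, 0.8, 0.3],
+#                         [0.7, 0.9, 0.2]])
+#     match_bipartite_greedy(weights)  # -> array([2, 1])
+#
+# Ground truth box 1 is matched first to anchor 1 (overlap 0.9); after its row and
+# column are zeroed out, ground truth box 0 falls back to anchor 2 (overlap 0.3).
+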
+def match_multi(weight_matrix, threshold):
+ '''
+ Matches all elements along the second axis of `weight_matrix` to their best
+ matches along the first axis subject to the constraint that the weight of a match
+ must be greater than or equal to `threshold` in order to produce a match.
+
+ If the weight matrix contains elements that should be ignored, the row or column
+    representing the respective element should be set to a value below `threshold`.
+
+ Arguments:
+ weight_matrix (array): A 2D Numpy array that represents the weight matrix
+ for the matching process. If `(m,n)` is the shape of the weight matrix,
+ it must be `m <= n`. The weights can be integers or floating point
+ numbers. The matching process will maximize, i.e. larger weights are
+ preferred over smaller weights.
+ threshold (float): A float that represents the threshold (i.e. lower bound)
+ that must be met by a pair of elements to produce a match.
+
+ Returns:
+ Two 1D Numpy arrays of equal length that represent the matched indices. The first
+ array contains the indices along the first axis of `weight_matrix`, the second array
+ contains the indices along the second axis.
+ '''
+
+ num_anchor_boxes = weight_matrix.shape[1]
+ all_anchor_indices = list(range(num_anchor_boxes)) # Only relevant for fancy-indexing below.
+
+ # Find the best ground truth match for every anchor box.
+ ground_truth_indices = np.argmax(weight_matrix, axis=0) # Array of shape (weight_matrix.shape[1],)
+ overlaps = weight_matrix[ground_truth_indices, all_anchor_indices] # Array of shape (weight_matrix.shape[1],)
+
+ # Filter out the matches with a weight below the threshold.
+ anchor_indices_thresh_met = np.nonzero(overlaps >= threshold)[0]
+ gt_indices_thresh_met = ground_truth_indices[anchor_indices_thresh_met]
+
+ return gt_indices_thresh_met, anchor_indices_thresh_met
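+
+# A minimal worked example with hypothetical IoU values and threshold:
+#
+#     import numpy as np
+#     weights = np.array([[0.1, 0.6, 0.0],
+#                         [0.7, 0.2, 0.4]])
+#     match_multi(weights, threshold=0.5)  # -> (array([1, 0]), array([0, 1]))
+#
+# Anchor 0 is matched to ground truth box 1 (overlap 0.7) and anchor 1 to ground
+# truth box 0 (overlap 0.6); anchor 2's best overlap (0.4) is below the threshold,
+# so it remains unmatched.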
diff --git a/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/ssd_input_encoder.py b/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/ssd_input_encoder.py
new file mode 100644
index 0000000..e09e0d5
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/ssd_input_encoder.py
@@ -0,0 +1,621 @@
+'''
+An encoder that converts ground truth annotations to SSD-compatible training targets.
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+
+import numpy as np
+
+from engine.object_detection_branch.single_shot_detector.ssd_encoder_decoder.matching_utils import match_bipartite_greedy, \
+ match_multi
+
+from engine.object_detection_branch.single_shot_detector.bounding_box_utils.bounding_box_utils import iou, convert_coordinates
+
+
+class SSDInputEncoder:
+ '''
+ Transforms ground truth labels for object detection in images
+ (2D bounding box coordinates and class labels) to the format required for
+ training an SSD model.
+
+    In the process of encoding the ground truth labels, a template of anchor boxes
+    is built; these anchor boxes are subsequently matched to the ground truth boxes
+    via an intersection-over-union threshold criterion.
+ '''
+
+ def __init__(self,
+ img_height,
+ img_width,
+ n_classes,
+ predictor_sizes,
+ min_scale=0.1,
+ max_scale=0.9,
+ scales=None,
+ aspect_ratios_global=[0.5, 1.0, 2.0],
+ aspect_ratios_per_layer=None,
+ two_boxes_for_ar1=True,
+ steps=None,
+ offsets=None,
+ clip_boxes=False,
+ variances=[0.1, 0.1, 0.2, 0.2],
+ matching_type='multi',
+ pos_iou_threshold=0.5,
+ neg_iou_limit=0.3,
+ border_pixels='half',
+ coords='centroids',
+ normalize_coords=True,
+ background_id=0):
+ '''
+ Arguments:
+ img_height (int): The height of the input images.
+ img_width (int): The width of the input images.
+ n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
+ predictor_sizes (list): A list of int-tuples of the format `(height, width)`
+ containing the output heights and widths of the convolutional predictor layers.
+ min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images. Note that you should set the scaling factors
+ such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
+ to detect. Must be >0.
+ max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
+ of the shorter side of the input images. All scaling factors between the smallest and the
+ largest will be linearly interpolated. Note that the second to last of the linearly interpolated
+ scaling factors will actually be the scaling factor for the last predictor layer, while the last
+ scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
+ if `two_boxes_for_ar1` is `True`. Note that you should set the scaling factors
+ such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
+ to detect. Must be greater than or equal to `min_scale`.
+ scales (list, optional): A list of floats >0 containing scaling factors per convolutional predictor layer.
+ This list must be one element longer than the number of predictor layers. The first `k` elements are the
+ scaling factors for the `k` predictor layers, while the last element is used for the second box
+ for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
+ last scaling factor must be passed either way, even if it is not being used. If a list is passed,
+ this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
+ Note that you should set the scaling factors such that the resulting anchor box sizes correspond to
+ the sizes of the objects you are trying to detect.
+ aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
+ generated. This list is valid for all prediction layers. Note that you should set the aspect ratios such
+ that the resulting anchor box shapes roughly correspond to the shapes of the objects you are trying to detect.
+ aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
+ If a list is passed, it overrides `aspect_ratios_global`. Note that you should set the aspect ratios such
+ that the resulting anchor box shapes very roughly correspond to the shapes of the objects you are trying to detect.
+ two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratios lists that contain 1. Will be ignored otherwise.
+ If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
+                using the scaling factor for the respective layer, the second one will be generated using the
+                geometric mean of said scaling factor and the next bigger scaling factor.
+ steps (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either ints/floats or tuples of two ints/floats. These numbers represent for each predictor layer how many
+ pixels apart the anchor box center points should be vertically and horizontally along the spatial grid over
+ the image. If the list contains ints/floats, then that value will be used for both spatial dimensions.
+ If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
+ If no steps are provided, then they will be computed such that the anchor box center points will form an
+ equidistant grid within the image dimensions.
+ offsets (list, optional): `None` or a list with as many elements as there are predictor layers. The elements can be
+ either floats or tuples of two floats. These numbers represent for each predictor layer how many
+            pixels from the top and left borders of the image the top-most and left-most anchor box center points should be
+ as a fraction of `steps`. The last bit is important: The offsets are not absolute pixel values, but fractions
+ of the step size specified in the `steps` argument. If the list contains floats, then that value will
+ be used for both spatial dimensions. If the list contains tuples of two floats, then they represent
+ `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they will default to 0.5 of the step size.
+ clip_boxes (bool, optional): If `True`, limits the anchor box coordinates to stay within image boundaries.
+ variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
+ its respective variance value.
+ matching_type (str, optional): Can be either 'multi' or 'bipartite'. In 'bipartite' mode, each ground truth box will
+ be matched only to the one anchor box with the highest IoU overlap. In 'multi' mode, in addition to the aforementioned
+ bipartite matching, all anchor boxes with an IoU overlap greater than or equal to the `pos_iou_threshold` will be
+ matched to a given ground truth box.
+ pos_iou_threshold (float, optional): The intersection-over-union similarity threshold that must be
+ met in order to match a given ground truth box to a given anchor box.
+ neg_iou_limit (float, optional): The maximum allowed intersection-over-union similarity of an
+ anchor box with any ground truth box to be labeled a negative (i.e. background) box. If an
+ anchor box is neither a positive, nor a negative box, it will be ignored during training.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+                to the boxes, but not the other.
+ coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input format
+ of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width,
+ and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): If `True`, the encoder uses relative instead of absolute coordinates.
+                This means instead of using absolute target coordinates, the encoder will scale all coordinates to be within [0,1].
+ This way learning becomes independent of the input image size.
+ background_id (int, optional): Determines which class ID is for the background class.
+ '''
+ predictor_sizes = np.array(predictor_sizes)
+ if predictor_sizes.ndim == 1:
+ predictor_sizes = np.expand_dims(predictor_sizes, axis=0)
+
+ ##################################################################################
+ # Handle exceptions.
+ ##################################################################################
+
+ if (min_scale is None or max_scale is None) and scales is None:
+ raise ValueError("Either `min_scale` and `max_scale` or `scales` need to be specified.")
+
+ if scales:
+ if (len(scales) != predictor_sizes.shape[0] + 1): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`
+ raise ValueError("It must be either scales is None or len(scales) == len(predictor_sizes)+1, but len(scales) == {} and len(predictor_sizes)+1 == {}".format(len(scales), len(predictor_sizes)+1))
+ scales = np.array(scales)
+ if np.any(scales <= 0):
+ raise ValueError("All values in `scales` must be greater than 0, but the passed list of scales is {}".format(scales))
+ else: # If no list of scales was passed, we need to make sure that `min_scale` and `max_scale` are valid values.
+ if not 0 < min_scale <= max_scale:
+ raise ValueError("It must be 0 < min_scale <= max_scale, but it is min_scale = {} and max_scale = {}".format(min_scale, max_scale))
+
+ if not (aspect_ratios_per_layer is None):
+ if (len(aspect_ratios_per_layer) != predictor_sizes.shape[0]): # Must be two nested `if` statements since `list` and `bool` cannot be combined by `&`
+ raise ValueError("It must be either aspect_ratios_per_layer is None or len(aspect_ratios_per_layer) == len(predictor_sizes), but len(aspect_ratios_per_layer) == {} and len(predictor_sizes) == {}".format(len(aspect_ratios_per_layer), len(predictor_sizes)))
+ for aspect_ratios in aspect_ratios_per_layer:
+ if np.any(np.array(aspect_ratios) <= 0):
+ raise ValueError("All aspect ratios must be greater than zero.")
+ else:
+ if (aspect_ratios_global is None):
+ raise ValueError("At least one of `aspect_ratios_global` and `aspect_ratios_per_layer` must not be `None`.")
+ if np.any(np.array(aspect_ratios_global) <= 0):
+ raise ValueError("All aspect ratios must be greater than zero.")
+
+ if len(variances) != 4:
+ raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
+ variances = np.array(variances)
+ if np.any(variances <= 0):
+ raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
+
+ if not (coords == 'minmax' or coords == 'centroids' or coords == 'corners'):
+ raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
+
+ if (not (steps is None)) and (len(steps) != predictor_sizes.shape[0]):
+ raise ValueError("You must provide at least one step value per predictor layer.")
+
+ if (not (offsets is None)) and (len(offsets) != predictor_sizes.shape[0]):
+ raise ValueError("You must provide at least one offset value per predictor layer.")
+
+ ##################################################################################
+ # Set or compute members.
+ ##################################################################################
+
+ self.img_height = img_height
+ self.img_width = img_width
+ self.n_classes = n_classes + 1 # + 1 for the background class
+ self.predictor_sizes = predictor_sizes
+ self.min_scale = min_scale
+ self.max_scale = max_scale
+ # If `scales` is None, compute the scaling factors by linearly interpolating between
+ # `min_scale` and `max_scale`. If an explicit list of `scales` is given, however,
+        # then it takes precedence over `min_scale` and `max_scale`.
+ if (scales is None):
+ self.scales = np.linspace(self.min_scale, self.max_scale, len(self.predictor_sizes)+1)
+ else:
+ # If a list of scales is given explicitly, we'll use that instead of computing it from `min_scale` and `max_scale`.
+ self.scales = scales
+ # If `aspect_ratios_per_layer` is None, then we use the same list of aspect ratios
+ # `aspect_ratios_global` for all predictor layers. If `aspect_ratios_per_layer` is given,
+        # however, then it takes precedence over `aspect_ratios_global`.
+ if (aspect_ratios_per_layer is None):
+ self.aspect_ratios = [aspect_ratios_global] * predictor_sizes.shape[0]
+ else:
+ # If aspect ratios are given per layer, we'll use those.
+ self.aspect_ratios = aspect_ratios_per_layer
+ self.two_boxes_for_ar1 = two_boxes_for_ar1
+ if not (steps is None):
+ self.steps = steps
+ else:
+ self.steps = [None] * predictor_sizes.shape[0]
+ if not (offsets is None):
+ self.offsets = offsets
+ else:
+ self.offsets = [None] * predictor_sizes.shape[0]
+ self.clip_boxes = clip_boxes
+ self.variances = variances
+ self.matching_type = matching_type
+ self.pos_iou_threshold = pos_iou_threshold
+ self.neg_iou_limit = neg_iou_limit
+ self.border_pixels = border_pixels
+ self.coords = coords
+ self.normalize_coords = normalize_coords
+ self.background_id = background_id
+
+ # Compute the number of boxes per spatial location for each predictor layer.
+ # For example, if a predictor layer has three different aspect ratios, [1.0, 0.5, 2.0], and is
+ # supposed to predict two boxes of slightly different size for aspect ratio 1.0, then that predictor
+ # layer predicts a total of four boxes at every spatial location across the feature map.
+ if not (aspect_ratios_per_layer is None):
+ self.n_boxes = []
+ for aspect_ratios in aspect_ratios_per_layer:
+ if (1 in aspect_ratios) & two_boxes_for_ar1:
+ self.n_boxes.append(len(aspect_ratios) + 1)
+ else:
+ self.n_boxes.append(len(aspect_ratios))
+ else:
+ if (1 in aspect_ratios_global) & two_boxes_for_ar1:
+ self.n_boxes = len(aspect_ratios_global) + 1
+ else:
+ self.n_boxes = len(aspect_ratios_global)
+
+ ##################################################################################
+ # Compute the anchor boxes for each predictor layer.
+ ##################################################################################
+
+ # Compute the anchor boxes for each predictor layer. We only have to do this once
+ # since the anchor boxes depend only on the model configuration, not on the input data.
+ # For each predictor layer (i.e. for each scaling factor) the tensors for that layer's
+ # anchor boxes will have the shape `(feature_map_height, feature_map_width, n_boxes, 4)`.
+
+        self.boxes_list = [] # This will store the anchor boxes for each predictor layer.
+
+ # The following lists just store diagnostic information. Sometimes it's handy to have the
+ # boxes' center points, heights, widths, etc. in a list.
+ self.wh_list_diag = [] # Box widths and heights for each predictor layer
+ self.steps_diag = [] # Horizontal and vertical distances between any two boxes for each predictor layer
+ self.offsets_diag = [] # Offsets for each predictor layer
+ self.centers_diag = [] # Anchor box center points as `(cy, cx)` for each predictor layer
+
+ # Iterate over all predictor layers and compute the anchor boxes for each one.
+ for i in range(len(self.predictor_sizes)):
+ boxes, center, wh, step, offset = self.generate_anchor_boxes_for_layer(feature_map_size=self.predictor_sizes[i],
+ aspect_ratios=self.aspect_ratios[i],
+ this_scale=self.scales[i],
+ next_scale=self.scales[i+1],
+ this_steps=self.steps[i],
+ this_offsets=self.offsets[i],
+ diagnostics=True)
+ self.boxes_list.append(boxes)
+ self.wh_list_diag.append(wh)
+ self.steps_diag.append(step)
+ self.offsets_diag.append(offset)
+ self.centers_diag.append(center)
+
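+    # A usage sketch with hypothetical values (the predictor sizes would normally be
+    # taken from the model, e.g. via `return_predictor_sizes=True`):
+    #
+    #     encoder = SSDInputEncoder(img_height=300, img_width=300, n_classes=20,
+    #                               predictor_sizes=[(38, 38), (19, 19), (10, 10), (5, 5)],
+    #                               scales=[0.1, 0.3, 0.5, 0.7, 0.9],
+    #                               aspect_ratios_global=[0.5, 1.0, 2.0])
+    #     y_encoded = encoder(ground_truth_labels)  # shape (batch_size, #boxes, n_classes + 1 + 12)
+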
+ def __call__(self, ground_truth_labels, diagnostics=False):
+ '''
+ Converts ground truth bounding box data into a suitable format to train an SSD model.
+
+ Arguments:
+ ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array
+ for each batch image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging
+ to the respective image, and the data for each ground truth bounding box has the format
+ `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format), and `class_id` must be
+ an integer greater than 0 for all boxes as class ID 0 is reserved for the background class.
+ diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
+ but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
+ This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
+ boxes.
+
+ Returns:
+ `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
+ ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
+            model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
+ the last axis are the box coordinates, the next four elements after that are just dummy elements, and
+ the last four elements are the variances.
+ '''
+
+ # Mapping to define which indices represent which coordinates in the ground truth.
+ class_id = 0
+ xmin = 1
+ ymin = 2
+ xmax = 3
+ ymax = 4
+
+ batch_size = len(ground_truth_labels)
+
+ ##################################################################################
+ # Generate the template for y_encoded.
+ ##################################################################################
+
+ y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)
+
+ ##################################################################################
+ # Match ground truth boxes to anchor boxes.
+ ##################################################################################
+
+ # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
+ # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
+ # than or equal to `neg_iou_limit` will be a negative (background) box.
+
+ y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default.
+ n_boxes = y_encoded.shape[1] # The total number of boxes that the model predicts per batch item
+ class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors
+
+ for i in range(batch_size): # For each batch item...
+
+ if ground_truth_labels[i].size == 0: continue # If there is no ground truth for this batch item, there is nothing to match.
+ labels = ground_truth_labels[i].astype(np.float) # The labels for this batch item
+
+ # Check for degenerate ground truth bounding boxes before attempting any computations.
+ if np.any(labels[:,[xmax]] - labels[:,[xmin]] <= 0) or np.any(labels[:,[ymax]] - labels[:,[ymin]] <= 0):
+ raise DegenerateBoxError("SSDInputEncoder detected degenerate ground truth bounding boxes for batch item {} with bounding boxes {}, ".format(i, labels) +
+ "i.e. bounding boxes where xmax <= xmin and/or ymax <= ymin. Degenerate ground truth " +
+ "bounding boxes will lead to NaN errors during the training.")
+
+ # Maybe normalize the box coordinates.
+ if self.normalize_coords:
+ labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
+ labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width
+
+ # Maybe convert the box coordinate format.
+ if self.coords == 'centroids':
+ labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids', border_pixels=self.border_pixels)
+ elif self.coords == 'minmax':
+ labels = convert_coordinates(labels, start_index=xmin, conversion='corners2minmax')
+
+ classes_one_hot = class_vectors[labels[:, class_id].astype(np.int)] # The one-hot class IDs for the ground truth boxes of this batch item
+ labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1) # The one-hot version of the labels for this batch item
+
+ # Compute the IoU similarities between all anchor boxes and all ground truth boxes for this batch item.
+ # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)`.
+ similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords, mode='outer_product', border_pixels=self.border_pixels)
+
+ # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box with the highest IoU.
+ # This ensures that each ground truth box will have at least one good match.
+
+ # For each ground truth box, get the anchor box to match with it.
+ bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)
+
+ # Write the ground truth data to the matched anchor boxes.
+ y_encoded[i, bipartite_matches, :-8] = labels_one_hot
+
+ # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
+ similarities[:, bipartite_matches] = 0
+
+ # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most similar
+ # ground truth box with an IoU of at least `pos_iou_threshold`, or not matched if there is no
+ # such ground truth box.
+
+ if self.matching_type == 'multi':
+
+ # Get all matches that satisfy the IoU threshold.
+ matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)
+
+ # Write the ground truth data to the matched anchor boxes.
+ y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]
+
+ # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
+ similarities[:, matches[1]] = 0
+
+ # Third: Now after the matching is done, all negative (background) anchor boxes that have
+            # an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
+ # i.e. they will no longer be background boxes. These anchors are "too close" to a
+ # ground truth box to be valid background boxes.
+
+ max_background_similarities = np.amax(similarities, axis=0)
+ neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
+ y_encoded[i, neutral_boxes, self.background_id] = 0
+
+ ##################################################################################
+ # Convert box coordinates to anchor box offsets.
+ ##################################################################################
+
+ if self.coords == 'centroids':
+ y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
+ y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
+ y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor)
+ y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
+ elif self.coords == 'corners':
+ y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
+ y_encoded[:,:,[-12,-10]] /= np.expand_dims(y_encoded[:,:,-6] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
+ y_encoded[:,:,[-11,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-7], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
+ y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
+ elif self.coords == 'minmax':
+ y_encoded[:,:,-12:-8] -= y_encoded[:,:,-8:-4] # (gt - anchor) for all four coordinates
+ y_encoded[:,:,[-12,-11]] /= np.expand_dims(y_encoded[:,:,-7] - y_encoded[:,:,-8], axis=-1) # (xmin(gt) - xmin(anchor)) / w(anchor), (xmax(gt) - xmax(anchor)) / w(anchor)
+ y_encoded[:,:,[-10,-9]] /= np.expand_dims(y_encoded[:,:,-5] - y_encoded[:,:,-6], axis=-1) # (ymin(gt) - ymin(anchor)) / h(anchor), (ymax(gt) - ymax(anchor)) / h(anchor)
+ y_encoded[:,:,-12:-8] /= y_encoded[:,:,-4:] # (gt - anchor) / size(anchor) / variance for all four coordinates, where 'size' refers to w and h respectively
+
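+        # A worked example of the 'centroids' case with purely illustrative numbers:
+        # a ground truth box (cx=0.50, cy=0.50, w=0.20, h=0.20) matched to an anchor
+        # (cx=0.48, cy=0.52, w=0.25, h=0.25) with variances [0.1, 0.1, 0.2, 0.2] is
+        # encoded as:
+        #     cx offset = (0.50 - 0.48) / (0.25 * 0.1)  =  0.8
+        #     cy offset = (0.50 - 0.52) / (0.25 * 0.1)  = -0.8
+        #     w  offset = ln(0.20 / 0.25) / 0.2         ~ -1.116
+        #     h  offset = ln(0.20 / 0.25) / 0.2         ~ -1.116
+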
+ if diagnostics:
+ # Here we'll save the matched anchor boxes (i.e. anchor boxes that were matched to a ground truth box, but keeping the anchor box coordinates).
+ y_matched_anchors = np.copy(y_encoded)
+ y_matched_anchors[:,:,-12:-8] = 0 # Keeping the anchor box coordinates means setting the offsets to zero.
+ return y_encoded, y_matched_anchors
+ else:
+ return y_encoded
+
+ def generate_anchor_boxes_for_layer(self,
+ feature_map_size,
+ aspect_ratios,
+ this_scale,
+ next_scale,
+ this_steps=None,
+ this_offsets=None,
+ diagnostics=False):
+ '''
+ Computes an array of the spatial positions and sizes of the anchor boxes for one predictor layer
+ of size `feature_map_size == [feature_map_height, feature_map_width]`.
+
+ Arguments:
+ feature_map_size (tuple): A list or tuple `[feature_map_height, feature_map_width]` with the spatial
+ dimensions of the feature map for which to generate the anchor boxes.
+ aspect_ratios (list): A list of floats, the aspect ratios for which anchor boxes are to be generated.
+ All list elements must be unique.
+            this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
+ as a fraction of the shorter side of the input image.
+ next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
+ `self.two_boxes_for_ar1 == True`.
+ diagnostics (bool, optional): If true, the following additional outputs will be returned:
+ 1) A list of the center point `x` and `y` coordinates for each spatial location.
+ 2) A list containing `(width, height)` for each box aspect ratio.
+ 3) A tuple containing `(step_height, step_width)`
+ 4) A tuple containing `(offset_height, offset_width)`
+ This information can be useful to understand in just a few numbers what the generated grid of
+ anchor boxes actually looks like, i.e. how large the different boxes are and how dense
+ their spatial distribution is, in order to determine whether the box grid covers the input images
+ appropriately and whether the box sizes are appropriate to fit the sizes of the objects
+ to be detected.
+
+ Returns:
+ A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)` where the
+        last dimension contains the four coordinates of each anchor box (in the coordinate format given by `self.coords`) for each cell of the feature map.
+ '''
+ # Compute box width and height for each aspect ratio.
+
+ # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
+ size = min(self.img_height, self.img_width)
+        # Compute the box widths and heights for all aspect ratios
+ wh_list = []
+ for ar in aspect_ratios:
+ if (ar == 1):
+ # Compute the regular anchor box for aspect ratio 1.
+ box_height = box_width = this_scale * size
+ wh_list.append((box_width, box_height))
+ if self.two_boxes_for_ar1:
+ # Compute one slightly larger version using the geometric mean of this scale value and the next.
+ box_height = box_width = np.sqrt(this_scale * next_scale) * size
+ wh_list.append((box_width, box_height))
+ else:
+ box_width = this_scale * size * np.sqrt(ar)
+ box_height = this_scale * size / np.sqrt(ar)
+ wh_list.append((box_width, box_height))
+ wh_list = np.array(wh_list)
+ n_boxes = len(wh_list)
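+        # For example, with purely illustrative values: a 300x300 input, this_scale=0.2,
+        # next_scale=0.4, aspect_ratios=[1.0, 2.0, 0.5] and two_boxes_for_ar1=True
+        # yield four boxes per cell with (w, h) of roughly (60, 60), (84.9, 84.9),
+        # (84.9, 42.4) and (42.4, 84.9) pixels.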
+
+ # Compute the grid of box center points. They are identical for all aspect ratios.
+
+ # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally.
+ if (this_steps is None):
+ step_height = self.img_height / feature_map_size[0]
+ step_width = self.img_width / feature_map_size[1]
+ else:
+ if isinstance(this_steps, (list, tuple)) and (len(this_steps) == 2):
+ step_height = this_steps[0]
+ step_width = this_steps[1]
+ elif isinstance(this_steps, (int, float)):
+ step_height = this_steps
+ step_width = this_steps
+ # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image.
+ if (this_offsets is None):
+ offset_height = 0.5
+ offset_width = 0.5
+ else:
+ if isinstance(this_offsets, (list, tuple)) and (len(this_offsets) == 2):
+ offset_height = this_offsets[0]
+ offset_width = this_offsets[1]
+ elif isinstance(this_offsets, (int, float)):
+ offset_height = this_offsets
+ offset_width = this_offsets
+ # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
+ cy = np.linspace(offset_height * step_height, (offset_height + feature_map_size[0] - 1) * step_height, feature_map_size[0])
+ cx = np.linspace(offset_width * step_width, (offset_width + feature_map_size[1] - 1) * step_width, feature_map_size[1])
+ cx_grid, cy_grid = np.meshgrid(cx, cy)
+ cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
+ cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
+
+ # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
+ # where the last dimension will contain `(cx, cy, w, h)`
+ boxes_tensor = np.zeros((feature_map_size[0], feature_map_size[1], n_boxes, 4))
+
+ boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, n_boxes)) # Set cx
+ boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, n_boxes)) # Set cy
+ boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
+ boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h
+
+ # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`
+ boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners')
+
+ # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries
+ if self.clip_boxes:
+ x_coords = boxes_tensor[:,:,:,[0, 2]]
+ x_coords[x_coords >= self.img_width] = self.img_width - 1
+ x_coords[x_coords < 0] = 0
+ boxes_tensor[:,:,:,[0, 2]] = x_coords
+ y_coords = boxes_tensor[:,:,:,[1, 3]]
+ y_coords[y_coords >= self.img_height] = self.img_height - 1
+ y_coords[y_coords < 0] = 0
+ boxes_tensor[:,:,:,[1, 3]] = y_coords
+
+        # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
+ if self.normalize_coords:
+ boxes_tensor[:, :, :, [0, 2]] /= self.img_width
+ boxes_tensor[:, :, :, [1, 3]] /= self.img_height
+
+ # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
+ if self.coords == 'centroids':
+ # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
+ boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half')
+ elif self.coords == 'minmax':
+            # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`.
+ boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half')
+
+ if diagnostics:
+ return boxes_tensor, (cy, cx), wh_list, (step_height, step_width), (offset_height, offset_width)
+ else:
+ return boxes_tensor
+
+ def generate_encoding_template(self, batch_size, diagnostics=False):
+ '''
+ Produces an encoding template for the ground truth label tensor for a given batch.
+
+ Note that all tensor creation, reshaping and concatenation operations performed in this function
+ and the sub-functions it calls are identical to those performed inside the SSD model. This, of course,
+ must be the case in order to preserve the spatial meaning of each box prediction, but it's useful to make
+ yourself aware of this fact and why it is necessary.
+
+        In other words, the boxes in `y_encoded` must have a specific order in order to correspond to the right spatial
+ positions and scales of the boxes predicted by the model. The sequence of operations here ensures that `y_encoded`
+ has this specific form.
+
+ Arguments:
+ batch_size (int): The batch size.
+            diagnostics (bool, optional): See the documentation for `generate_anchor_boxes_for_layer()`. The diagnostic output
+ here is similar, just for all predictor conv layers.
+
+ Returns:
+ A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, the template into which to encode
+ the ground truth labels for training. The last axis has length `#classes + 12` because the model
+ output contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for
+ the anchor boxes and the 4 variance values.
+ '''
+ # Tile the anchor boxes for each predictor layer across all batch items.
+ boxes_batch = []
+ for boxes in self.boxes_list:
+            # Prepend one dimension to the anchor box tensor to account for the batch size and tile it along that dimension.
+ # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 4)`
+ boxes = np.expand_dims(boxes, axis=0)
+ boxes = np.tile(boxes, (batch_size, 1, 1, 1, 1))
+
+ # Now reshape the 5D tensor above into a 3D tensor of shape
+ # `(batch, feature_map_height * feature_map_width * n_boxes, 4)`. The resulting
+ # order of the tensor content will be identical to the order obtained from the reshaping operation
+ # in our Keras model (we're using the Tensorflow backend, and tf.reshape() and np.reshape()
+ # use the same default index order, which is C-like index ordering)
+ boxes = np.reshape(boxes, (batch_size, -1, 4))
+ boxes_batch.append(boxes)
+
+ # Concatenate the anchor tensors from the individual layers to one.
+ boxes_tensor = np.concatenate(boxes_batch, axis=1)
+
+ # 3: Create a template tensor to hold the one-hot class encodings of shape `(batch, #boxes, #classes)`
+ # It will contain all zeros for now, the classes will be set in the matching process that follows
+ classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes))
+
+ # 4: Create a tensor to contain the variances. This tensor has the same shape as `boxes_tensor` and simply
+ # contains the same 4 variance values for every position in the last axis.
+ variances_tensor = np.zeros_like(boxes_tensor)
+ variances_tensor += self.variances # Long live broadcasting
+
+        # 5: Concatenate the classes, boxes and variances tensors to get our final template for y_encoded. We also need
+ # another tensor of the shape of `boxes_tensor` as a space filler so that `y_encoding_template` has the same
+ # shape as the SSD model output tensor. The content of this tensor is irrelevant, we'll just use
+ # `boxes_tensor` a second time.
+ y_encoding_template = np.concatenate((classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2)
+
+ if diagnostics:
+ return y_encoding_template, self.centers_diag, self.wh_list_diag, self.steps_diag, self.offsets_diag
+ else:
+ return y_encoding_template
+
+class DegenerateBoxError(Exception):
+ '''
+ An exception class to be raised if degenerate boxes are being detected.
+ '''
+ pass
diff --git a/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/ssd_output_decoder.py b/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/ssd_output_decoder.py
new file mode 100644
index 0000000..c22356f
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/ssd_encoder_decoder/ssd_output_decoder.py
@@ -0,0 +1,532 @@
+'''
+Includes:
+* Functions to decode and filter raw SSD model output. These are only needed if the
+ SSD model does not have a `DecodeDetections` layer.
+* Functions to perform greedy non-maximum suppression
+
+Copyright (C) 2018 Pierluigi Ferrari
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+from __future__ import division
+
+import numpy as np
+
+from engine.object_detection_branch.single_shot_detector.bounding_box_utils.bounding_box_utils import iou, convert_coordinates
+
+
+def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
+ '''
+ Perform greedy non-maximum suppression on the input boxes.
+
+ Greedy NMS works by selecting the box with the highest score and
+ removing all boxes around it that are too close to it measured by IoU-similarity.
+ Out of the boxes that are left over, once again the one with the highest
+ score is selected and so on, until no boxes with too much overlap are left.
+
+ Arguments:
+ y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this
+ is a list of length `n` where each list element is a 2D Numpy array.
+ For a batch item with `k` predicted boxes this 2D Numpy array has
+ shape `(k, 6)`, where each row contains the coordinates of the respective
+            box in the format `[class_id, score, xmin, ymin, xmax, ymax]`.
+            The element at index 1 of each row must be the score assigned to the
+            prediction, and the four box coordinates must follow from index 2
+            onwards in the coordinate format specified by `coords`. Note that this function is
+ agnostic to the scale of the score or what it represents.
+ iou_threshold (float, optional): All boxes with a Jaccard similarity of
+ greater than `iou_threshold` with a locally maximal box will be removed
+ from the set of predictions, where 'maximal' refers to the box score.
+ coords (str, optional): The coordinate format of `y_pred_decoded`.
+ Can be one of the formats supported by `iou()`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+            to the boxes, but not the other.
+
+ Returns:
+ The predictions after removing non-maxima. The format is the same as the input format.
+ '''
+ y_pred_decoded_nms = []
+ for batch_item in y_pred_decoded: # For the labels of each batch item...
+ boxes_left = np.copy(batch_item)
+ maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
+ while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
+ maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
+ maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
+ maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
+ boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
+ if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
+ similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
+ y_pred_decoded_nms.append(np.array(maxima))
+
+ return y_pred_decoded_nms
+
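+# A minimal worked example with hypothetical boxes in the default 'corners' format
+# [class_id, score, xmin, ymin, xmax, ymax]:
+#
+#     import numpy as np
+#     batch = [np.array([[1, 0.9,  10,  10,  50,  50],
+#                        [1, 0.8,  12,  12,  52,  52],
+#                        [1, 0.7, 100, 100, 140, 140]])]
+#     greedy_nms(batch, iou_threshold=0.45)
+#
+# The second box overlaps the highest-scoring first box with an IoU of roughly 0.8
+# and is suppressed; the first and third boxes are kept.
+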
+def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
+ '''
+ The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
+ function for per-class NMS in `decode_detections()`.
+ '''
+ boxes_left = np.copy(predictions)
+ maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
+ while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
+ maximum_index = np.argmax(boxes_left[:,0]) # ...get the index of the next box with the highest confidence...
+ maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
+ maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
+ boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
+ if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
+ similarities = iou(boxes_left[:,1:], maximum_box[1:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
+ return np.array(maxima)
+
+def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
+ '''
+ The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
+ function in `decode_detections_fast()`.
+ '''
+ boxes_left = np.copy(predictions)
+ maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
+ while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
+ maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
+ maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
+ maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
+ boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
+ if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
+ similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
+ return np.array(maxima)
+
+def decode_detections(y_pred,
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ input_coords='centroids',
+ normalize_coords=True,
+ img_height=None,
+ img_width=None,
+ border_pixels='half'):
+ '''
+ Convert model prediction output back to a format that contains only the positive box predictions
+ (i.e. the same format that `SSDInputEncoder` takes as input).
+
+ After the decoding, two stages of prediction filtering are performed for each class individually:
+ First confidence thresholding, then greedy non-maximum suppression. The filtering results for all
+ classes are concatenated and the `top_k` overall highest confidence results constitute the final
+ predictions for a given batch item. This procedure follows the original Caffe implementation.
+ For a slightly different and more efficient alternative to decode raw model output that performs
+    non-maximum suppression globally instead of per class, see `decode_detections_fast()` below.
+
+ Arguments:
+ y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
+ of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
+ boxes predicted by the model per image and the last axis contains
+ `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+ If 'half', then one of each of the two horizontal and vertical borders belong
+            to the boxes, but not the other.
+
+ Returns:
+ A python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
+
+ y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
+
+ if input_coords == 'centroids':
+ y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
+ y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
+ y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
+ y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
+ elif input_coords == 'minmax':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
+ elif input_coords == 'corners':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ else:
+ raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
+
+ # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
+
+ if normalize_coords:
+ y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
+ y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
+
+ # 3: Apply confidence thresholding and non-maximum suppression per class
+
+ n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates
+
+ y_pred_decoded = [] # Store the final predictions in this list
+ for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
+ pred = [] # Store the final predictions for this batch item here
+ for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
+ single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
+ threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
+ if threshold_met.shape[0] > 0: # If any boxes made the threshold...
+ maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
+                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 6]`
+ maxima_output[:,0] = class_id # Write the class ID to the first column...
+ maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
+ pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
+ # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
+ if pred: # If there are any predictions left after confidence-thresholding...
+ pred = np.concatenate(pred, axis=0)
+ if top_k != 'all' and pred.shape[0] > top_k: # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
+ top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
+ pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
+ else:
+ pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
+ y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
+
+ return y_pred_decoded
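+
+# ------------------------------------------------------------------------------
+# Hedged usage sketch (not part of the original implementation): a minimal
+# example of how `decode_detections()` might be called on a raw SSD output
+# tensor. The batch size, box count, class count and the 300x300 image size
+# below are illustrative assumptions.
+def _decode_detections_usage_sketch():
+    dummy_y_pred = np.random.rand(1, 100, 21 + 12)  # [one-hot classes, 4 offsets, 4 anchor coords, 4 variances]
+    decoded = decode_detections(dummy_y_pred,
+                                confidence_thresh=0.5,
+                                iou_threshold=0.45,
+                                top_k=200,
+                                input_coords='centroids',
+                                normalize_coords=True,
+                                img_height=300,
+                                img_width=300)
+    # `decoded` is a list with one entry per batch item; each entry is an array of
+    # shape `(n_kept_boxes, 6)` with rows `[class_id, confidence, xmin, ymin, xmax, ymax]`.
+    return decoded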
+
+def decode_detections_fast(y_pred,
+ confidence_thresh=0.5,
+ iou_threshold=0.45,
+ top_k='all',
+ input_coords='centroids',
+ normalize_coords=True,
+ img_height=None,
+ img_width=None,
+ border_pixels='half'):
+ '''
+ Convert model prediction output back to a format that contains only the positive box predictions
+    (i.e. the same format that `encode_y()` takes as input).
+
+ Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.
+
+ Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
+ For each box, the procedure used here assigns the box's highest confidence as its predicted class. Then it removes
+ all boxes for which the highest confidence is the background class. This results in less work for the subsequent
+ non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
+ their highest confidence is for the background class. It is much more efficient than the procedure of the original
+ implementation, but the results may also differ.
+
+ Arguments:
+ y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
+ of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
+ boxes predicted by the model per image and the last axis contains
+ `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
+ class required for a given box to be considered a positive prediction. A lower value will result
+ in better recall, while a higher value will result in better precision. Do not use this parameter with the
+ goal to combat the inevitably many duplicates that an SSD will produce, the subsequent non-maximum suppression
+ stage will take care of those.
+ iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
+ performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
+ all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
+ from the set of predictions, where 'maximal' refers to the box score.
+ top_k (int, optional): 'all' or an integer with number of highest scoring predictions to be kept for each batch item
+ after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+            If 'half', then one of each of the two horizontal and vertical borders belongs
+            to the boxes, but not the other.
+
+ Returns:
+ A python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
+        a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ # 1: Convert the classes from one-hot encoding to their class ID
+    y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements to which we'll write the class IDs and confidences in the next step
+ y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class ID
+ y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too
+
+ # 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
+ if input_coords == 'centroids':
+ y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
+ y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
+ y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
+ y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
+ y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
+ elif input_coords == 'minmax':
+ y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
+ elif input_coords == 'corners':
+ y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_converted[:,:,[3,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ else:
+        raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
+
+ # 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
+ if normalize_coords:
+ y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates
+ y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates
+
+ # 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
+ y_pred_decoded = []
+ for batch_item in y_pred_converted: # For each image in the batch...
+ boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,...
+ boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
+ if iou_threshold: # ...if an IoU threshold is set...
+ boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes.
+ if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point...
+ top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes...
+ boxes = boxes[top_k_indices] # ...and keep only those boxes...
+ y_pred_decoded.append(boxes) # ...and now that we're done, append the array of final predictions for this batch item to the output list
+
+ return y_pred_decoded
+
+################################################################################################
+# Debugging tools, not relevant for normal use
+################################################################################################
+
+# The functions below are for debugging, so you won't normally need them. That is,
+# unless you need to debug your model, of course.
+
+def decode_detections_debug(y_pred,
+ confidence_thresh=0.01,
+ iou_threshold=0.45,
+ top_k=200,
+ input_coords='centroids',
+ normalize_coords=True,
+ img_height=None,
+ img_width=None,
+ variance_encoded_in_target=False,
+ border_pixels='half'):
+ '''
+ This decoder performs the same processing as `decode_detections()`, but the output format for each left-over
+ predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
+
+ That is, in addition to the usual data, each predicted box has the internal index of that box within
+ the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given
+ box prediction; in particular, it allows you to know which predictor layer made a given prediction.
+ This can be useful for debugging.
+
+ Arguments:
+ y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
+ of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
+ boxes predicted by the model per image and the last axis contains
+ `[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
+ confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
+ positive class in order to be considered for the non-maximum suppression stage for the respective class.
+ A lower value will result in a larger part of the selection process being done by the non-maximum suppression
+ stage, while a larger value will result in a larger part of the selection process happening in the confidence
+ thresholding stage.
+ iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
+ with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
+ to the box score.
+ top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
+ non-maximum suppression stage.
+ input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
+ for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
+ `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
+ normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
+ and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
+ relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
+ Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
+ coordinates. Requires `img_height` and `img_width` if set to `True`.
+ img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
+ img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
+ border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
+ Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
+ to the boxes. If 'exclude', the border pixels do not belong to the boxes.
+            If 'half', then one of each of the two horizontal and vertical borders belongs
+            to the boxes, but not the other.
+
+ Returns:
+ A python list of length `batch_size` where each list element represents the predicted boxes
+ for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
+ a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
+ '''
+ if normalize_coords and ((img_height is None) or (img_width is None)):
+ raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
+
+ # 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
+
+ y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
+
+ if input_coords == 'centroids':
+ if variance_encoded_in_target:
+ # Decode the predicted box center x and y coordinates.
+ y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
+            # Decode the predicted box width and height.
+ y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
+ else:
+ # Decode the predicted box center x and y coordinates.
+ y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
+            # Decode the predicted box width and height.
+ y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
+ elif input_coords == 'minmax':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
+ elif input_coords == 'corners':
+ y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
+ y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
+ y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
+ y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
+ else:
+ raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
+
+ # 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
+
+ if normalize_coords:
+ y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
+ y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
+
+ # 3: For each batch item, prepend each box's internal index to its coordinates.
+
+ y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
+ y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
+ y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
+ y_pred_decoded_raw = y_pred_decoded_raw2
+
+ # 4: Apply confidence thresholding and non-maximum suppression per class
+
+ n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index
+
+ y_pred_decoded = [] # Store the final predictions in this list
+ for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
+ pred = [] # Store the final predictions for this batch item here
+ for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
+ single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
+ threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
+ if threshold_met.shape[0] > 0: # If any boxes made the threshold...
+ maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
+                maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an array of shape `[n_boxes, 7]`
+ maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
+ maxima_output[:,1] = class_id # ...and write the class ID to the second column...
+ maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
+ pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
+ # Once we're through with all classes, keep only the `top_k` maxima with the highest scores
+ pred = np.concatenate(pred, axis=0)
+ if pred.shape[0] > top_k: # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
+ top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
+ pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
+ y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
+
+ return y_pred_decoded
+
+def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
+ '''
+ The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
+ function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
+ left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
+ box and is thus useful for debugging.
+ '''
+ boxes_left = np.copy(predictions)
+ maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
+ while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
+ maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
+ maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
+ maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
+ boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
+ if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
+ similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
+ boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
+ return np.array(maxima)
+
+def get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1):
+ '''
+ Returns a list of the number of boxes that each predictor layer predicts.
+
+ `aspect_ratios` must be a nested list, containing a list of aspect ratios
+ for each predictor layer.
+ '''
+ num_boxes_per_pred_layer = []
+ for i in range(len(predictor_sizes)):
+ if two_boxes_for_ar1:
+ num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * (len(aspect_ratios[i]) + 1))
+ else:
+ num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * len(aspect_ratios[i]))
+ return num_boxes_per_pred_layer
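+
+# Hedged worked example (illustrative values matching the standard SSD300 configuration,
+# not anything computed in this file): for the predictor grid sizes and per-layer aspect
+# ratios assumed below, the per-layer box counts come out as [5776, 2166, 600, 150, 36, 4],
+# i.e. 8732 boxes in total.
+def _num_boxes_per_pred_layer_example():
+    predictor_sizes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
+    aspect_ratios = [[1.0, 2.0, 0.5],
+                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+                     [1.0, 2.0, 0.5],
+                     [1.0, 2.0, 0.5]]
+    return get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1=True)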
+
+def get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer):
+ '''
+ For a given prediction tensor decoded with `decode_detections_debug()`, returns a list
+    with the indices of the predictor layers that made each prediction.
+
+ That is, this function lets you know which predictor layer is responsible
+ for a given prediction.
+
+ Arguments:
+ y_pred_decoded (array): The decoded model output tensor. Must have been
+ decoded with `decode_detections_debug()` so that it contains the internal box index
+ for each predicted box.
+ num_boxes_per_pred_layer (list): A list that contains the total number
+ of boxes that each predictor layer predicts.
+ '''
+ pred_layers_all = []
+ cum_boxes_per_pred_layer = np.cumsum(num_boxes_per_pred_layer)
+ for batch_item in y_pred_decoded:
+ pred_layers = []
+ for prediction in batch_item:
+ if (prediction[0] < 0) or (prediction[0] >= cum_boxes_per_pred_layer[-1]):
+ raise ValueError("Box index is out of bounds of the possible indices as given by the values in `num_boxes_per_pred_layer`.")
+ for i in range(len(cum_boxes_per_pred_layer)):
+ if prediction[0] < cum_boxes_per_pred_layer[i]:
+ pred_layers.append(i)
+ break
+ pred_layers_all.append(pred_layers)
+ return pred_layers_all
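+
+# ------------------------------------------------------------------------------
+# Hedged debugging-workflow sketch (not part of the original implementation): how
+# the helpers above might be chained to find out which predictor layer produced
+# each surviving box. The configuration values are illustrative SSD300-style
+# assumptions; `y_pred` is a raw model output tensor as described in the
+# docstrings above.
+def _debugging_workflow_sketch(y_pred):
+    predictor_sizes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
+    aspect_ratios = [[1.0, 2.0, 0.5],
+                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+                     [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+                     [1.0, 2.0, 0.5],
+                     [1.0, 2.0, 0.5]]
+    # Decode with the debug decoder so that every surviving box keeps its internal box index.
+    y_pred_decoded = decode_detections_debug(y_pred,
+                                             confidence_thresh=0.5,
+                                             iou_threshold=0.45,
+                                             top_k=200,
+                                             input_coords='centroids',
+                                             normalize_coords=True,
+                                             img_height=300,
+                                             img_width=300)
+    # Map each kept box index back to the predictor layer that produced it.
+    num_boxes = get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1=True)
+    return get_pred_layers(y_pred_decoded, num_boxes)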
diff --git a/engine/object_detection_branch/single_shot_detector/training_summaries/ssd300_pascal_07+12_loss_history.png b/engine/object_detection_branch/single_shot_detector/training_summaries/ssd300_pascal_07+12_loss_history.png
new file mode 100644
index 0000000..975707c
Binary files /dev/null and b/engine/object_detection_branch/single_shot_detector/training_summaries/ssd300_pascal_07+12_loss_history.png differ
diff --git a/engine/object_detection_branch/single_shot_detector/training_summaries/ssd300_pascal_07+12_training_summary.md b/engine/object_detection_branch/single_shot_detector/training_summaries/ssd300_pascal_07+12_training_summary.md
new file mode 100755
index 0000000..38a48c5
--- /dev/null
+++ b/engine/object_detection_branch/single_shot_detector/training_summaries/ssd300_pascal_07+12_training_summary.md
@@ -0,0 +1,46 @@
+## SSD300 Pascal VOC 07+12 Training Summary
+---
+
+This is a summary of the training of an SSD300 on the Pascal VOC 2007 `trainval` and 2012 `trainval` image sets using the same configuration as in the original Caffe implementation for that same model.
+
+Since neither the SSD paper nor the GitHub repository of the original Caffe SSD implementation states details on the training progress, only the final evaluation results, the loss curves and intermediate mAP evaluation results provided here may be helpful for comparison with your own training.
+
+What you see below are the training results of running the [`ssd300_training.ipynb`](../ssd300_training.ipynb) notebook as is, in which all parameters are already preset to replicate the training configuration of the original SSD300 "07+12" model. I just made one small change: I occasionally ran into `OOM` errors at batch size 32, so I trained with batch size 31.
+
+Important note about the data shown below:
+
+SGD is inherently unstable at the beginning of the training. Remember that the optimization is stochastic, i.e. if you start a fresh training ten times, the loss pattern over the first training steps can look different each time, and in the case of SGD, very different. One time the loss might decrease smoothly right from the start, which is what happened in my case below. Another time the loss might get temporarily stuck on a plateau very early on such that nothing seems to be happening for a couple of hundred training steps. Yet another time the loss might blow up right at the start and become `NaN`. As long as the loss doesn't become `NaN`, the final convergence loss does, in my experience, not strongly depend on the loss progression in the very early phase of the training. In other words, even if the loss doesn't decrease as fast in the beginning, you will likely still end up with the same convergence loss, it will just take longer to get there. Just as a benchmark, after the first 1,000 training steps I've seen anything between around 10 and 15 as values for the training loss. The Adam optimizer doesn't suffer from this variability to the same extent and is evidently the superior optimizer, but since the original Caffe models were trained with SGD, I used that to reproduce the original results.
+
+### Training and Validation Loss
+
+What you see below are the training and validation loss every 1,000 training steps. The validation loss is computed on the Pascal VOC 2007 `test` image set. In my case it took only around 105,000 instead of the expected 120,000 iterations for the validation loss to converge, but as explained above, it may well take longer. The drop you're seeing at 56,000 training steps was when I reduced the learning rate from 0.001 to 0.0001. The original learning rate schedule applies this reduction only after 80,000 training steps, but since the loss decreased so quickly in the beginning in my case, I had to decrease the learning rate earlier. I reduced the learning rate to 0.00001 after 76,000 training steps and kept it constant from there.
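+
+For reference, the learning rate schedule I ended up with can be summarised as a simple piecewise-constant function of the global training step (a minimal sketch of the schedule described above; how it is wired into the training loop is omitted):
+
+```python
+def lr_schedule(global_step):
+    """Piecewise-constant learning rate as described in the paragraph above."""
+    if global_step < 56000:
+        return 0.001
+    elif global_step < 76000:
+        return 0.0001
+    else:
+        return 0.00001
+```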
+
+
+
+### Mean Average Precision
+
+Here are the intermediate and final mAP values on Pascal VOC 2007 `test`, evaluated using the official Pascal VOCdevkit 2007 Matlab evaluation code. The table shows the best values after every 20,000 training steps. Once again, the progress may be slower depending on how the early phase of the training is going. In another training I started with the same configuration, I got an mAP of only 0.665 after the first 20,000 training steps. The full model after 102,000 training steps can be downloaded [here](https://drive.google.com/open?id=1-MYYaZbIHNPtI2zzklgVBAjssbP06BeA).
+
+| | Steps | 20k | 40k | 60k | 80k | 100k | 102k |
+|-------------|-------|----------|----------|----------|----------|----------|----------|
+|aeroplane | AP | 0.6874 | 0.7401 | 0.7679 | 0.7827 | 0.7912 | 0.7904 |
+|bicycle | AP | 0.7786 | 0.8203 | 0.795 | 0.8436 | 0.8453 | 0.8466 |
+|bird | AP | 0.6855 | 0.6939 | 0.7191 | 0.7564 | 0.7655 | 0.7672 |
+|boat | AP | 0.5804 | 0.6173 | 0.6258 | 0.6866 | 0.6896 | 0.6952 |
+|bottle | AP | 0.3449 | 0.4288 | 0.453 | 0.4681 | 0.4896 | 0.4844 |
+|bus | AP | 0.7771 | 0.8332 | 0.8343 | 0.8525 | 0.8537 | 0.8554 |
+|car | AP | 0.8048 | 0.8435 | 0.8345 | 0.848 | 0.8546 | 0.8543 |
+|cat | AP | 0.852 | 0.7989 | 0.8551 | 0.8759 | 0.8727 | 0.8746 |
+|chair | AP | 0.5085 | 0.5548 | 0.5287 | 0.5873 | 0.5895 | 0.5911 |
+|cow | AP | 0.7359 | 0.7821 | 0.791 | 0.8278 | 0.8271 | 0.8243 |
+|diningtable | AP | 0.6805 | 0.7181 | 0.7502 | 0.7543 | 0.7733 | 0.7614 |
+|dog | AP | 0.8118 | 0.7898 | 0.8222 | 0.8546 | 0.8544 | 0.8552 |
+|horse | AP | 0.823 | 0.8501 | 0.8532 | 0.8586 | 0.8688 | 0.867 |
+|motorbike | AP | 0.7725 | 0.7935 | 0.8081 | 0.845 | 0.8471 | 0.8509 |
+|person | AP | 0.73 | 0.7514 | 0.7634 | 0.7851 | 0.7869 | 0.7862 |
+|pottedplant | AP | 0.4112 | 0.4335 | 0.4982 | 0.5051 | 0.5131 | 0.5182 |
+|sheep | AP | 0.6821 | 0.7324 | 0.7283 | 0.7717 | 0.7783 | 0.7799 |
+|sofa | AP | 0.7417 | 0.7824 | 0.7663 | 0.7928 | 0.7911 | 0.794 |
+|train | AP | 0.7942 | 0.8169 | 0.8326 | 0.867 | 0.862 | 0.8596 |
+|tvmonitor | AP | 0.725 | 0.7301 | 0.7259 | 0.7589 | 0.7649 | 0.7651 |
+| |**mAP**|**0.696** |**0.726** |**0.738** |**0.766** |**0.7709**|**0.7711**|
diff --git a/engine/object_detection_branch/ssd_detector.py b/engine/object_detection_branch/ssd_detector.py
new file mode 100644
index 0000000..ed733ec
--- /dev/null
+++ b/engine/object_detection_branch/ssd_detector.py
@@ -0,0 +1,165 @@
+from keras import backend as K
+from keras.models import load_model
+from keras.preprocessing import image
+from keras.optimizers import Adam
+from scipy.misc import imread
+import numpy as np
+from matplotlib import pyplot as plt
+
+from keras.utils.data_utils import get_file
+
+from engine.object_detection_branch.single_shot_detector.models.keras_ssd300 import ssd_300
+from engine.object_detection_branch.single_shot_detector.keras_loss_function.keras_ssd_loss import SSDLoss
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_DecodeDetections import DecodeDetections
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast
+from engine.object_detection_branch.single_shot_detector.keras_layers.keras_layer_L2Normalization import L2Normalization
+
+from engine.object_detection_branch.single_shot_detector.ssd_encoder_decoder.ssd_output_decoder import decode_detections, decode_detections_fast
+
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_data_generator import DataGenerator
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_geometric_ops import Resize
+from engine.object_detection_branch.single_shot_detector.data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
+
+WEIGHTS_PATH = 'https://github.com/GKalliatakis/Keras-EMOTIC-resources/releases/download/v1.0.1/VGG_VOC0712_SSD_300x300_iter_120000.h5'
+
+def single_shot_detector(img_path,
+ imshow=False):
+ # Set the image size.
+ img_height = 300
+ img_width = 300
+
+ # 1: Build the Keras model
+
+ K.clear_session() # Clear previous models from memory.
+
+ model = ssd_300(image_size=(img_height, img_width, 3),
+ n_classes=20,
+ mode='inference',
+ l2_regularization=0.0005,
+ scales=[0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05],
+ # The scales for MS COCO are [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
+ aspect_ratios_per_layer=[[1.0, 2.0, 0.5],
+ [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+ [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+ [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
+ [1.0, 2.0, 0.5],
+ [1.0, 2.0, 0.5]],
+ two_boxes_for_ar1=True,
+ steps=[8, 16, 32, 64, 100, 300],
+ offsets=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
+ clip_boxes=False,
+ variances=[0.1, 0.1, 0.2, 0.2],
+ normalize_coords=True,
+ subtract_mean=[123, 117, 104],
+ swap_channels=[2, 1, 0],
+ confidence_thresh=0.5,
+ iou_threshold=0.45,
+ top_k=200,
+ nms_max_output_size=400)
+
+ # 2: Load the trained weights into the model.
+
+ weights_path = get_file('VGG_VOC0712_SSD_300x300_iter_120000.h5',
+ WEIGHTS_PATH,
+ cache_subdir='EMOTIC/object_detectors')
+
+ model.load_weights(weights_path, by_name=True)
+
+ # 3: Compile the model so that Keras won't complain the next time you load it.
+
+ adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
+
+ ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
+
+ model.compile(optimizer=adam, loss=ssd_loss.compute_loss)
+
+ orig_images = [] # Store the images here.
+ input_images = [] # Store resized versions of the images here.
+
+ # Load the image
+ orig_images.append(imread(img_path))
+ img = image.load_img(img_path, target_size=(img_height, img_width))
+ img = image.img_to_array(img)
+ input_images.append(img)
+ input_images = np.array(input_images)
+
+ y_pred = model.predict(input_images)
+
+ confidence_threshold = 0.5
+
+ y_pred_thresh = [y_pred[k][y_pred[k, :, 1] > confidence_threshold] for k in range(y_pred.shape[0])]
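+    # Each entry of `y_pred_thresh` is an array of shape `(n_kept_boxes, 6)` whose rows are
+    # `[class_id, confidence, xmin, ymin, xmax, ymax]` in the 300x300 input coordinate frame.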
+
+ np.set_printoptions(precision=2, suppress=True, linewidth=90)
+ # print("Predicted boxes:\n")
+ # print(' class conf xmin ymin xmax ymax')
+ # print(y_pred_thresh[0])
+
+ # nb_persons = len(y_pred_thresh[0])
+
+ # Display the image and draw the predicted boxes onto it.
+
+ # Set the colors for the bounding boxes
+ colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()
+ classes = ['background',
+ 'aeroplane', 'bicycle', 'bird', 'boat',
+ 'bottle', 'bus', 'car', 'cat',
+ 'chair', 'cow', 'diningtable', 'dog',
+ 'horse', 'motorbike', 'person', 'pottedplant',
+ 'sheep', 'sofa', 'train', 'tvmonitor']
+
+ plt.figure(figsize=(20, 12))
+ plt.imshow(orig_images[0])
+
+ current_axis = plt.gca()
+
+ # print len(y_pred_thresh[0])
+
+ final_array = np.empty([len(y_pred_thresh[0]), 4])
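+    # `final_array` will hold one `[xmin, ymin, xmax, ymax]` row in original-image coordinates
+    # per detection; rows belonging to non-person detections are zeroed out in the loop below.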
+
+ counter = 0
+ # add a person counter to exclude labels != person
+ persons_counter = 0
+
+ for box in y_pred_thresh[0]:
+ # Transform the predicted bounding boxes for the 300x300 image to the original image dimensions.
+ xmin = box[2] * orig_images[0].shape[1] / img_width
+ ymin = box[3] * orig_images[0].shape[0] / img_height
+ xmax = box[4] * orig_images[0].shape[1] / img_width
+ ymax = box[5] * orig_images[0].shape[0] / img_height
+ color = colors[int(box[0])]
+ label = '{}: {:.2f}'.format(classes[int(box[0])], box[1])
+
+ current_class = classes[int(box[0])]
+
+ if current_class == 'person':
+ persons_counter = persons_counter + 1
+
+ if box[0] == 15:
+ final_array[counter][0] = xmin
+ final_array[counter][1] = ymin
+ final_array[counter][2] = xmax
+ final_array[counter][3] = ymax
+
+ else:
+ final_array[counter][0] = 0
+ final_array[counter][1] = 0
+ final_array[counter][2] = 0
+ final_array[counter][3] = 0
+
+ counter = counter + 1
+
+ current_axis.add_patch(
+ plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, color=color, fill=False, linewidth=2))
+ current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor': color, 'alpha': 1.0})
+
+ # uncomment to show the input image annotated with the detected objects
+
+ if imshow:
+ plt.show()
+
+
+ # return final_array, len(y_pred_thresh[0])
+
+ return final_array, persons_counter
\ No newline at end of file
diff --git a/evaluation/HRA_2Class_predictions_results b/evaluation/HRA_2Class_predictions_results
new file mode 100644
index 0000000..adf9986
--- /dev/null
+++ b/evaluation/HRA_2Class_predictions_results
@@ -0,0 +1,158 @@
+
+============================= VGG16-cl-1layer(s) =============================
+ Top-1 acc. => 0.62
+ Coverage => 73%
+ Average Precision (AP) => 0.8345215140796967%
+
+ ============================= VGG16-cl-2layer(s) =============================
+ Top-1 acc. => 0.61
+ Coverage => 77%
+ Average Precision (AP) => 0.8135769643619918%
+
+ ============================= VGG16-cl-3layer(s) =============================
+ Top-1 acc. => 0.56
+ Coverage => 83%
+ Average Precision (AP) => 0.8311130979084907%
+
+
+
+ ============================= VGG16-dp-1layer(s) =============================
+ Top-1 acc. => 0.58
+ Coverage => 0%
+ Average Precision (AP) => 0.6372331501551507%
+
+ ============================= VGG16-dp-2layer(s) =============================
+ Top-1 acc. => 0.63
+ Coverage => 43%
+ Average Precision (AP) => 0.7627484135065733%
+
+ ============================= VGG16-dp-3layer(s) =============================
+ Top-1 acc. => 0.72
+ Coverage => 69%
+ Average Precision (AP) => 0.8826589518307391%
+
+
+
+
+---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~
+
+
+============================= VGG19-cl-1layer(s) =============================
+ Top-1 acc. => 0.65
+ Coverage => 30%
+ Average Precision (AP) => 0.8239785274488193%
+
+
+============================= VGG19-cl-2layer(s) =============================
+ Top-1 acc. => 0.61
+ Coverage => 64%
+ Average Precision (AP) => 0.8299190028154326%
+
+
+ ============================= VGG19-cl-3layer(s) =============================
+ Top-1 acc. => 0.55
+ Coverage => 87%
+ Average Precision (AP) => 0.8265037793236898%
+
+
+
+
+
+============================= VGG19-dp-1layer(s) =============================
+ Top-1 acc. => 0.69
+ Coverage => 3%
+ Average Precision (AP) => 0.8105181237137611%
+
+
+ ============================= VGG19-dp-2layer(s) =============================
+ Top-1 acc. => 0.77
+ Coverage => 54%
+ Average Precision (AP) => 0.6919831643560939%
+
+
+ ============================= VGG19-dp-3layer(s) =============================
+ Top-1 acc. => 0.82
+ Coverage => 64%
+ Average Precision (AP) => 0.6152902152469849%
+
+
+---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~
+
+
+============================= ResNet50-cl-1layer(s) =============================
+ Top-1 acc. => 0.51
+ Coverage => 0%
+ Average Precision (AP) => 0.44509945226763314%
+
+
+ ============================= ResNet50-cl-2layer(s) =============================
+ Top-1 acc. => 0.52
+ Coverage => 0%
+ Average Precision (AP) => 0.5719949910546765%
+
+
+ ============================= ResNet50-cl-3layer(s) =============================
+ Top-1 acc. => 0.5
+ Coverage => 99%
+ Average Precision (AP) => 0.6643653613914939%
+
+
+
+
+
+ ============================= ResNet50-dp-1layer(s) =============================
+ Top-1 acc. => 0.6
+ Coverage => 0%
+ Average Precision (AP) => 0.7359942609888032%
+
+============================= ResNet50-dp-2layer(s) =============================
+ Top-1 acc. => 0.42
+ Coverage => 1%
+ Average Precision (AP) => 0.5846424818121695%
+
+ ============================= ResNet50-dp-3layer(s) =============================
+ Top-1 acc. => 0.53
+ Coverage => 0%
+ Average Precision (AP) => 0.5482905143327813%
+
+
+---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~---~
+
+
+============================= VGG16_Places365-cl-1layer(s) =============================
+ Top-1 acc. => 0.59
+ Coverage => 71%
+ Average Precision (AP) => 0.8169642230481746%
+
+
+ ============================= VGG16_Places365-cl-2layer(s) =============================
+ Top-1 acc. => 0.54
+ Coverage => 44%
+ Average Precision (AP) => 0.6677467900746151%
+
+
+ ============================= VGG16_Places365-cl-3layer(s) =============================
+ Top-1 acc. => 0.67
+ Coverage => 0%
+ Average Precision (AP) => 0.6521301430604437%
+
+
+
+
+
+ ============================= VGG16_Places365-dp-1layer(s) =============================
+ Top-1 acc. => 0.64
+ Coverage => 3%
+ Average Precision (AP) => 0.6867269919769725%
+
+
+ ============================= VGG16_Places365-dp-2layer(s) =============================
+ Top-1 acc. => 0.8
+ Coverage => 49%
+ Average Precision (AP) => 0.632754438949631%
+
+
+ ============================= VGG16_Places365-dp-3layer(s) =============================
+ Top-1 acc. => 0.81
+ Coverage => 66%
+ Average Precision (AP) => 0.7588658647686364%
\ No newline at end of file
diff --git a/evaluation/VAD_predictions_results b/evaluation/VAD_predictions_results
new file mode 100644
index 0000000..01b3324
--- /dev/null
+++ b/evaluation/VAD_predictions_results
@@ -0,0 +1,58 @@
+------------------------ ResNet50 - euclidean loss ------------------------
+ Mean absolute error (MAE): 1.7944522668
+ Mean squared error (MSE): 4.92477120278
+Root mean squared error (RMSE): 2.21918255283
+Explained variance score (EVS): -1.27654424337
+ R^2 Score (R2 Score): -1.80123371249
+
+
+------------------------ VGG16 - euclidean loss ------------------------
+ Mean absolute error (MAE): 1.69004644761
+ Mean squared error (MSE): 4.51888541682
+Root mean squared error (RMSE): 2.12576701847
+Explained variance score (EVS): -1.42371232725
+ R^2 Score (R2 Score): -1.57036391159
+
+
+------------------------ VGG19 - euclidean loss ------------------------
+ Mean absolute error (MAE): 1.67684520113
+ Mean squared error (MSE): 4.35512476284
+Root mean squared error (RMSE): 2.08689356768
+Explained variance score (EVS): -1.18390084417
+ R^2 Score (R2 Score): -1.47721605846
+
+
+
+
+------------------------ VGG16_ResNet50 - euclidean loss ------------------------
+ Mean absolute error (MAE): 1.50535977136
+ Mean squared error (MSE): 3.55751174135
+Root mean squared error (RMSE): 1.88613672393
+Explained variance score (EVS): -0.892874333679
+ R^2 Score (R2 Score): -1.02352990872
+
+
+------------------------ VGG16_VGG19 - euclidean loss ------------------------
+ Mean absolute error (MAE): 1.46945273284
+ Mean squared error (MSE): 3.41380063821
+Root mean squared error (RMSE): 1.84764732517
+Explained variance score (EVS): -0.844251569992
+ R^2 Score (R2 Score): -0.941786337215
+
+
+------------------------ VGG19_ResNet50 - euclidean loss ------------------------
+ Mean absolute error (MAE): 1.58599179902
+ Mean squared error (MSE): 3.87913591802
+Root mean squared error (RMSE): 1.96955221257
+Explained variance score (EVS): -0.818387939215
+ R^2 Score (R2 Score): -1.20647129815
+
+
+
+
+------------------------ VGG19_ResNet50_VGG16 - euclidean loss ------------------------
+ Mean absolute error (MAE): 1.4612788618
+ Mean squared error (MSE): 3.34615663044
+Root mean squared error (RMSE): 1.82925029191
+Explained variance score (EVS): -0.714029774688
+ R^2 Score (R2 Score): -0.903310097972
\ No newline at end of file
diff --git a/evaluation/__init__.py b/evaluation/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/evaluation/displacenet_evaluator.py b/evaluation/displacenet_evaluator.py
new file mode 100644
index 0000000..9db5d03
--- /dev/null
+++ b/evaluation/displacenet_evaluator.py
@@ -0,0 +1,139 @@
+from __future__ import print_function
+import os
+
+from applications.hra_utils import prepare_input_data, predict_v2
+from applications.hra_vgg16 import HRA_VGG16
+from applications.hra_vgg19 import HRA_VGG19
+from applications.hra_resnet50 import HRA_ResNet50
+from applications.hra_vgg16_places365 import HRA_VGG16_Places365
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix, average_precision_score
+from keras.preprocessing import image
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+class DisplaceNetBaseEvaluator(object):
+ """Perfofmance metrics base class.
+ """
+
+
+ def __init__(self,
+ hra_model_backend_name,nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name,emotic_model_b_backend_name,emotic_model_c_backend_name,
+ violation_class,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ ):
+
+ self.hra_model_backend_name = hra_model_backend_name
+ self.nb_of_conv_layers_to_fine_tune = nb_of_conv_layers_to_fine_tune
+ self.emotic_model_a_backend_name = emotic_model_a_backend_name
+ self.emotic_model_b_backend_name = emotic_model_b_backend_name
+ self.emotic_model_c_backend_name = emotic_model_c_backend_name
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
+ self.y_true = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+
+ def _obtain_y_pred(self,
+ prob_threshold=0.75):
+
+ y_pred = []
+ y_scores = []
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ print('Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+
+ preds = displaceNet_inference(img_path=final_img,
+ emotic_model_a_backend_name=self.emotic_model_a_backend_name,
+ emotic_model_b_backend_name=self.emotic_model_b_backend_name,
+ emotic_model_c_backend_name=self.emotic_model_c_backend_name,
+ hra_model_backend_name=self.hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers=self.nb_of_conv_layers_to_fine_tune,
+ violation_class=self.violation_class)
+
+
+ preds = preds[0]
+
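+                # The indexing below assumes `preds` is now a sequence of (class_index, class_label,
+                # probability) entries sorted by descending probability, so `preds[0]` is the top-1 prediction.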
+ y_pred.append(int(preds[0][0]))
+ y_scores.append(preds[0][2])
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ print ('`' + hra_class + '/' + raw_img + '` ===> `' +
+ top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(hra_class)
+
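+        # Coverage: the percentage of test images whose top-1 probability reached `prob_threshold`.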
+ total_coverage_per = (coverage_count * 100) / self.total_nb_of_test_images
+
+ return y_pred, self.y_true, y_scores, total_coverage_per
+
+
+
+if __name__ == "__main__":
+
+ violation_class = 'cl'
+ hra_model_backend_name = 'VGG16'
+ nb_of_conv_layers_to_fine_tune = 1
+
+ emotic_model_a_backend_name = 'VGG19'
+ emotic_model_b_backend_name = 'VGG16'
+ emotic_model_c_backend_name = None
+
+ model_backend_name = 'VGG16'
+
+ main_test_dir = '/home/gkallia/git/AbuseNet/datasets/HRA-2clas-full-test/ChildLabour'
+
+ # ---------------------------------------------------- #
+
+
+
+
+
+ base_evaluator = DisplaceNetBaseEvaluator(hra_model_backend_name=hra_model_backend_name,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ emotic_model_a_backend_name=emotic_model_a_backend_name,
+ emotic_model_b_backend_name=emotic_model_b_backend_name,
+ emotic_model_c_backend_name=emotic_model_c_backend_name,
+ violation_class=violation_class,
+ main_test_dir =main_test_dir,
+ )
+
+ y_pred, y_true, y_scores, total_coverage_per = base_evaluator._obtain_y_pred()
+
+ # print y_true
+ top1_acc = accuracy_score(y_true, y_pred)
+
+ AP = average_precision_score(y_true, y_scores, 'micro')
+
+
+ string = model_backend_name+'-'+violation_class+'-'+str(nb_of_conv_layers_to_fine_tune)+'layer(s)'
+
+ print('\n')
+ print( '============================= %s =============================' %string)
+ print(' Top-1 acc. => ' + str(top1_acc))
+ print(' Coverage => ' + str(total_coverage_per) + '%')
+ print(' Average Precision (AP) => ' + str(AP) + '%')
\ No newline at end of file
diff --git a/evaluation/evaluate_HRA_applications.py b/evaluation/evaluate_HRA_applications.py
new file mode 100644
index 0000000..aeba6f3
--- /dev/null
+++ b/evaluation/evaluate_HRA_applications.py
@@ -0,0 +1,123 @@
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix,average_precision_score
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+from handcrafted_metrics import HRA_metrics, plot_confusion_matrix
+
+from applications.hra_resnet50 import HRA_ResNet50
+from applications.hra_vgg16 import HRA_VGG16
+from applications.hra_vgg19 import HRA_VGG19
+from applications.hra_vgg16_places365 import HRA_VGG16_Places365
+
+# from applications.latest.hra_vgg16_checkpoint import HRA_VGG16
+# from applications.latest.hra_vgg16_places365 import HRA_VGG16_Places365
+# from applications.latest.compoundNet_vgg16_checkpoint import CompoundNet_VGG16
+
+
+def _obtain_model(model_backend_name,
+ violation_class,
+ nb_of_conv_layers_to_fine_tune):
+
+ if model_backend_name == 'VGG16':
+ model = HRA_VGG16(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+
+
+ elif model_backend_name == 'VGG19':
+ model = HRA_VGG19(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+
+ elif model_backend_name == 'ResNet50':
+ model = HRA_ResNet50(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+
+ elif model_backend_name == 'VGG16_Places365':
+ model = HRA_VGG16_Places365(weights='HRA',
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+ return model
+
+
+violation_class = 'dp'
+model_backend_name = 'VGG16'
+nb_of_conv_layers_to_fine_tune = 2
+
+model = _obtain_model(model_backend_name=model_backend_name,
+ violation_class=violation_class,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+
+
+metrics = HRA_metrics(main_test_dir ='/home/sandbox/Desktop/Two-class-HRV/ChildLabour/test')
+
+[y_true, y_pred, y_score] = metrics.predict_labels(model)
+
+
+print(y_true)
+print(y_pred)
+print(y_score)
+
+
+# print y_true
+top1_acc = accuracy_score(y_true, y_pred)
+
+# top5_acc = top_k_accuracy_score(y_true=y_true, y_pred=y_pred,k=3,normalize=True)
+coverage = metrics.coverage(model,prob_threshold=0.85)
+# coverage = metrics.coverage_duo_ensemble(model_a,model_b,prob_threshold=0.85)
+
+
+# AP = average_precision_score (y_true = y_true, y_score=y_score)
+#
+# print AP
+
+
+
+print ('\n')
+print ('=======================================================================================================')
+print (model_backend_name+' Top-1 acc. => '+str(top1_acc))
+print (model_backend_name+' Coverage => '+str(coverage)+'%')
+
+#
+#
+# target_names = ['arms', 'child_labour', 'child_marriage', 'detention_centres', 'disability_rights', 'displaced_populations',
+# 'environment', 'no_violation', 'out_of_school']
+#
+# result= model_backend_name+' => '+ str(accuracy_score(y_true, y_pred))+ '\n'
+# result= model_backend_name+' => '+str(coverage)+'%'+ '\n'
+#
+#
+# f=open("results/coverage_late_fusion.txt", "a+")
+# f.write(result+'\n\n')
+# # f.write(str(y_pred)+'\n\n')
+# f.close()
+#
+# print(classification_report(y_true, y_pred, target_names=target_names))
+#
+# print (precision_score(y_true, y_pred, average=None))
+#
+# cnf_matrix=confusion_matrix(y_true, y_pred)
+# np.set_printoptions(precision=2)
+#
+# # Plot non-normalized confusion matrix
+# plt.figure()
+# plot_confusion_matrix(cnf_matrix, classes=target_names,
+# title='Confusion matrix, without normalization')
+#
+# # Plot normalized confusion matrix
+# plt.figure()
+# plot_confusion_matrix(cnf_matrix, classes=target_names, normalize=True,
+# title='Normalized confusion matrix')
+#
+# plt.show()
+#
+#
+# print (cnf_matrix.diagonal()/cnf_matrix.sum(axis=1))
\ No newline at end of file
diff --git a/evaluation/evaluate_VAD_predictions.py b/evaluation/evaluate_VAD_predictions.py
new file mode 100644
index 0000000..09fdab2
--- /dev/null
+++ b/evaluation/evaluate_VAD_predictions.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+""" Evaluates the estimated target values of either a single classifier or ensemble of classifiers
+ model using different regression metrics.
+
+# Reference
+- [3.3.4. Regression metrics](http://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics)
+"""
+
+from __future__ import print_function
+import numpy as np
+from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
+
+# for printing purposes only
+classifier_name = 'VGG19_ResNet50_VGG16 - euclidean loss'
+
+# remains constant
+y_true = np.load('/home/sandbox/Desktop/EMOTIC_resources/VAD-regression/numpy_matrices/Y_train/y_test.npy')
+
+y_predicted = np.load('/home/sandbox/Desktop/VGG19_ResNet50_VGG16_y_predicted.npy')
+
+
+# reference http://joshlawman.com/metrics-regression/
+MSE = mean_squared_error(y_true = y_true, y_pred=y_predicted)
+RMSE = np.sqrt(MSE)
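+# RMSE is the square root of MSE, so it is expressed in the same units as the VAD targets themselves.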
+r2 = r2_score(y_true = y_true, y_pred=y_predicted)
+explained_var = explained_variance_score(y_true = y_true, y_pred=y_predicted)
+MAE = mean_absolute_error(y_true = y_true, y_pred=y_predicted)
+
+
+
+print ('------------------------ ', classifier_name, '------------------------ ')
+print (' Mean absolute error (MAE): ', MAE)
+print (' Mean squared error (MSE): ', MSE) # closer to zero are better.
+print ('Root mean squared error (RMSE): ', RMSE)
+print ('Explained variance score (EVS): ', explained_var) # best possible score is 1.0
+print (' R^2 Score (R2 Score): ', r2) # best possible score is 1.0 and it can be negative
+
diff --git a/evaluation/handcrafted_metrics.py b/evaluation/handcrafted_metrics.py
new file mode 100644
index 0000000..c3aff15
--- /dev/null
+++ b/evaluation/handcrafted_metrics.py
@@ -0,0 +1,296 @@
+import os
+import itertools
+import numpy as np
+from keras.preprocessing import image
+
+from applications.hra_utils import predict
+import matplotlib.pyplot as plt
+
+
+
+class HRA_metrics():
+ """Perfofmance metrics base class.
+ """
+ def __init__(self,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ violation_class = 'cl'
+ ):
+
+
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
+
+
+
+ def predict_labels(self,
+ model):
+ """Computes the predicted and ground truth labels, as returned by a single classifier.
+
+ # Arguments
+            model: Keras model for which we want to predict the labels.
+
+ # Returns
+ y_true : 1d array-like containing the ground truth (correct) labels.
+ y_pred : 1d array-like containing the predicted labels, as returned by a classifier.
+ """
+ y_pred = []
+ y_true= []
+ y_score = []
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+ if self.violation_class == 'cl':
+ for raw_img in img_names:
+
+ if hra_class == 'child_labour':
+ true_label = 0
+ elif hra_class == 'no_child_labour':
+ true_label = 1
+
+ y_true.append(true_label)
+
+ elif self.violation_class == 'dp':
+ for raw_img in img_names:
+
+ if hra_class == 'displaced_populations':
+ true_label = 0
+ elif hra_class == 'no_displaced_populations':
+ true_label = 1
+
+ y_true.append(true_label)
+
+ for raw_img in img_names:
+
+ # variable that contains the final image to be loaded
+ print ('Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+ img = image.load_img(final_img, target_size=(224, 224))
+
+
+
+ preds = predict(violation_class=self.violation_class,
+ model=model,
+ img=img,
+ target_size=(224, 224))
+
+                print(preds)
+
+                y_pred.append(int(preds[0][0]))
+                y_score.append(preds[0][2])
+
+
+        # print(y_pred)
+
+        return y_true, y_pred, y_score
+
+
+
+ # def predict_labels_KNeighborsClassifier(self,
+ # classifier):
+ # """Computes the predicted and ground truth labels, as returned by a single classifier.
+ #
+ # # Arguments
+ # model = keras model for which we want to predict the labels.
+ #
+ # # Returns
+ # y_true : 1d array-like containing the ground truth (correct) labels.
+ # y_pred : 1d array-like containing the predicted labels, as returned by a classifier.
+ # """
+ # y_pred = []
+ # y_true= []
+ # y_score = []
+ #
+ # for hra_class in self.sorted_categories_names:
+ #
+ # # variable that contains the main dir alongside the selected category
+ # tmp = os.path.join(self.main_test_dir, hra_class)
+ # img_names = sorted(os.listdir(tmp))
+ #
+ # for raw_img in img_names:
+ #
+ # if hra_class == 'arms':
+ # true_label = 0
+ # elif hra_class == 'child_labour':
+ # true_label = 1
+ # elif hra_class == 'child_marriage':
+ # true_label = 2
+ # elif hra_class == 'detention_centres':
+ # true_label = 3
+ # elif hra_class == 'disability_rights':
+ # true_label = 4
+ # elif hra_class == 'displaced_populations':
+ # true_label = 5
+ # elif hra_class == 'environment':
+ # true_label = 6
+ # elif hra_class == 'no_violation':
+ # true_label = 7
+ # elif hra_class == 'out_of_school':
+ # true_label = 8
+ #
+ # y_true.append(true_label)
+ #
+ #
+ # # variable that contains the final image to be loaded
+ # print ('Processing [' + raw_img + ']')
+ # final_img = os.path.join(tmp, raw_img)
+ # img = image.load_img(final_img, target_size=(224, 224))
+ #
+ # preds = predict(model, img, target_size)
+ #
+ # y_pred.append(int(preds[0][0]))
+ # y_score.append(int(preds[0][2]))
+ #
+ #
+ # print y_pred
+ #
+ # return y_true, y_pred, y_score
+ #
+
+
+
+ def coverage(self,
+ model,
+ prob_threshold = 0.75):
+ """Coverage is the fraction of examples for which the ML system is able to produce a response.
+ """
+
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for category in self.sorted_categories_names:
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, category)
+ img_names = sorted(os.listdir(tmp))
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ final_img = os.path.join(tmp, raw_img)
+
+
+                img = image.load_img(final_img, target_size=(224, 224))
+
+                # call `predict` the same way as in `predict_labels` above (the PIL image is passed directly)
+                preds = predict(violation_class=self.violation_class,
+                                model=model,
+                                img=img,
+                                target_size=(224, 224))
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ print ('`' + category + '/' + raw_img + '` ===> `' +
+ top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(category)
+
+ total_coverage_per = (coverage_count * 100) / self.total_nb_of_test_images
+
+ return total_coverage_per
+
+
+
+def top_k_accuracy_score(y_true, y_pred, k=5, normalize=True):
+ """Top k Accuracy classification score.
+ For multiclass classification tasks, this metric returns the
+ number of times that the correct class was among the top k classes
+ predicted.
+ Parameters
+ ----------
+ y_true : 1d array-like, or class indicator array / sparse matrix
+ shape num_samples or [num_samples, num_classes]
+ Ground truth (correct) classes.
+ y_pred : array-like, shape [num_samples, num_classes]
+ For each sample, each row represents the
+ likelihood of each possible class.
+ The number of columns must be at least as large as the set of possible
+ classes.
+ k : int, optional (default=5) predictions are counted as correct if
+ probability of correct class is in the top k classes.
+ normalize : bool, optional (default=True)
+ If ``False``, return the number of top k correctly classified samples.
+ Otherwise, return the fraction of top k correctly classified samples.
+ Returns
+ -------
+ score : float
+ If ``normalize == True``, return the proportion of top k correctly
+ classified samples, (float), else it returns the number of top k
+ correctly classified samples (int.)
+ The best performance is 1 with ``normalize == True`` and the number
+ of samples with ``normalize == False``.
+ See also
+ --------
+ accuracy_score
+ Notes
+ -----
+ If k = 1, the result will be the same as the accuracy_score (though see
+ note below). If k is the same as the number of classes, this score will be
+ perfect and meaningless.
+ In cases where two or more classes are assigned equal likelihood, the
+ result may be incorrect if one of those classes falls at the threshold, as
+ one class must be chosen to be the nth class and the class chosen may not
+ be the correct one.
+ """
+ if len(y_true.shape) == 2:
+ y_true = np.argmax(y_true, axis=1)
+
+ num_obs, num_labels = y_pred.shape
+ idx = num_labels - k - 1
+ counter = 0
+ argsorted = np.argsort(y_pred, axis=1)
+ for i in range(num_obs):
+ if y_true[i] in argsorted[i, idx + 1:]:
+ counter += 1
+ if normalize:
+        return counter / float(num_obs)  # float division so the fraction is not truncated under Python 2
+ else:
+ return counter
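+
+# Minimal usage sketch for `top_k_accuracy_score` (illustrative values, assuming a 3-class problem):
+#
+#   y_true = np.array([0, 2, 0])
+#   y_pred = np.array([[0.7, 0.2, 0.1],
+#                      [0.1, 0.3, 0.6],
+#                      [0.2, 0.5, 0.3]])
+#   top_k_accuracy_score(y_true, y_pred, k=2)   # -> 0.67: the first two samples have their true
+#                                               #    class inside the top-2 predictions, the third does not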
+
+
+
+def plot_confusion_matrix(cm, classes,
+ normalize=False,
+ title='Confusion matrix',
+ cmap=plt.cm.Blues):
+ """
+ This function prints and plots the confusion matrix.
+ Normalization can be applied by setting `normalize=True`.
+ """
+ if normalize:
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+ print("Normalized confusion matrix")
+ else:
+ print('Confusion matrix, without normalization')
+
+ print(cm)
+
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.title(title)
+ plt.colorbar()
+ tick_marks = np.arange(len(classes))
+ plt.xticks(tick_marks, classes, rotation=45)
+ plt.yticks(tick_marks, classes)
+
+ fmt = '.2f' if normalize else 'd'
+ thresh = cm.max() / 2.
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+ plt.text(j, i, format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="white" if cm[i, j] > thresh else "black")
+
+ plt.tight_layout()
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
diff --git a/evaluation/hra_evaluator.py b/evaluation/hra_evaluator.py
new file mode 100644
index 0000000..210b0ee
--- /dev/null
+++ b/evaluation/hra_evaluator.py
@@ -0,0 +1,156 @@
+from __future__ import print_function
+import os
+
+from applications.hra_utils import prepare_input_data, predict_v2
+from applications.hra_vgg16 import HRA_VGG16
+from applications.hra_vgg19 import HRA_VGG19
+from applications.hra_resnet50 import HRA_ResNet50
+from applications.hra_vgg16_places365 import HRA_VGG16_Places365
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix, average_precision_score
+from keras.preprocessing import image
+
+class BaseEvaluator(object):
+ """Perfofmance metrics base class.
+ """
+
+
+ def __init__(self,
+ model,
+ model_backend_name,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ violation_class = 'cl'
+ ):
+
+ self.model = model
+ self.model_backend_name = model_backend_name
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
+ self.y_true = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+
+ def _obtain_y_pred(self,
+ prob_threshold=0.75):
+
+ y_pred = []
+ y_scores = []
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ print('Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+
+ img = image.load_img(final_img, target_size=(224, 224))
+
+ # if self.model_backend_name == 'VGG16':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='objects')
+ #
+ # elif self.model_backend_name == 'VGG19':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='objects')
+ #
+ # elif self.model_backend_name == 'ResNet50':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='objects')
+ #
+ # elif self.model_backend_name == 'VGG16_Places365':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='places')
+
+
+ preds = predict_v2(violation_class=self.violation_class,
+ model=self.model,
+ img=img,
+ target_size=(224, 224))
+
+ y_pred.append(int(preds[0][0]))
+ y_scores.append(preds[0][2])
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ print ('`' + hra_class + '/' + raw_img + '` ===> `' +
+ top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(hra_class)
+
+ total_coverage_per = (coverage_count * 100) / self.total_nb_of_test_images
+
+ return y_pred, self.y_true, y_scores, total_coverage_per
+
+
+
+if __name__ == "__main__":
+
+ model_backend_name = 'VGG16'
+ nb_of_conv_layers_to_fine_tune = 1
+ violation_class = 'cl'
+
+ # ---------------------------------------------------- #
+
+
+
+ if violation_class == 'cl':
+ main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test/ChildLabour'
+ elif violation_class =='dp':
+ main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test/DisplacedPopulations'
+
+ if model_backend_name == 'ResNet50':
+ model = HRA_ResNet50(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+ elif model_backend_name == 'VGG16':
+ model = HRA_VGG16(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+ elif model_backend_name == 'VGG19':
+ model = HRA_VGG19(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+ elif model_backend_name == 'VGG16_Places365':
+ model = HRA_VGG16_Places365(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+
+
+
+
+ base_evaluator = BaseEvaluator(model=model,
+ model_backend_name=model_backend_name,
+ main_test_dir=main_test_dir,
+ violation_class=violation_class
+ )
+
+ y_pred, y_true, y_scores, total_coverage_per = base_evaluator._obtain_y_pred()
+
+ # print y_true
+ top1_acc = accuracy_score(y_true, y_pred)
+
+    AP = average_precision_score(y_true, y_scores, average='micro')
+
+
+ string = model_backend_name+'-'+violation_class+'-'+str(nb_of_conv_layers_to_fine_tune)+'layer(s)'
+
+ print('\n')
+ print( '============================= %s =============================' %string)
+ print(' Top-1 acc. => ' + str(top1_acc))
+ print(' Coverage => ' + str(total_coverage_per) + '%')
+ print(' Average Precision (AP) => ' + str(AP) + '%')
\ No newline at end of file
diff --git a/evaluator.sh b/evaluator.sh
new file mode 100644
index 0000000..6a648c2
--- /dev/null
+++ b/evaluator.sh
@@ -0,0 +1,20 @@
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 1 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 1 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 2 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 1 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 3 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 1 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 4 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 1 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 5 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 1 --emotic_model_a_backend_name VGG16
+
+
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 1 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 2 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 2 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 2 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 3 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 2 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 4 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 2 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 5 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 2 --emotic_model_a_backend_name VGG16
+
+
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 1 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 3 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 2 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 3 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 3 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 3 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 4 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 3 --emotic_model_a_backend_name VGG16
+python3 abusenet_evaluator_v2.py --violation_class dp --fold_number 5 --hra_model_backend_name VGG16_Places365 --nb_of_conv_layers 3 --emotic_model_a_backend_name VGG16
+
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/run_DisplaceNet.py b/examples/run_DisplaceNet.py
new file mode 100644
index 0000000..c6224e7
--- /dev/null
+++ b/examples/run_DisplaceNet.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+'''
+
+'''
+from __future__ import print_function
+import os
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+
+img_path = '/home/sandbox/Desktop/HRA-2clas-full-test/DisplacedPopulations/displaced_populations/displaced_populations_0000.jpg'
+violation_class = 'dp'
+hra_model_backend_name = 'VGG16'
+nb_of_conv_layers_to_fine_tune = 1
+
+emotic_model_a_backend_name = 'VGG16'
+emotic_model_b_backend_name = None
+emotic_model_c_backend_name = None
+
+
+DisplaceNet_preds = displaceNet_inference(img_path=img_path,
+ emotic_model_a_backend_name=emotic_model_a_backend_name,
+ emotic_model_b_backend_name=emotic_model_b_backend_name,
+ emotic_model_c_backend_name=emotic_model_c_backend_name,
+ hra_model_backend_name=hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+
+print (DisplaceNet_preds)
+print (DisplaceNet_preds[0])
\ No newline at end of file
diff --git a/hdf5_creation_example.py b/hdf5_creation_example.py
new file mode 100644
index 0000000..42191fa
--- /dev/null
+++ b/hdf5_creation_example.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+""" Creates a single HDF5 file containing a large number of images and their respective annotations (from EMOTIC Dataset).
+"""
+from preprocessing.emotic.hdf5_controller import Controller
+
+
+
+hdf5_file_name = 'EMOTIC-VAD-regression.hdf5'
+
+train_csv_file_path ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/CSV/train.csv'
+val_csv_file_path ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/CSV/val.csv'
+test_csv_file_path ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/CSV/test.csv'
+
+cropped_imgs_dir ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/cropped_imgs/'
+entire_imgs_dir = '/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/entire_multiple_imgs/'
+main_numpy_dir ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/VAD-regression/numpy_matrices/'
+
+controller = Controller(hdf5_file=hdf5_file_name,
+ train_csv_file_path=train_csv_file_path,
+ val_csv_file_path=val_csv_file_path,
+ test_csv_file_path=test_csv_file_path,
+ cropped_imgs_dir=cropped_imgs_dir,
+ entire_imgs_dir=entire_imgs_dir,
+ main_numpy_dir=main_numpy_dir)
+
+
+create_hdf5 = controller.create_hdf5_VAD_regression(dataset='EMOTIC', input_size=224)
diff --git a/hra_evaluator.py b/hra_evaluator.py
new file mode 100644
index 0000000..ec34fa0
--- /dev/null
+++ b/hra_evaluator.py
@@ -0,0 +1,163 @@
+from __future__ import print_function
+import os
+
+from applications.hra_utils import prepare_input_data, predict_v2
+from applications.hra_vgg16 import HRA_VGG16
+from applications.hra_vgg19 import HRA_VGG19
+from applications.hra_resnet50 import HRA_ResNet50
+from applications.hra_vgg16_places365 import HRA_VGG16_Places365
+from sklearn.metrics import accuracy_score, classification_report, precision_score, confusion_matrix, average_precision_score
+from keras.preprocessing import image
+
+class BaseEvaluator(object):
+ """Perfofmance metrics base class.
+ """
+
+
+ def __init__(self,
+ model,
+ model_backend_name,
+ main_test_dir ='/home/sandbox/Desktop/Human_Rights_Archive_DB/test',
+ violation_class = 'cl'
+ ):
+
+ self.model = model
+ self.model_backend_name = model_backend_name
+ self.main_test_dir = main_test_dir
+ self.total_nb_of_test_images = sum([len(files) for r, d, files in os.walk(main_test_dir)])
+ self.sorted_categories_names = sorted(os.listdir(main_test_dir))
+ self.violation_class = violation_class
+ self.y_true = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,]
+
+
+
+
+ def _obtain_y_pred(self,
+ prob_threshold=0.75):
+
+ y_pred = []
+ y_scores = []
+
+ predicted_class_list = []
+ actual_class_list = []
+ coverage_count = 0
+
+ for hra_class in self.sorted_categories_names:
+
+ # variable that contains the main dir alongside the selected category
+ tmp = os.path.join(self.main_test_dir, hra_class)
+ img_names = sorted(os.listdir(tmp))
+
+ for raw_img in img_names:
+ # variable that contains the final image to be loaded
+ print('Processing [' + raw_img + ']')
+ final_img = os.path.join(tmp, raw_img)
+
+ img = image.load_img(final_img, target_size=(224, 224))
+
+ # if self.model_backend_name == 'VGG16':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='objects')
+ #
+ # elif self.model_backend_name == 'VGG19':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='objects')
+ #
+ # elif self.model_backend_name == 'ResNet50':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='objects')
+ #
+ # elif self.model_backend_name == 'VGG16_Places365':
+ # x = prepare_input_data(img_path=final_img, objects_or_places_flag='places')
+
+
+ preds = predict_v2(violation_class=self.violation_class,
+ model=self.model,
+ img=img,
+ target_size=(224, 224))
+
+ y_pred.append(int(preds[0][0]))
+ y_scores.append(preds[0][2])
+
+ top_1_predicted_probability = preds[0][2]
+
+ # top_1_predicted = np.argmax(preds)
+ top_1_predicted_label = preds[0][1]
+
+ if top_1_predicted_probability >= prob_threshold:
+ coverage_count += 1
+
+ print ('`' + hra_class + '/' + raw_img + '` ===> `' +
+ top_1_predicted_label + '`' + ' with ' + str(top_1_predicted_probability) + ' P')
+
+ predicted_class_list.append(top_1_predicted_label)
+ actual_class_list.append(hra_class)
+
+ total_coverage_per = (coverage_count * 100) / self.total_nb_of_test_images
+
+ return y_pred, self.y_true, y_scores, total_coverage_per
+
+
+
+if __name__ == "__main__":
+
+ model_backend_name = 'VGG16'
+ nb_of_conv_layers_to_fine_tune = 3
+ violation_class = 'dp'
+
+ # ---------------------------------------------------- #
+
+
+
+ if violation_class == 'cl':
+ main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test/ChildLabour'
+ elif violation_class =='dp':
+ main_test_dir = '/home/sandbox/Desktop/HRA-2clas-full-test/DisplacedPopulations'
+
+ if model_backend_name == 'ResNet50':
+ model = HRA_ResNet50(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+ elif model_backend_name == 'VGG16':
+ model = HRA_VGG16(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+ elif model_backend_name == 'VGG19':
+ model = HRA_VGG19(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+ elif model_backend_name == 'VGG16_Places365':
+ model = HRA_VGG16_Places365(include_top=True, weights='HRA',
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune,
+ violation_class=violation_class)
+
+
+
+
+ base_evaluator = BaseEvaluator(model=model,
+ model_backend_name=model_backend_name,
+ main_test_dir=main_test_dir,
+ violation_class=violation_class
+ )
+
+ y_pred, y_true, y_scores, total_coverage_per = base_evaluator._obtain_y_pred()
+
+ # print y_true
+ top1_acc = accuracy_score(y_true, y_pred)
+
+    AP = average_precision_score(y_true, y_scores, average='micro')
+
+
+ string = model_backend_name+'-'+violation_class+'-'+str(nb_of_conv_layers_to_fine_tune)+'layer(s)'
+
+ print('\n')
+ print( '============================= %s =============================' %string)
+ print(' Top-1 acc. => ' + str(top1_acc))
+ print(' Coverage => ' + str(total_coverage_per) + '%')
+ print(' Average Precision (AP) => ' + str(AP) + '%')
\ No newline at end of file
diff --git a/inference/__init__.py b/inference/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/inference/categories_HRA_2classCL.txt b/inference/categories_HRA_2classCL.txt
new file mode 100644
index 0000000..05e5540
--- /dev/null
+++ b/inference/categories_HRA_2classCL.txt
@@ -0,0 +1,2 @@
+/c/child_labour 0
+/n/no_child_labour 1
diff --git a/inference/categories_HRA_2classDP.txt b/inference/categories_HRA_2classDP.txt
new file mode 100644
index 0000000..b9daf53
--- /dev/null
+++ b/inference/categories_HRA_2classDP.txt
@@ -0,0 +1,2 @@
+/d/displaced_populations 0
+/n/no_displaced_populations 1
diff --git a/inference/displacenet_single_image_inference_unified.py b/inference/displacenet_single_image_inference_unified.py
new file mode 100644
index 0000000..b14a258
--- /dev/null
+++ b/inference/displacenet_single_image_inference_unified.py
@@ -0,0 +1,182 @@
+# -*- coding: utf-8 -*-
+'''
+
+'''
+
+from __future__ import print_function
+from engine.human_centric_branch.global_emotional_traits_branch import single_img_VAD_inference_return_only
+from engine.displaced_people_branch.single_image_inference_hra_2class import single_img_HRA_inference_return_only
+from utils.inference_utils import _obtain_emotional_traits_calibrated_predictions
+from applications.hra_utils import decode_predictions
+
+from keras.preprocessing import image
+from scipy.misc import imread
+from matplotlib import pyplot as plt
+
+def displaceNet_inference(img_path,
+ emotic_model_a_backend_name,
+ emotic_model_b_backend_name,
+ emotic_model_c_backend_name,
+ hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers,
+ violation_class):
+
+
+
+
+
+
+ # obtain global emotional traits as VAD values
+ global_valence, global_dominance = single_img_VAD_inference_return_only(img_path=img_path,
+ object_detector_backend='RetinaNet',
+ model_a_backend_name=emotic_model_a_backend_name,
+ model_b_backend_name=emotic_model_b_backend_name,
+ model_c_backend_name=emotic_model_c_backend_name,
+ )
+
+ raw_HRA_preds = single_img_HRA_inference_return_only(img_path=img_path,
+ violation_class=violation_class,
+ model_backend_name=hra_model_backend_name,
+ nb_of_conv_layers_to_fine_tune=nb_of_fine_tuned_conv_layers)
+
+ # plain_preds = decode_predictions(violation_class=violation_class,
+ # preds=raw_HRA_preds,
+ # top=2)
+
+ # # Uncomment for extra verbosity
+ # plain_predicted_probability = plain_preds[0][0][2]
+ #
+ # plain_predicted_label = plain_preds[0][0][1]
+ #
+ # print ('\n')
+ #
+ # print('[INFO] Plain predictions: ',plain_predicted_label, '->', plain_predicted_probability)
+ #
+ # print ('Global dominance: ', global_dominance)
+
+ # Case where no people were detected
+ if global_dominance == 0:
+ # print ('No people were detected!')
+
+
+ final_preds = decode_predictions(violation_class=violation_class,
+ preds=raw_HRA_preds,
+ top=2)
+
+
+
+ else:
+
+ calibrated_preds = _obtain_emotional_traits_calibrated_predictions(emotional_trait=global_dominance,
+ raw_preds=raw_HRA_preds)
+
+
+ final_preds = decode_predictions(violation_class=violation_class,
+ preds=calibrated_preds,
+ top=2)
+
+
+
+
+ # # Uncomment for extra verbosity
+ # calibrated_predicted_probability = final_preds[0][0][2]
+ #
+ # calibrated_predicted_label = final_preds[0][0][1]
+ #
+ # print('[INFO] Calibrated predictions: ', calibrated_predicted_label, '->', calibrated_predicted_probability)
+
+
+ return final_preds
+
+
+
+if __name__ == "__main__":
+
+
+ img_path = '/home/sandbox/Desktop/Testing Images/human_right_viol_1.jpg'
+ violation_class = 'cl'
+ hra_model_backend_name = 'VGG16'
+ nb_of_fine_tuned_conv_layers = 1
+
+ emotic_model_a_backend_name = 'VGG16'
+ emotic_model_b_backend_name = None
+ emotic_model_c_backend_name = None
+
+
+ final_preds = displaceNet_inference(img_path,
+ emotic_model_a_backend_name,
+ emotic_model_b_backend_name,
+ emotic_model_c_backend_name,
+ hra_model_backend_name,
+ nb_of_fine_tuned_conv_layers,
+ violation_class)
+
+ print (final_preds)
+
+ img = image.load_img(img_path, target_size=(224, 224))
+
+ # plot_preds(violation_class, img, raw_preds[0])
+
+
+ numpy_img_path = imread(img_path)
+ plt.figure(figsize=(10, 12))
+ plt.imshow(numpy_img_path)
+
+ current_axis = plt.gca()
+
+ # configure colours for bounding box and text
+ violation_bounding_box_colour_rgbvar = (255, 3, 62)
+ violation_bounding_box_colour_rgbvar2 = [x / 255.0 for x in violation_bounding_box_colour_rgbvar]
+
+ no_violation_bounding_box_colour_rgbvar = (34, 139, 34)
+ no_violation_bounding_box_colour_rgbvar2 = [x / 255.0 for x in no_violation_bounding_box_colour_rgbvar]
+
+
+ overlayed_text = str(final_preds[0][0][1]) + ' (' + str(round(final_preds[0][0][2], 2)) + ')'
+
+ if violation_class == 'dp':
+ if final_preds[0][0][1] == 'displaced_populations':
+ current_axis.text(0, 0, overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ elif final_preds[0][0][1] == 'no_displaced_populations':
+ current_axis.text(0, 0, overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ else:
+ if final_preds[0][0][1] == 'child_labour':
+ current_axis.text(0, 0, overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ elif final_preds[0][0][1] == 'no_child_labour':
+ current_axis.text(0, 0, overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+
+
+
+
+
+ plt.axis('off')
+ plt.show()
+
+
+ # img_path = '/home/sandbox/Desktop/Testing Images/camping.jpg'
+ # violation_class = 'dp'
+ # hra_model_backend_name = 'VGG16'
+ # nb_of_fine_tuned_conv_layers = 1
+ #
+ # emotic_model_a_backend_name = 'VGG16'
+ # emotic_model_b_backend_name = 'VGG19'
+ # emotic_model_c_backend_name = 'ResNet50'
+ #
+ #
+ # final_preds = displaceNet_inference(img_path,
+ # emotic_model_a_backend_name,
+ # emotic_model_b_backend_name,
+ # emotic_model_c_backend_name,
+ # hra_model_backend_name,
+ # nb_of_fine_tuned_conv_layers,
+ # violation_class)
+ #
+ # print (final_preds)
\ No newline at end of file
diff --git a/inference/displacenet_vs_sole_classifier.py b/inference/displacenet_vs_sole_classifier.py
new file mode 100644
index 0000000..90c1cd3
--- /dev/null
+++ b/inference/displacenet_vs_sole_classifier.py
@@ -0,0 +1,115 @@
+from __future__ import print_function
+from engine.displaced_people_branch.single_image_inference_hra_2class import single_img_HRA_inference
+
+
+from keras.preprocessing import image
+from scipy.misc import imread
+from matplotlib import pyplot as plt
+
+from inference.displacenet_single_image_inference_unified import displaceNet_inference
+
+
+img_path = '/home/sandbox/Desktop/a9e9e3527e55ccc4735af804a0807009.jpg'
+violation_class = 'dp'
+hra_model_backend_name = 'VGG19'
+nb_of_conv_layers_to_fine_tune = 1
+
+raw_preds, sole_classifier_overlayed_text, top_1_predicted_label = single_img_HRA_inference(img_path=img_path,
+ violation_class=violation_class,
+ model_backend_name=hra_model_backend_name,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+
+img = image.load_img(img_path, target_size=(224, 224))
+print ('Vanilla CNN prediction: ', raw_preds[0])
+
+
+emotic_model_a_backend_name = 'VGG19'
+emotic_model_b_backend_name = None
+emotic_model_c_backend_name = None
+
+final_preds = displaceNet_inference(img_path,
+ emotic_model_a_backend_name,
+ emotic_model_b_backend_name,
+ emotic_model_c_backend_name,
+ hra_model_backend_name,
+ nb_of_conv_layers_to_fine_tune,
+ violation_class)
+
+
+
+print('DisplaceNet prediction: ', final_preds)
+
+numpy_img_path = imread(img_path)
+plt.figure(figsize=(10, 12))
+plt.imshow(numpy_img_path)
+
+current_axis = plt.gca()
+
+# configure colours for bounding box and text
+violation_bounding_box_colour_rgbvar = (255, 3, 62)
+violation_bounding_box_colour_rgbvar2 = [x / 255.0 for x in violation_bounding_box_colour_rgbvar]
+
+no_violation_bounding_box_colour_rgbvar = (34, 139, 34)
+no_violation_bounding_box_colour_rgbvar2 = [x / 255.0 for x in no_violation_bounding_box_colour_rgbvar]
+
+abusenet_overlayed_text = str(final_preds[0][0][1]) + ' (' + str(round(final_preds[0][0][2], 2)) + ')'
+
+# print (abusenet_overlayed_text)
+
+abusenet_overlayed_text = 'DisplaceNet: '+abusenet_overlayed_text
+sole_classifier_overlayed_text = 'Vanilla CNN: '+ sole_classifier_overlayed_text
+
+
+if violation_class == 'dp':
+ if final_preds[0][0][1] == 'displaced_populations':
+ current_axis.text(0, -28, abusenet_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ if top_1_predicted_label == 'displaced_populations':
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+ else:
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ elif final_preds[0][0][1] == 'no_displaced_populations':
+ current_axis.text(0, -45, abusenet_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ if top_1_predicted_label == 'displaced_populations':
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+ else:
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+else:
+ if final_preds[0][0][1] == 'child_labour':
+ current_axis.text(0, -38, abusenet_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ if top_1_predicted_label == 'child_labour':
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+ else:
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ elif final_preds[0][0][1] == 'no_child_labour':
+ current_axis.text(0, -38, abusenet_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+ if top_1_predicted_label == 'child_labour':
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+ else:
+ current_axis.text(0, -7, sole_classifier_overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': no_violation_bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+
+
+
+
+plt.axis('off')
+plt.show()
diff --git a/inference/emotic_single_image_inference_unified.py b/inference/emotic_single_image_inference_unified.py
new file mode 100644
index 0000000..dafbb82
--- /dev/null
+++ b/inference/emotic_single_image_inference_unified.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+'''
+Use three emotional dimensions - pleasure, arousal and dominance - to describe human perceptions of physical environments.
+
+Interpretations of pleasure: Positive versus negative affective states (e.g. excitement, relaxation, love, and
+tranquility versus cruelty, humiliation, disinterest, and boredom).
+
+Interpretations of arousal: Level of mental alertness and physical activity (e.g. sleep, inactivity, boredom, and
+relaxation at the lower end versus wakefulness, bodily tension, strenuous
+exercise, and concentration at the higher end).
+
+Interpretations of dominance: Ranges from feelings of total lack of control or influence on events and surroundings to
+the opposite extreme of feeling influential and in control.
+
+'''
+
+from engine.human_centric_branch.global_emotional_traits_branch import single_img_VAD_inference, single_img_VAD_inference_with_bounding_boxes
+
+img_path = '/home/sandbox/Desktop/canggu-honeymoon-photography-013.jpg'
+model_a_backend_name = 'VGG16'
+model_b_backend_name = None
+model_c_backend_name = None
+
+valence, arousal, dominance = single_img_VAD_inference(img_path=img_path,
+ object_detector_backend='RetinaNet',
+ model_a_backend_name=model_a_backend_name,
+ model_b_backend_name=model_b_backend_name,
+ model_c_backend_name=model_c_backend_name,
+ )
+
diff --git a/inference/hra_2class_single_image_inference.py b/inference/hra_2class_single_image_inference.py
new file mode 100644
index 0000000..fd41c7e
--- /dev/null
+++ b/inference/hra_2class_single_image_inference.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+'''
+
+
+'''
+
+from engine.displaced_people_branch.single_image_inference_hra_2class import single_img_HRA_inference
+from applications.hra_utils import plot_preds
+from keras.preprocessing import image
+from matplotlib import pyplot as plt
+from scipy.misc import imread
+
+
+img_path = '/home/sandbox/Desktop/Testing Images/camping.jpg'
+violation_class = 'dp'
+model_backend_name = 'VGG16'
+nb_of_conv_layers_to_fine_tune = 1
+
+raw_preds, overlayed_text = single_img_HRA_inference(img_path=img_path,
+ violation_class=violation_class,
+ model_backend_name=model_backend_name,
+ nb_of_conv_layers_to_fine_tune=nb_of_conv_layers_to_fine_tune)
+
+
+img = image.load_img(img_path, target_size=(224, 224))
+print (raw_preds[0])
+
+# plot_preds(violation_class, img, raw_preds[0])
+
+
+numpy_img_path = imread(img_path)
+plt.figure(figsize=(10, 12))
+plt.imshow(numpy_img_path)
+
+current_axis = plt.gca()
+
+# configure colours for bounding box and text
+bounding_box_colour_rgbvar = (53, 42, 146)
+bounding_box_colour_rgbvar2 = [x / 255.0 for x in bounding_box_colour_rgbvar]
+
+text_colour_rgbvar = (214, 86, 100)
+text_colour_rgbvar2 = [x / 255.0 for x in text_colour_rgbvar]
+
+current_axis.text(0, 0, overlayed_text, size='x-large', color='white',
+ bbox={'facecolor': bounding_box_colour_rgbvar2, 'alpha': 1.0})
+
+plt.axis('off')
+plt.show()
\ No newline at end of file
diff --git a/paradigms/__init__.py b/paradigms/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/paradigms/inspect_arch.py b/paradigms/inspect_arch.py
new file mode 100644
index 0000000..88a80a9
--- /dev/null
+++ b/paradigms/inspect_arch.py
@@ -0,0 +1,45 @@
+from __future__ import division, print_function
+import os
+
+from keras.layers import Input
+from keras.layers.core import Dense
+from keras.models import Model
+from keras.layers.core import Dropout
+from keras.layers import GlobalAveragePooling2D
+from keras.utils.data_utils import get_file
+from keras import regularizers
+from keras.utils import plot_model
+
+
+from keras.applications.resnet50 import ResNet50
+from keras.layers.merge import concatenate
+from applications.vgg16_places_365 import VGG16_Places365
+from keras.optimizers import SGD
+
+from utils.generic_utils import euclidean_distance_loss, rmse
+
+
+body_inputs = Input(shape=(224, 224, 3), name='INPUT')
+image_inputs = Input(shape=(224, 224, 3), name='INPUT')
+
+# Body module
+tmp_model = ResNet50(include_top=False, weights='imagenet', input_tensor=body_inputs, pooling='avg')
+# tmp_model.summary()
+
+plot_model(tmp_model, to_file='original_model.png',show_shapes = True)
+
+for i, layer in enumerate(tmp_model.layers):
+ print(i, layer.name)
+
+new_model = Model(inputs=tmp_model.input, outputs=tmp_model.get_layer(index=169).output)
+
+for i, layer in enumerate(new_model.layers):
+ print(i, layer.name)
+
+# tmp_model.summary()
+
+plot_model(new_model, to_file='intermediate_layer.png',show_shapes = True)
+
+
+
+
diff --git a/paradigms/plot_CSV.py b/paradigms/plot_CSV.py
new file mode 100644
index 0000000..a8a0e8a
--- /dev/null
+++ b/paradigms/plot_CSV.py
@@ -0,0 +1,86 @@
+import matplotlib.pyplot as plt
+import numpy as np
+
+import matplotlib.cbook as cbook
+from matplotlib import style
+
+style.use('ggplot')
+
+epoch, acc, loss, val_acc, val_loss = np.loadtxt('/home/sandbox/Desktop/ResNet50__VGG16_Places365_emotic.csv',
+ unpack=True,
+ delimiter=',')
+
+
+
+
+
+
+plt.style.use("ggplot")
+plt.figure()
+plt.plot(epoch, acc, label='Training acc')
+plt.plot(epoch, val_acc,label='Validation acc')
+plt.title('Training and validation accuracy')
+plt.xlabel("Epoch #")
+plt.ylabel("Loss/Accuracy")
+plt.legend()
+
+plt.plot(epoch, loss, label='Training loss')
+plt.plot(epoch, val_loss, label='Validation loss')
+# plt.title('Training and validation fbeta_score')
+plt.legend()
+plt.show()
+
+#
+# # summarize history for accuracy
+# plt.plot(acc)
+# plt.plot(val_acc)
+# plt.title('model accuracy')
+# plt.ylabel('accuracy')
+# plt.xlabel('epoch')
+# plt.legend(['train', 'val'], loc='upper left')
+# plt.show()
+#
+# # summarize history for loss
+# plt.plot(loss)
+# plt.plot(val_loss)
+# plt.title('model loss')
+# plt.ylabel('loss')
+# plt.xlabel('epoch')
+# plt.legend(['train', 'val'], loc='upper left')
+# plt.show()
+
+
+# print (epoch)
+# print (acc)
+# print (loss)
+# print (top_3_categorical_accuracy)
+
+
+# plt.plot(acc)
+# plt.plot(loss)
+# plt.plot(top_3_categorical_accuracy)
+# plt.title('model accuracy')
+# plt.ylabel('accuracy')
+# plt.xlabel('epoch')
+# plt.legend(['acc', 'loss', 'top3'], loc='upper left')
+# plt.show()
+#
+#
+#
+# plt.plot(epoch,acc)
+# # plt.plot(epoch,loss)
+#
+#
+#
+# plt.title('Model Training History')
+# plt.ylabel('accuracy')
+# plt.xlabel('epoch')
+# # plt.legend(['Top-1 ','Top-3' ,'fbeta'], loc='upper left')
+# plt.legend(['Top-1 '], loc='upper left')
+#
+#
+# plt.show()
+#
+#
+
+
diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/preprocessing/emotic/README.md b/preprocessing/emotic/README.md
new file mode 100644
index 0000000..9b70625
--- /dev/null
+++ b/preprocessing/emotic/README.md
@@ -0,0 +1,88 @@
+## Preprocessing (chronological order of completion)
+
+### annotations_browser.py
+Annotations browser base class which reads metadata from a .mat file (MATLAB-formatted data) and saves it to CSV files.
+
+### save_raw_imgs.py
+Reads image names (locations on disk) from a csv file,
+then loads them using the basic set of tools for image data provided by Keras (keras/preprocessing/image.py)
+and finally saves them in a numpy array.
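+
+A minimal sketch of this step (the CSV layout, file names and the 224x224 target size below are
+illustrative assumptions, not necessarily the exact ones used by `save_raw_imgs.py`):
+
+    import csv
+    import numpy as np
+    from keras.preprocessing import image
+
+    # read the image locations from a csv file (assumed: one path per row, first column)
+    with open('train.csv') as f:
+        img_paths = [row[0] for row in csv.reader(f)]
+
+    # load every image at a fixed size and stack the resulting tensors
+    tensors = [image.img_to_array(image.load_img(p, target_size=(224, 224)))
+               for p in img_paths]
+    np.save('x_train.npy', np.stack(tensors))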
+
+### csv_to_numpy.py
+Reads entries (different annotations) from a csv file, converts them to an integer or a list of integers (for multilabel, multi-class settings)
+and saves them in a numpy array.
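+
+A rough sketch of the annotation side (assumes integer-encoded labels with one person per row,
+as written to `emotions.csv` by `emotion_categories` in `annotations_browser.py`):
+
+    import csv
+    import numpy as np
+
+    # each row holds the integer-encoded emotion categories of one person, e.g. "11,16"
+    with open('emotions.csv') as f:
+        emotions = [[int(v) for v in row] for row in csv.reader(f)]
+
+    # object dtype because different persons can have a different number of labels
+    np.save('emotions_train.npy', np.array(emotions, dtype=object))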
+
+### load_data_from_numpy.py
+Utilities required to load data (images & their annotations) stored in numpy arrays.
+The main function is `load_data_from_numpy` which loads all the applicable arrays.
+The supporting function is `load_annotations_only_from_numpy` which loads only the annotations without the image tensors.
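+
+A hedged sketch of what loading one split could look like (the file names follow the directory
+structure listed under `hdf5_controller.py` below; the exact function signatures in
+`load_data_from_numpy.py` may differ):
+
+    import os
+    import numpy as np
+
+    main_numpy_dir = '/path/to/main_numpy_dir'  # placeholder
+
+    def load_split(split):
+        """Loads the image tensors and the VAD annotations of one split ('train', 'val' or 'test')."""
+        d = os.path.join(main_numpy_dir, split)
+        x = np.load(os.path.join(d, 'x_%s.npy' % split))
+        valence = np.load(os.path.join(d, 'valence_%s.npy' % split))
+        arousal = np.load(os.path.join(d, 'arousal_%s.npy' % split))
+        dominance = np.load(os.path.join(d, 'dominance_%s.npy' % split))
+        return x, valence, arousal, dominance
+
+    x_train, valence_train, arousal_train, dominance_train = load_split('train')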
+
+
+### hdf5_controller.py
+Contains the HDF5 controller base class which can be used either to create a single HDF5 file
+containing a large number of images and their respective annotations (EMOTIC Dataset) or to load a previously saved HDF5 file.
+
+In order to create a single HDF5 file, `load_data_from_numpy.py` requires the images (tensors) and
+their annotations to be stored as numpy arrays in the following structure:
+
+ main_numpy_dir/
+ train/
+ x_train.npy
+ emotions_train.npy
+ valence_train.npy
+ arousal_train.npy
+ dominance_train.npy
+ age_train.npy
+
+ val/
+ x_val.npy
+ emotions_val.npy
+ valence_val.npy
+ arousal_val.npy
+ dominance_val.npy
+ age_val.npy
+
+ test/
+ x_test.npy
+ emotions_test.npy
+ valence_test.npy
+ arousal_test.npy
+ dominance_test.npy
+ age_test.npy
+
+Note that in order to obtain the numpy arrays you will need either to download them (images as tensors are not included due to file size) from:
+- https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.3/train.zip
+- https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.3/val.zip
+- https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.3/test.zip
+
+or recreate them using the `csv_to_numpy.py` for the annotations and `save_raw_imgs.py` for the images.
+
+
+Also, `base_img_dir` must contain the raw images in the following structure:
+
+ base_img_dir/
+ train/
+ images/
+ xxxxxxxx.jpg
+ xxxxxxxx.jpg
+ ...
+
+
+ val/
+ images/
+ xxxxxxxx.jpg
+ xxxxxxxx.jpg
+ ...
+
+ test/
+ images/
+ xxxxxxxx.jpg
+ xxxxxxxx.jpg
+ ...
+
+Note that in order to end up with that structure you will need either to download the images from
+- https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.1/train.zip
+- https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.1/val.zip
+- https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.1/test.zip
+
+or recreate them using the `crop_bounding_rectangles` function from `annotations_browser.py`
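+
+For orientation, a minimal `h5py` sketch of bundling such arrays into one HDF5 file and reading
+it back (dataset names and compression settings are illustrative assumptions, not necessarily
+what `hdf5_controller.py` produces):
+
+    import h5py
+    import numpy as np
+
+    x_train = np.load('main_numpy_dir/train/x_train.npy')
+    dominance_train = np.load('main_numpy_dir/train/dominance_train.npy')
+
+    with h5py.File('EMOTIC-VAD-regression.hdf5', 'w') as f:
+        # one dataset per array; names simply mirror the numpy files
+        f.create_dataset('x_train', data=x_train, compression='gzip')
+        f.create_dataset('dominance_train', data=dominance_train, compression='gzip')
+
+    with h5py.File('EMOTIC-VAD-regression.hdf5', 'r') as f:
+        x_train = f['x_train'][...]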
diff --git a/preprocessing/emotic/README.pdf b/preprocessing/emotic/README.pdf
new file mode 100644
index 0000000..7cb5e67
Binary files /dev/null and b/preprocessing/emotic/README.pdf differ
diff --git a/preprocessing/emotic/__init__.py b/preprocessing/emotic/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/preprocessing/emotic/annotations_browser.py b/preprocessing/emotic/annotations_browser.py
new file mode 100644
index 0000000..eea598f
--- /dev/null
+++ b/preprocessing/emotic/annotations_browser.py
@@ -0,0 +1,545 @@
+from __future__ import print_function
+import os
+from scipy.io import loadmat
+from utils.generic_utils import crop
+import csv
+
+from array import array
+
+import shutil
+import tqdm
+
+
+from PIL import ImageFile
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+class AnnotationsBrowser():
+
+ def __init__(self,
+ matfile = 'Annotations.mat',
+ EMOTIC_base_dir = '/home/sandbox/Desktop/EMOTIC_database/emotic',
+ mode = 'train'
+ ):
+ """Annotations browser base class.
+
+ Example
+ --------
+ >>> browser= AnnotationsBrowser(matfile='Annotations.mat',
+ >>> EMOTIC_base_dir='/home/sandbox/Desktop/EMOTIC_database/emotic',
+ >>> mode= 'train')
+ >>> if browser.mode == 'train':
+ >>> nb_samples = 17077
+ >>> elif browser.mode == 'val':
+ >>> nb_samples = 2088
+ >>> elif browser.mode == 'test':
+ >>> nb_samples = 4389
+
+ >>> dir_name = browser.mode +'/'
+
+ >>> bounding_rectangles_dir = dir_name + 'images'
+ >>> emotion_categories_filename = dir_name + 'emotions.csv'
+
+ >>> for field_number in range(0, nb_samples):
+ >>> browser.crop_bounding_rectangles(field_number=field_number, to_file=bounding_rectangles_dir)
+ >>> browser.emotion_categories(field_number,emotion_categories_filename)
+ """
+
+ self.matfile = matfile
+ self.matdata = loadmat(matfile)
+ self.emotic_base_dir = EMOTIC_base_dir
+ self.mode = mode
+
+ if mode == 'train':
+ self.categories_field_name = 'annotations_categories'
+ self.continuous_field_name = 'annotations_continuous'
+ else:
+ self.categories_field_name = 'combined_categories'
+ self.continuous_field_name = 'combined_continuous'
+
+
+
+ def copy_images(self,
+ field_number,
+ copy_dir):
+ """Saves a copy for every image found in annotations at a given directory.
+ # Arguments
+ field_number: serial number of every image sample in the database.
+ to_file: name of the csv file to save the exported data.
+ """
+
+ folder = self.matdata[self.mode][0]['folder'][field_number][0]
+ filename = self.matdata[self.mode][0]['filename'][field_number][0]
+
+ # print (filename)
+
+ src_full_path = self.emotic_base_dir + '/' + folder + '/' + filename
+ copy_full_path = str(copy_dir)+str(field_number)+'_'+str(filename)
+
+
+ # print (src_full_path)
+ # print (copy_full_path)
+
+ shutil.copyfile(src_full_path, copy_full_path)
+
+
+
+ def multi_copy_images(self,
+ field_number,
+ copy_dir):
+ """Saves a separate image copy for every person under consideration
+ (this is relevant for the entire image module of the CNN model) found in annotations.
+ # Arguments
+ field_number: serial number of every image sample in the database.
+            copy_dir: directory where each image copy will be saved.
+ """
+
+
+ nb_annotated_persons = len(self.matdata[self.mode][0]['person'][field_number][0])
+
+ for x in range(0, nb_annotated_persons):
+
+ folder = self.matdata[self.mode][0]['folder'][field_number][0]
+ filename = self.matdata[self.mode][0]['filename'][field_number][0]
+
+ full_path = self.emotic_base_dir +'/'+ folder+'/'+filename
+
+
+ filename_only, file_extension_only = os.path.splitext(filename)
+
+            # EMOTIC contains up to 14 annotated persons per image; the first person gets
+            # no suffix, subsequent persons get the suffixes '_B' ... '_N'
+            suffix = '' if x == 0 else '_' + chr(ord('A') + x)
+            to_file = str(field_number) + '_' + filename_only + suffix + '.jpg'
+
+
+ copy_full_path = copy_dir + to_file
+
+ # print (copy_full_path)
+
+ shutil.copyfile(full_path, copy_full_path)
+
+
+
+ def crop_bounding_rectangles(self,
+ field_number,
+ to_file,
+ ):
+ """Crops and saves a separate image for every person under consideration
+ based on the bounding rectangle of the corresponding 4 co-ordinates found in annotations.
+ # Arguments
+ field_number: serial number of every image sample in the database.
+            to_file: directory where the cropped images will be saved.
+ """
+
+ nb_annotated_persons = len(self.matdata[self.mode][0]['person'][field_number][0])
+
+ for x in range(0, nb_annotated_persons):
+
+ folder = self.matdata[self.mode][0]['folder'][field_number][0]
+ filename = self.matdata[self.mode][0]['filename'][field_number][0]
+
+ full_path = self.emotic_base_dir +'/'+ folder+'/'+filename
+
+ body_bbox_tuple = ()
+ x1_body_bbox = self.matdata[self.mode][0]['person'][field_number]['body_bbox'][0][x][0][0]
+ y1_body_bbox = self.matdata[self.mode][0]['person'][field_number]['body_bbox'][0][x][0][1]
+ x2_body_bbox = self.matdata[self.mode][0]['person'][field_number]['body_bbox'][0][x][0][2]
+ y2_body_bbox = self.matdata[self.mode][0]['person'][field_number]['body_bbox'][0][x][0][3]
+
+ body_bbox_tuple = body_bbox_tuple + (x1_body_bbox,y1_body_bbox,x2_body_bbox,y2_body_bbox,)
+
+ filename_only, file_extension_only = os.path.splitext(filename)
+
+            # EMOTIC contains up to 14 annotated persons per image; the first person gets
+            # no suffix, subsequent persons get the suffixes '_B' ... '_N'.
+            # Build the output path in a local variable so that `to_file` (the output
+            # directory) is not overwritten from one person to the next.
+            suffix = '' if x == 0 else '_' + chr(ord('A') + x)
+            out_path = to_file + '/' + str(field_number) + '_' + filename_only + suffix + '.jpg'
+
+            crop(full_path, body_bbox_tuple, out_path)
+
+
+ def emotion_categories(self,
+ field_number,
+ to_file):
+ """Exports and saves the one-hot encoded emotion categories for every person under consideration.
+ # Arguments
+ field_number: serial number of every image sample in the database.
+ to_file: name of the csv file to save the exported data.
+ """
+
+ emotion_categories_list = []
+ nb_annotated_persons = len(self.matdata[self.mode][0]['person'][field_number][0])
+
+
+ # define universe of possible input values
+ alphabet = ['Affection',
+ 'Anger',
+ 'Annoyance',
+ 'Anticipation',
+ 'Aversion',
+ 'Confidence',
+ 'Disapproval',
+ 'Disconnection',
+ 'Disquietment',
+ 'Doubt/Confusion',
+ 'Embarrassment',
+ 'Engagement',
+ 'Esteem',
+ 'Excitement',
+ 'Fatigue',
+ 'Fear',
+ 'Happiness',
+ 'Pain',
+ 'Peace',
+ 'Pleasure',
+ 'Sadness',
+ 'Sensitivity',
+ 'Suffering',
+ 'Surprise',
+ 'Sympathy',
+ 'Yearning']
+
+ # define a mapping of chars to integers
+ char_to_int = dict((c, i) for i, c in enumerate(alphabet))
+ int_to_char = dict((i, c) for i, c in enumerate(alphabet))
+
+
+ # Loop over every `person` under consideration (`nb_annotated_persons`)
+ for person in xrange(0, nb_annotated_persons):
+ # nb_emotion_categories = len(
+ # self.matdata[self.mode][0]['person'][field_number][self.categories_field_name][0][person][0][0][0][0])
+
+ if self.mode == 'train':
+ nb_emotion_categories = len(
+ self.matdata[self.mode][0]['person'][field_number][self.categories_field_name][0][person][0][0][0][0])
+
+ else:
+ nb_emotion_categories = len(
+ self.matdata[self.mode][0]['person'][field_number][self.categories_field_name][0][person][0])
+
+
+ multiple_persons_emotion_categories_list = []
+
+ integer_encoded_list = []
+ integer_encoded__multiple_list = []
+ # Loop over the total number of emotion categories found for `person`
+ for emotion in xrange(0, nb_emotion_categories):
+
+ if self.mode == 'train':
+ current_emotion_category = \
+ self.matdata[self.mode][0]['person'][field_number][self.categories_field_name][0][person][0][0][0][0][emotion][0]
+ else:
+ current_emotion_category = \
+ self.matdata[self.mode][0]['person'][field_number][self.categories_field_name][0][person][0][emotion][0]
+
+                # map the emotion category name to its integer index (0-25) via the
+                # `char_to_int` lookup table defined above
+                emotion_as_int = char_to_int[current_emotion_category]
+
+
+
+ # Checks if the number of people under consideration is one (single person in the image) or
+ # more than one (multiple persons in the image) and create their corresponding lists
+ # that will hold the `current_emotion_category`
+ if nb_annotated_persons == 1:
+
+ emotion_categories_list.append(int(emotion_as_int))
+
+ else:
+
+ multiple_persons_emotion_categories_list.append(int(emotion_as_int))
+
+
+ # # integer encode for single and multiple persons
+ # integer_encoded = [char_to_int[char] for char in emotion_categories_list]
+ # integer_encoded_multiple = [char_to_int[char] for char in multiple_persons_emotion_categories_list]
+ #
+ # if nb_annotated_persons == 1:
+ # integer_encoded_list.append(integer_encoded)
+ # else:
+ # integer_encoded__multiple_list.append(integer_encoded_multiple)
+
+
+
+ with open(to_file, 'a') as resultFile:
+ wr = csv.writer(resultFile, dialect='excel')
+
+ if nb_annotated_persons == 1:
+ print (emotion_categories_list)
+ wr.writerow(emotion_categories_list)
+ else:
+ print (multiple_persons_emotion_categories_list)
+ wr.writerow(multiple_persons_emotion_categories_list)
+
+
+
+ def continuous_dimensions(self,
+ field_number,
+ dimension,
+ to_file):
+ """Exports and saves the continuous dimension annotations for every person under consideration.
+ # Arguments
+ field_number: serial number of every image sample in the database.
+ dimension: one of `valence` (how positive or pleasant an emotion is),
+ `arousal` (measures the agitation level of the person) or
+ `dominance` (measures the control level of the situation by the person).
+ to_file: name of the csv file to save the exported data.
+ """
+
+ if dimension == 'valence':
+ int_dimension = 0
+ elif dimension == 'arousal':
+ int_dimension = 1
+ elif dimension == 'dominance':
+ int_dimension = 2
+
+ dimension_list = []
+
+ nb_annotated_persons = len(self.matdata[self.mode][0]['person'][field_number][0])
+
+ print ('=== Processing field ' + str(field_number + 1) + ' ===')
+
+ for person in xrange(0, nb_annotated_persons):
+ multiple_persons_dimension_list = []
+
+ current_valence_dimension = \
+ self.matdata[self.mode][0]['person'][field_number][self.continuous_field_name][0][person][0][0][int_dimension][
+ 0][0]
+
+ # print current_valence_dimension
+
+
+
+ if nb_annotated_persons == 1:
+ dimension_list.append(current_valence_dimension)
+
+ with open(to_file, 'a') as resultFile:
+ print (dimension_list)
+ wr = csv.writer(resultFile, dialect='excel')
+ wr.writerow(dimension_list)
+ else:
+ multiple_persons_dimension_list.append(current_valence_dimension)
+
+ with open(to_file, 'a') as resultFile:
+ print (multiple_persons_dimension_list)
+ wr = csv.writer(resultFile, dialect='excel')
+ wr.writerow(multiple_persons_dimension_list)
+
+
+ def age(self,
+ field_number,
+ to_file):
+ """Exports and saves the annotated age categories for every person under consideration in train set
+ # Arguments
+ field_number: serial number of every image sample in the database.
+ to_file: name of the csv file to save the exported data.
+ """
+
+ age_list = []
+
+ nb_annotated_persons = len(self.matdata[self.mode][0]['person'][field_number][0])
+
+ print ('=== Processing field ' + str(field_number + 1) + ' ===')
+
+ for person in xrange(0, nb_annotated_persons):
+ multiple_persons_age_list = []
+
+ current_age = self.matdata[self.mode][0]['person'][field_number]['age'][0][person][0]
+
+ print (current_age)
+
+ if current_age == 'Kid':
+ current_age_int = 0
+ elif current_age == 'Teenager':
+ current_age_int = 1
+ elif current_age == 'Adult':
+ current_age_int = 2
+
+
+ if nb_annotated_persons == 1:
+ age_list.append(current_age_int)
+
+ with open(to_file, 'a') as resultFile:
+ wr = csv.writer(resultFile, dialect='excel')
+ wr.writerow(age_list)
+ else:
+ multiple_persons_age_list.append(current_age_int)
+
+ with open(to_file, 'a') as resultFile:
+ wr = csv.writer(resultFile, dialect='excel')
+ wr.writerow(multiple_persons_age_list)
+
+
+
+ def age_single_label_categorical(self,
+ field_number,
+ to_file):
+ """Exports and saves the annotated age categories for every person under consideration in train set
+ # Arguments
+ field_number: serial number of every image sample in the database.
+ to_file: name of the csv file to save the exported data.
+ """
+
+ age_list = []
+
+ nb_annotated_persons = len(self.matdata[self.mode][0]['person'][field_number][0])
+
+ print ('=== Processing field ' + str(field_number + 1) + ' ===')
+
+ for person in xrange(0, nb_annotated_persons):
+ multiple_persons_age_list = []
+
+ current_age = self.matdata[self.mode][0]['person'][field_number]['age'][0][person][0]
+
+ print (current_age)
+
+ if current_age == 'Kid':
+ current_age_int = 0
+ elif current_age == 'Teenager':
+ current_age_int = 1
+ elif current_age == 'Adult':
+ current_age_int = 2
+
+ if nb_annotated_persons == 1:
+ age_list.append(current_age_int)
+
+ with open(to_file, 'a') as resultFile:
+ wr = csv.writer(resultFile, dialect='excel')
+ wr.writerow(age_list)
+ else:
+ multiple_persons_age_list.append(current_age_int)
+
+ with open(to_file, 'a') as resultFile:
+ wr = csv.writer(resultFile, dialect='excel')
+ wr.writerow(multiple_persons_age_list)
+
+
+
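+
+# Illustrative sketch (assumption: the export methods above belong to the AnnotationsBrowser class
+# that is instantiated in `preprocessing/emotic/entire_procedure_example.py`). Exporting the
+# dominance annotation of every test-set sample to a csv file could look like:
+#
+#   from tqdm import tqdm
+#   browser = AnnotationsBrowser(matfile='Annotations.mat', EMOTIC_base_dir='emotic', mode='test')
+#   for field_number in tqdm(range(0, 4389)):  # 4389 test samples, as in the example script
+#       browser.continuous_dimensions(field_number=field_number,
+#                                     dimension='dominance',
+#                                     to_file='dominance_test.csv')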
diff --git a/preprocessing/emotic/csv_to_numpy.py b/preprocessing/emotic/csv_to_numpy.py
new file mode 100644
index 0000000..a03f140
--- /dev/null
+++ b/preprocessing/emotic/csv_to_numpy.py
@@ -0,0 +1,186 @@
+"""Python utility which reads entries (different annotations) from a csv file,
+ converts them to integer/list of integers(for multilabel-multiclass settings) and saves them in a numpy array.
+"""
+
+import pandas
+import numpy as np
+import math
+from keras.utils import np_utils
+
+
+def csv_to_numpy(csv_path,
+ nb_samples,
+ entry_type,
+ to_file):
+ """Reads entries (different annotations) from a csv file,
+    converts them to an integer or a list of integers (for multi-label, multi-class settings) and saves them in a numpy array.
+
+ # Arguments
+        csv_path: Full path of the csv file.
+ nb_samples: Number of entries in the csv file to iterate.
+ entry_type: The header name of the column to process. One of `emotions`, `valence`, `arousal`, `dominance` and `age`.
+ to_file: File name of the numpy array that will hold the converted entries.
+ """
+
+ csv_file = pandas.read_csv(csv_path)
+
+ # counter to iterate through all csv entries
+ field_number = 0
+
+ final_list = []
+
+ if entry_type == 'emotions':
+ for entry in csv_file.emotions:
+
+ # convert the retrieved csv entry (whose type is always str) into integer/list of integers
+ int_list = map(int, entry.split(','))
+
+ int_list = np_utils.to_categorical(int_list)
+
+ # append the converted integer/list of integers to the final list
+ final_list.append(int_list)
+
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ elif entry_type == 'valence':
+ for entry in csv_file.valence:
+
+ # ensure all the nan values for valence, arousal and dominance are transformed to 5.0
+ if math.isnan(entry):
+ entry = 5.0
+
+ # append the converted integer/list of integers to the final list
+ final_list.append(int(entry))
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ elif entry_type == 'arousal':
+ for entry in csv_file.arousal:
+
+ # ensure all the nan values for valence, arousal and dominance are transformed to 5.0
+ if math.isnan(entry):
+ entry = 5.0
+
+ final_list.append(int(entry))
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ elif entry_type == 'dominance':
+ for entry in csv_file.dominance:
+
+ # ensure all the nan values for valence, arousal and dominance are transformed to 5.0
+ if math.isnan(entry):
+ entry = 5.0
+
+ final_list.append(int(entry))
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ elif entry_type == 'age':
+ for entry in csv_file.age:
+
+ final_list.append(int(entry))
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+    # expand dimensions from (nb_samples,) to (nb_samples, 1)
+ final_list = np.expand_dims(final_list, axis=1)
+ np.save(to_file, final_list)
+
+
+
+
+
+
+if __name__ == '__main__':
+
+
+
+ # csv_path= '/home/sandbox/Desktop/EMOTIC_database/RESOURCES/EMOTIC_CSV_FILES/test.csv'
+ # nb_samples = 7280
+ # entry_type = 'age'
+ #
+ # to_file= 'age_test'
+ #
+ # csv_to_numpy(csv_path =csv_path,
+ # nb_samples = nb_samples,
+ # entry_type = entry_type,
+ # to_file= to_file)
+ #
+ #
+ # x = np.load('age_test.npy')
+ #
+ # # print x.shape
+ # #
+ # # print x[0]
+ #
+ # from keras.utils import np_utils
+ #
+ # x = np_utils.to_categorical(x)
+ #
+ #
+ # np.save(to_file, x)
+ #
+ # x = np.load('age_test.npy')
+ #
+ # print x.shape
+ # print x[0]
+ #
+ # print type(x[0][0])
+
+
+ mode = 'train'
+ entry_type = 'age'
+
+
+
+ to_file = entry_type + '_'+mode
+ csv_path= '/home/sandbox/Desktop/Keras-EMOTIC/dataset/'+ mode +'.csv'
+
+
+ if mode == 'train':
+ nb_samples = 23706
+ elif mode == 'val':
+ nb_samples = 3334
+ elif mode == 'test':
+ nb_samples = 7280
+
+
+
+
+ csv_to_numpy(csv_path =csv_path,
+ nb_samples = nb_samples,
+ entry_type = entry_type,
+ to_file= to_file)
+
+
+ x = np.load(str(to_file)+'.npy')
+
+ print x.shape
+
+ print x[0]
+
+ # from keras.utils import np_utils
+ #
+ # x = np_utils.to_categorical(x)
+ #
+ #
+ # np.save(to_file, x)
+ #
+ # x = np.load(str(to_file) + '.npy')
+ #
+ # print x.shape
+ # print x[0]
+ #
+ # print type(x[0][0])
\ No newline at end of file
diff --git a/preprocessing/emotic/custom_generator.py b/preprocessing/emotic/custom_generator.py
new file mode 100644
index 0000000..b3f03c4
--- /dev/null
+++ b/preprocessing/emotic/custom_generator.py
@@ -0,0 +1,107 @@
+"""Custom generator for pentuple-output Keras models.
+"""
+
+from math import ceil
+
+
+def custom_generator(hdf5_file, nb_data, batch_size, mode):
+ """ Generates batches of tensor image data in form of ==> (x1, y1) ,(x2, y2).
+ # Reference
+ - https://stackoverflow.com/questions/50333532/load-images-and-annotations-from-csv-and-use-fit-generator-with-multi-output-mod
+ - http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html
+
+ # Arguments
+ hdf5_file: path or hdf5 object which contains the images and the annotations.
+ nb_data: total number of samples saved in the array.
+ batch_size: size of the batch to generate tensor image data for.
+        mode: one of `train`, `val` or `test`.
+
+ # Returns
+ A generator object.
+
+ """
+
+ batches_list = list(range(int(ceil(float(nb_data) / batch_size))))
+
+ while True:
+
+ # loop over batches
+ for n, i in enumerate(batches_list):
+ i_s = i * batch_size # index of the first image in this batch
+ i_e = min([(i + 1) * batch_size, nb_data]) # index of the last image in this batch
+
+ if mode == 'train':
+ body_x = hdf5_file["x_cropped_train"][i_s:i_e, ...]
+ image_x = hdf5_file["x_entire_train"][i_s:i_e, ...]
+ # valence_body_y = hdf5_file["valence_cropped_train"][i_s:i_e]
+ valence_image_y = hdf5_file["valence_entire_train"][i_s:i_e]
+ # arousal_body_y = hdf5_file["arousal_cropped_train"][i_s:i_e]
+ arousal_image_y = hdf5_file["arousal_entire_train"][i_s:i_e]
+ # dominance_body_y = hdf5_file["dominance_cropped_train"][i_s:i_e]
+ dominance_image_y = hdf5_file["dominance_entire_train"][i_s:i_e]
+
+ elif mode == 'val':
+ body_x = hdf5_file["x_cropped_val"][i_s:i_e, ...]
+ image_x = hdf5_file["x_entire_val"][i_s:i_e, ...]
+ # valence_body_y = hdf5_file["valence_cropped_val"][i_s:i_e]
+ valence_image_y = hdf5_file["valence_entire_val"][i_s:i_e]
+ # arousal_body_y = hdf5_file["arousal_cropped_val"][i_s:i_e]
+ arousal_image_y = hdf5_file["arousal_entire_val"][i_s:i_e]
+ # dominance_body_y = hdf5_file["dominance_cropped_val"][i_s:i_e]
+ dominance_image_y = hdf5_file["dominance_entire_val"][i_s:i_e]
+
+ elif mode == 'test':
+ body_x = hdf5_file["x_cropped_test"][i_s:i_e, ...]
+ image_x = hdf5_file["x_entire_test"][i_s:i_e, ...]
+ # valence_body_y = hdf5_file["valence_cropped_test"][i_s:i_e]
+ valence_image_y = hdf5_file["valence_entire_test"][i_s:i_e]
+ # arousal_body_y = hdf5_file["arousal_cropped_test"][i_s:i_e]
+ arousal_image_y = hdf5_file["arousal_entire_test"][i_s:i_e]
+ # dominance_body_y = hdf5_file["dominance_cropped_test"][i_s:i_e]
+ dominance_image_y = hdf5_file["dominance_entire_test"][i_s:i_e]
+
+
+ yield [body_x,image_x], [valence_image_y,arousal_image_y,dominance_image_y]
+
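+# Illustrative usage sketch (the HDF5 filename and the Keras model object are hypothetical;
+# the dataset keys and the sample count follow the arrays created by `hdf5_controller.py`):
+#
+#   import h5py
+#   hdf5_file = h5py.File('EMOTIC-VAD-Classification.hdf5', mode='r')
+#   train_generator = custom_generator(hdf5_file, nb_data=23706, batch_size=32, mode='train')
+#   model.fit_generator(train_generator,
+#                       steps_per_epoch=int(ceil(23706 / 32.)),
+#                       epochs=10)
+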
+def custom_generator_single_output(hdf5_file, nb_data, batch_size, mode):
+ """ Generates batches of tensor image data in form of ==> (x1, y1) ,(x2, y2).
+ # Reference
+ - https://stackoverflow.com/questions/50333532/load-images-and-annotations-from-csv-and-use-fit-generator-with-multi-output-mod
+ - http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html
+
+ # Arguments
+ hdf5_file: path or hdf5 object which contains the images and the annotations.
+ nb_data: total number of samples saved in the array.
+ batch_size: size of the batch to generate tensor image data for.
+        mode: one of `train`, `val` or `test`.
+
+ # Returns
+ A generator object.
+
+ """
+
+ batches_list = list(range(int(ceil(float(nb_data) / batch_size))))
+
+ while True:
+
+ # loop over batches
+ for n, i in enumerate(batches_list):
+ i_s = i * batch_size # index of the first image in this batch
+ i_e = min([(i + 1) * batch_size, nb_data]) # index of the last image in this batch
+
+ if mode == 'train':
+ body_x = hdf5_file["x_body_train"][i_s:i_e, ...]
+ image_x = hdf5_file["x_image_train"][i_s:i_e, ...]
+ y = hdf5_file["y_image_train"][i_s:i_e]
+
+ elif mode == 'val':
+ body_x = hdf5_file["x_body_val"][i_s:i_e, ...]
+ image_x = hdf5_file["x_image_val"][i_s:i_e, ...]
+ y = hdf5_file["y_image_val"][i_s:i_e]
+
+ elif mode == 'test':
+ body_x = hdf5_file["x_body_test"][i_s:i_e, ...]
+ image_x = hdf5_file["x_image_test"][i_s:i_e, ...]
+ y = hdf5_file["y_image_test"][i_s:i_e]
+
+ yield [body_x, image_x], y
\ No newline at end of file
diff --git a/preprocessing/emotic/entire_procedure_example.py b/preprocessing/emotic/entire_procedure_example.py
new file mode 100644
index 0000000..bfaafce
--- /dev/null
+++ b/preprocessing/emotic/entire_procedure_example.py
@@ -0,0 +1,34 @@
+from tqdm import tqdm
+from preprocessing.annotations_browser import AnnotationsBrowser
+
+
+matfile='/home/sandbox/Desktop/EMOTIC_database/annotations/annotations/Annotations.mat'
+EMOTIC_base_dir='/home/sandbox/Desktop/EMOTIC_database/emotic'
+mode='test'
+
+
+
+
+# First create an instance of the AnnotationsBrowser class
+browser = AnnotationsBrowser(matfile=matfile,
+ EMOTIC_base_dir=EMOTIC_base_dir,
+ mode=mode)
+
+if browser.mode == 'train':
+ nb_samples = 17077
+elif browser.mode == 'val':
+ nb_samples = 2088
+elif browser.mode == 'test':
+ nb_samples = 4389
+
+
+
+copy_entire_single_occurrence_imgs_dir = '/home/sandbox/Desktop/EMOTIC_database/entire_single_occurrence_imgs/'+ browser.mode + '/'
+copy_entire_multiple_imgs_dir = '/home/sandbox/Desktop/EMOTIC_database/entire_multiple_imgs/'+ browser.mode + '/'
+
+
+# `dir_name` must point at one of the destination directories defined above
+# (the multiple-persons directory is assumed here purely for illustration)
+dir_name = copy_entire_multiple_imgs_dir
+copy_dir = dir_name + 'images/'
+# emotion_categories_filename = dir_name + 'emotions.csv'
+
+for field_number in tqdm(range(0, nb_samples)):
+ browser.copy_images(field_number=field_number, copy_dir=dir_name)
diff --git a/preprocessing/emotic/hdf5_controller.py b/preprocessing/emotic/hdf5_controller.py
new file mode 100644
index 0000000..47d6dee
--- /dev/null
+++ b/preprocessing/emotic/hdf5_controller.py
@@ -0,0 +1,526 @@
+from __future__ import print_function
+from preprocessing.load_data_from_numpy import load_data_from_numpy,load_data_from_numpy_single_output
+from math import ceil
+from utils.generic_utils import progress
+
+import numpy as np
+import pandas
+import cv2
+import h5py
+import matplotlib.pyplot as plt
+
+
+class Controller():
+
+ def __init__(self,
+ hdf5_file,
+ train_csv_file_path,
+ val_csv_file_path,
+ test_csv_file_path,
+ cropped_imgs_dir,
+ entire_imgs_dir,
+ main_numpy_dir
+ ):
+ """HDF5 controller base class.
+ It can be used either to create a single HDF5 file
+ containing a large number of images and their respective annotations (from EMOTIC Dataset)
+ or to load a previously saved HDF5 file.
+
+ In order to create a single HDF5 file, images (tensors) and their annotations must be available
+ in numpy arrays in the following structure:
+
+ main_numpy_dir/
+ train/
+ x_train.npy
+ emotions_train.npy
+ valence_train.npy
+ arousal_train.npy
+ dominance_train.npy
+ age_train.npy
+
+ val/
+ x_val.npy
+ emotions_val.npy
+ valence_val.npy
+ arousal_val.npy
+ dominance_val.npy
+ age_val.npy
+
+ test/
+ x_test.npy
+ emotions_test.npy
+ valence_test.npy
+ arousal_test.npy
+ dominance_test.npy
+ age_test.npy
+
+
+ Also, `base_img_dir` must contain the raw images in the following structure:
+
+ base_img_dir/
+ train/
+ images/
+ xxxxxxxx.jpg
+ xxxxxxxx.jpg
+ ...
+
+
+ val/
+ images/
+ xxxxxxxx.jpg
+ xxxxxxxx.jpg
+ ...
+
+ test/
+ images/
+ xxxxxxxx.jpg
+ xxxxxxxx.jpg
+ ...
+
+        Note that in order to end up with that structure you will need to either download the images from
+ - https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.1/train.zip
+ - https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.1/val.zip
+ - https://github.com/GKalliatakis/Keras-EMOTIC/releases/download/0.1/test.zip
+ or recreate them using the `export_annotations_diff_modes.py` found in `EMOTIC_database` project.
+
+ """
+
+ self.hdf5_file = hdf5_file
+ self.train_csv_file = pandas.read_csv(train_csv_file_path)
+ self.val_csv_file = pandas.read_csv(val_csv_file_path)
+ self.test_csv_file = pandas.read_csv(test_csv_file_path)
+ #
+ # (self.x_entire_train, self.x_cropped_train, self.valence_entire_train, self.valence_cropped_train, self.arousal_entire_train,
+ # self.arousal_cropped_train, self.dominance_entire_train, self.dominance_cropped_train), \
+ # (self.x_entire_val, self.x_cropped_val, self.valence_entire_val, self.valence_cropped_val, self.arousal_entire_val, self.arousal_cropped_val,
+ # self.dominance_entire_val, self.dominance_cropped_val), \
+ # (self.x_entire_test, self.x_cropped_test, self.valence_entire_test, self.valence_cropped_test, self.arousal_entire_test,
+ # self.arousal_cropped_test, self.dominance_entire_test, self.dominance_cropped_test) = load_data_from_numpy(main_numpy_dir=main_numpy_dir,verbose=1)
+
+ (self.x_image_train, self.x_body_train, self.y_image_train, self.y_body_train), \
+ (self.x_image_val, self.x_body_val, self.y_image_val, self.y_body_val), \
+ (self.x_image_test, self.x_body_test, self.y_image_test, self.y_body_test) = load_data_from_numpy_single_output(main_numpy_dir=main_numpy_dir,verbose=1)
+
+ self.cropped_imgs_dir = cropped_imgs_dir
+ self.entire_imgs_dir = entire_imgs_dir
+
+
+
+
+
+ def create_hdf5_VAD_classification(self, dataset, input_size):
+ """ Saves a large number of images and their respective annotations (from EMOTIC Dataset) in a single HDF5 file.
+ # Reference
+ - http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html
+ - http://sunai.uoc.edu/emotic/
+
+ # Arguments
+ dataset: name of the dataset that the HDF5 file will be created for.
+ input_size: the default input size for the model (ref https://keras.io/applications/).
+ All models have input size of 224x224
+ except Xception,InceptionV3 and InceptionResNetV2 which have input size of 299x299.
+ """
+
+ if not (dataset in {'EMOTIC'}):
+ raise ValueError('The `dataset` argument can be set to `EMOTIC` only. '
+ 'More datasets will be added in future releases.')
+
+
+ if dataset == 'EMOTIC':
+ nb_train_samples = 23706
+ nb_val_samples = 3332
+ nb_test_samples = 7280
+
+ train_shape = (nb_train_samples, input_size, input_size, 3)
+ val_shape = (nb_val_samples, input_size, input_size, 3)
+ test_shape = (nb_test_samples, input_size, input_size, 3)
+
+ print('[INFO] Open the hdf5 file `'+ str(self.hdf5_file) +'` and start creating arrays.')
+
+ # open a hdf5 file and create arrays
+ hdf5_file = h5py.File(self.hdf5_file, mode='w')
+ hdf5_file.create_dataset("x_entire_train", train_shape, np.uint8)
+ hdf5_file.create_dataset("x_entire_val", val_shape, np.uint8)
+ hdf5_file.create_dataset("x_entire_test", test_shape, np.uint8)
+
+ hdf5_file.create_dataset("x_cropped_train", train_shape, np.uint8)
+ hdf5_file.create_dataset("x_cropped_val", val_shape, np.uint8)
+ hdf5_file.create_dataset("x_cropped_test", test_shape, np.uint8)
+
+ # train arrays
+ hdf5_file.create_dataset("valence_entire_train", (nb_train_samples, 10), np.float64)
+ hdf5_file["valence_entire_train"][...] = self.valence_entire_train
+
+ hdf5_file.create_dataset("arousal_entire_train", (nb_train_samples, 10), np.float64)
+ hdf5_file["arousal_entire_train"][...] = self.arousal_entire_train
+
+ hdf5_file.create_dataset("dominance_entire_train", (nb_train_samples, 10), np.float64)
+ hdf5_file["dominance_entire_train"][...] = self.dominance_entire_train
+
+
+ hdf5_file.create_dataset("valence_cropped_train", (nb_train_samples, 10), np.float64)
+ hdf5_file["valence_cropped_train"][...] = self.valence_cropped_train
+
+ hdf5_file.create_dataset("arousal_cropped_train", (nb_train_samples, 10), np.float64)
+ hdf5_file["arousal_cropped_train"][...] = self.arousal_cropped_train
+
+ hdf5_file.create_dataset("dominance_cropped_train", (nb_train_samples, 10), np.float64)
+ hdf5_file["dominance_cropped_train"][...] = self.dominance_cropped_train
+
+
+
+ # val arrays
+ hdf5_file.create_dataset("valence_entire_val", (nb_val_samples, 10), np.float64)
+ hdf5_file["valence_entire_val"][...] = self.valence_entire_val
+
+ hdf5_file.create_dataset("arousal_entire_val", (nb_val_samples, 10), np.float64)
+ hdf5_file["arousal_entire_val"][...] = self.arousal_entire_val
+
+ hdf5_file.create_dataset("dominance_entire_val", (nb_val_samples, 10), np.float64)
+ hdf5_file["dominance_entire_val"][...] = self.dominance_entire_val
+
+ hdf5_file.create_dataset("valence_cropped_val", (nb_val_samples, 10), np.float64)
+ hdf5_file["valence_cropped_val"][...] = self.valence_cropped_val
+
+ hdf5_file.create_dataset("arousal_cropped_val", (nb_val_samples, 10), np.float64)
+ hdf5_file["arousal_cropped_val"][...] = self.arousal_cropped_val
+
+ hdf5_file.create_dataset("dominance_cropped_val", (nb_val_samples, 10), np.float64)
+ hdf5_file["dominance_cropped_val"][...] = self.dominance_cropped_val
+
+
+ # test arrays
+ hdf5_file.create_dataset("valence_entire_test", (nb_test_samples, 10), np.float64)
+ hdf5_file["valence_entire_test"][...] = self.valence_entire_test
+
+ hdf5_file.create_dataset("arousal_entire_test", (nb_test_samples, 10), np.float64)
+ hdf5_file["arousal_entire_test"][...] = self.arousal_entire_test
+
+ hdf5_file.create_dataset("dominance_entire_test", (nb_test_samples, 10), np.float64)
+ hdf5_file["dominance_entire_test"][...] = self.dominance_entire_test
+
+ hdf5_file.create_dataset("valence_cropped_test", (nb_test_samples, 10), np.float64)
+ hdf5_file["valence_cropped_test"][...] = self.valence_cropped_test
+
+ hdf5_file.create_dataset("arousal_cropped_test", (nb_test_samples, 10), np.float64)
+ hdf5_file["arousal_cropped_test"][...] = self.arousal_cropped_test
+
+ hdf5_file.create_dataset("dominance_cropped_test", (nb_test_samples, 10), np.float64)
+ hdf5_file["dominance_cropped_test"][...] = self.dominance_cropped_test
+
+ print('[INFO] Arrays have been created.')
+
+
+ field_number = 0
+ print('[INFO] Start reading cropped images from train set.')
+ # loop over cropped images train addresses
+ for img_name in self.train_csv_file.filename:
+ progress(field_number, nb_train_samples)
+
+ img_name = self.cropped_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_cropped_train"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_train_samples - 1:
+ break
+
+
+ print('[INFO] Start reading entire images from train set.')
+ field_number = 0
+ # loop over entire images train addresses
+ for img_name in self.train_csv_file.filename:
+ progress(field_number, nb_train_samples)
+
+ img_name = self.entire_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_entire_train"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_train_samples - 1:
+ break
+
+
+
+ print('[INFO] Start reading cropped images from validation set.')
+ field_number = 0
+ # loop over val addresses
+ for img_name in self.val_csv_file.filename:
+ progress(field_number, nb_val_samples)
+
+ img_name = self.cropped_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_cropped_val"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_val_samples - 1:
+ break
+
+
+ print('[INFO] Start reading entire images from validation set.')
+ field_number = 0
+ # loop over val addresses
+ for img_name in self.val_csv_file.filename:
+ progress(field_number, nb_val_samples)
+
+ img_name = self.entire_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_entire_val"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_val_samples - 1:
+ break
+
+
+ print('[INFO] Start reading cropped images from test set.')
+ field_number = 0
+        # loop over test addresses
+ for img_name in self.test_csv_file.filename:
+ progress(field_number, nb_test_samples)
+
+ img_name = self.cropped_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_cropped_test"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_test_samples - 1:
+ break
+
+
+ print('[INFO] Start reading entire images from test set.')
+ field_number = 0
+        # loop over test addresses
+ for img_name in self.test_csv_file.filename:
+ progress(field_number, nb_test_samples)
+
+ img_name = self.entire_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_entire_test"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_test_samples - 1:
+ break
+
+
+ hdf5_file.close()
+
+
+
+
+ def create_hdf5_VAD_regression(self, dataset, input_size):
+ """ Saves a large number of images and their respective annotations (from EMOTIC Dataset) in a single HDF5 file.
+ # Reference
+ - http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html
+ - http://sunai.uoc.edu/emotic/
+
+ # Arguments
+ dataset: name of the dataset that the HDF5 file will be created for.
+ input_size: the default input size for the model (ref https://keras.io/applications/).
+ All models have input size of 224x224
+ except Xception,InceptionV3 and InceptionResNetV2 which have input size of 299x299.
+ """
+
+ if not (dataset in {'EMOTIC'}):
+ raise ValueError('The `dataset` argument can be set to `EMOTIC` only. '
+ 'More datasets will be added in future releases.')
+
+
+ if dataset == 'EMOTIC':
+ nb_train_samples = 23706
+ nb_val_samples = 3332
+ nb_test_samples = 7280
+
+ train_shape = (nb_train_samples, input_size, input_size, 3)
+ val_shape = (nb_val_samples, input_size, input_size, 3)
+ test_shape = (nb_test_samples, input_size, input_size, 3)
+
+ print('[INFO] Open the hdf5 file `'+ str(self.hdf5_file) +'` and start creating arrays.')
+
+ # open a hdf5 file and create arrays
+ hdf5_file = h5py.File(self.hdf5_file, mode='w')
+ hdf5_file.create_dataset("x_image_train", train_shape, np.uint8)
+ hdf5_file.create_dataset("x_image_val", val_shape, np.uint8)
+ hdf5_file.create_dataset("x_image_test", test_shape, np.uint8)
+
+ hdf5_file.create_dataset("x_body_train", train_shape, np.uint8)
+ hdf5_file.create_dataset("x_body_val", val_shape, np.uint8)
+ hdf5_file.create_dataset("x_body_test", test_shape, np.uint8)
+
+ # train arrays
+ hdf5_file.create_dataset("y_image_train", (nb_train_samples, 3), np.uint8)
+ hdf5_file["y_image_train"][...] = self.y_image_train
+
+ hdf5_file.create_dataset("y_body_train", (nb_train_samples, 3), np.uint8)
+ hdf5_file["y_body_train"][...] = self.y_body_train
+
+
+ # val arrays
+ hdf5_file.create_dataset("y_image_val", (nb_val_samples, 3), np.uint8)
+ hdf5_file["y_image_val"][...] = self.y_image_val
+
+ hdf5_file.create_dataset("y_body_val", (nb_val_samples, 3), np.uint8)
+ hdf5_file["y_body_val"][...] = self.y_body_val
+
+
+ # test arrays
+ hdf5_file.create_dataset("y_image_test", (nb_test_samples, 3), np.uint8)
+ hdf5_file["y_image_test"][...] = self.y_image_test
+
+ hdf5_file.create_dataset("y_body_test", (nb_test_samples, 3), np.uint8)
+ hdf5_file["y_body_test"][...] = self.y_body_test
+
+ print('[INFO] Arrays have been created.')
+
+
+ field_number = 0
+ print('[INFO] Start reading body images from train set.')
+ # loop over cropped images train addresses
+ for img_name in self.train_csv_file.filename:
+ progress(field_number, nb_train_samples)
+
+ img_name = self.cropped_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_body_train"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_train_samples - 1:
+ break
+
+
+ print('[INFO] Start reading entire images from train set.')
+ field_number = 0
+ # loop over entire images train addresses
+ for img_name in self.train_csv_file.filename:
+ progress(field_number, nb_train_samples)
+
+ img_name = self.entire_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_image_train"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_train_samples - 1:
+ break
+
+
+
+ print('[INFO] Start reading body images from validation set.')
+ field_number = 0
+ # loop over val addresses
+ for img_name in self.val_csv_file.filename:
+ progress(field_number, nb_val_samples)
+
+ img_name = self.cropped_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_body_val"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_val_samples - 1:
+ break
+
+
+ print('[INFO] Start reading entire images from validation set.')
+ field_number = 0
+ # loop over val addresses
+ for img_name in self.val_csv_file.filename:
+ progress(field_number, nb_val_samples)
+
+ img_name = self.entire_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_image_val"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_val_samples - 1:
+ break
+
+
+ print('[INFO] Start reading body images from test set.')
+ field_number = 0
+        # loop over test addresses
+ for img_name in self.test_csv_file.filename:
+ progress(field_number, nb_test_samples)
+
+ img_name = self.cropped_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_body_test"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_test_samples - 1:
+ break
+
+
+ print('[INFO] Start reading entire images from test set.')
+ field_number = 0
+        # loop over test addresses
+ for img_name in self.test_csv_file.filename:
+ progress(field_number, nb_test_samples)
+
+ img_name = self.entire_imgs_dir + img_name
+
+ img = cv2.imread(img_name)
+ img = cv2.resize(img, (input_size, input_size), interpolation=cv2.INTER_CUBIC)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+            # write the resized RGB image into the HDF5 dataset
+ hdf5_file["x_image_test"][field_number, ...] = img[None]
+
+ field_number += 1
+ if field_number > nb_test_samples - 1:
+ break
+
+
+ hdf5_file.close()
\ No newline at end of file
diff --git a/preprocessing/emotic/hdf5_creation_example.py b/preprocessing/emotic/hdf5_creation_example.py
new file mode 100644
index 0000000..803d471
--- /dev/null
+++ b/preprocessing/emotic/hdf5_creation_example.py
@@ -0,0 +1,24 @@
+from hdf5_controller import Controller
+
+
+
+hdf5_file_name = 'EMOTIC-VAD-Classification-rescale.hdf5'
+
+train_csv_file_path ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/CSV/train.csv'
+val_csv_file_path ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/CSV/val.csv'
+test_csv_file_path ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/CSV/test.csv'
+
+cropped_imgs_dir ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/cropped_imgs/'
+entire_imgs_dir = '/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/entire_multiple_imgs/'
+main_numpy_dir ='/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC_resources/numpy_matrices/'
+
+controller = Controller(hdf5_file=hdf5_file_name,
+ train_csv_file_path=train_csv_file_path,
+ val_csv_file_path=val_csv_file_path,
+ test_csv_file_path=test_csv_file_path,
+ cropped_imgs_dir=cropped_imgs_dir,
+ entire_imgs_dir=entire_imgs_dir,
+ main_numpy_dir=main_numpy_dir)
+
+
+create_hdf5 = controller.create_hdf5_VAD_regression(dataset='EMOTIC', input_size=224)
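+
+# Optional sanity check (illustrative): re-open the file and inspect a couple of dataset shapes.
+# The dataset names are the ones created by `create_hdf5_VAD_regression`.
+#
+#   import h5py
+#   with h5py.File(hdf5_file_name, mode='r') as f:
+#       print(f['x_image_train'].shape, f['y_image_train'].shape)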
diff --git a/preprocessing/emotic/load_data_from_numpy.py b/preprocessing/emotic/load_data_from_numpy.py
new file mode 100644
index 0000000..1de5367
--- /dev/null
+++ b/preprocessing/emotic/load_data_from_numpy.py
@@ -0,0 +1,164 @@
+"""Python utilities required to load data (image & their annotations) stored in numpy arrays.
+ Functions `load_numpy_arrays_single_output` & `load_numpy_arrays_emotions_age_only` are deprecated.
+ Use either the main function `load_data_from_numpy` to load all the applicable arrays
+ or the supporting `load_annotations_only_from_numpy` instead.
+"""
+
+from __future__ import print_function
+import numpy as np
+
+def load_data_from_numpy(main_numpy_dir,
+ verbose = 1):
+
+ print ('[INFO] Loading data from numpy arrays...')
+
+ x_entire_train = np.load(main_numpy_dir + 'X_train/x_entire_train.npy')
+ x_cropped_train = np.load(main_numpy_dir + 'X_train/x_cropped_train.npy')
+
+ valence_entire_train = np.load(main_numpy_dir + 'Y_train/valence_train.npy')
+ valence_cropped_train = np.load(main_numpy_dir + 'Y_train/valence_train.npy')
+
+ arousal_entire_train = np.load(main_numpy_dir + 'Y_train/arousal_train.npy')
+ arousal_cropped_train = np.load(main_numpy_dir + 'Y_train/arousal_train.npy')
+
+ dominance_entire_train = np.load(main_numpy_dir + 'Y_train/dominance_train.npy')
+ dominance_cropped_train = np.load(main_numpy_dir + 'Y_train/dominance_train.npy')
+
+
+
+ x_entire_val = np.load(main_numpy_dir + 'X_train/x_entire_val.npy')
+ x_cropped_val = np.load(main_numpy_dir + 'X_train/x_cropped_val.npy')
+
+
+
+ valence_entire_val = np.load(main_numpy_dir + 'Y_train/valence_val.npy')
+ valence_cropped_val = np.load(main_numpy_dir + 'Y_train/valence_val.npy')
+
+ arousal_entire_val = np.load(main_numpy_dir + 'Y_train/arousal_val.npy')
+ arousal_cropped_val = np.load(main_numpy_dir + 'Y_train/arousal_val.npy')
+
+ dominance_entire_val = np.load(main_numpy_dir + 'Y_train/dominance_val.npy')
+ dominance_cropped_val = np.load(main_numpy_dir + 'Y_train/dominance_val.npy')
+
+
+
+ x_entire_test = np.load(main_numpy_dir + 'X_train/x_entire_test.npy')
+ x_cropped_test = np.load(main_numpy_dir + 'X_train/x_cropped_test.npy')
+
+ valence_entire_test = np.load(main_numpy_dir + 'Y_train/valence_test.npy')
+ valence_cropped_test = np.load(main_numpy_dir + 'Y_train/valence_test.npy')
+
+ arousal_entire_test = np.load(main_numpy_dir + 'Y_train/arousal_test.npy')
+ arousal_cropped_test = np.load(main_numpy_dir + 'Y_train/arousal_test.npy')
+
+ dominance_entire_test = np.load(main_numpy_dir + 'Y_train/dominance_test.npy')
+ dominance_cropped_test = np.load(main_numpy_dir + 'Y_train/dominance_test.npy')
+
+
+
+ print('[INFO] Data have been successfully loaded')
+ print('---------------------------------------------------------------------------------------------------')
+ if verbose == 1:
+ print('x_entire_train shape:', x_entire_train.shape)
+ print('x_cropped_train shape:', x_cropped_train.shape)
+ print('valence_entire_train shape:', valence_entire_train.shape)
+ print('valence_cropped_train shape:', valence_cropped_train.shape)
+ print('arousal_entire_train shape:', arousal_entire_train.shape)
+ print('arousal_cropped_train shape:', arousal_cropped_train.shape)
+ print('dominance_entire_train shape:', dominance_entire_train.shape)
+ print('dominance_cropped_train shape:', dominance_cropped_train.shape)
+
+ print ('---------------------------------------------------------------------------------------------------')
+
+ print('x_entire_val shape:', x_entire_val.shape)
+ print('x_cropped_val shape:', x_cropped_val.shape)
+ print('valence_entire_val shape:', valence_entire_val.shape)
+ print('valence_cropped_val shape:', valence_cropped_val.shape)
+ print('arousal_entire_val shape:', arousal_entire_val.shape)
+ print('arousal_cropped_val shape:', arousal_cropped_val.shape)
+ print('dominance_entire_val shape:', dominance_entire_val.shape)
+ print('dominance_cropped_val shape:', dominance_cropped_val.shape)
+
+ print ('---------------------------------------------------------------------------------------------------')
+
+ print('x_entire_test shape:', x_entire_test.shape)
+ print('x_cropped_test shape:', x_cropped_test.shape)
+ print('valence_entire_test shape:', valence_entire_test.shape)
+ print('valence_cropped_test shape:', valence_cropped_test.shape)
+ print('arousal_entire_test shape:', arousal_entire_test.shape)
+ print('arousal_cropped_test shape:', arousal_cropped_test.shape)
+ print('dominance_entire_test shape:', dominance_entire_test.shape)
+ print('dominance_cropped_test shape:', dominance_cropped_test.shape)
+
+ print ('---------------------------------------------------------------------------------------------------')
+
+ return (x_entire_train, x_cropped_train,valence_entire_train,valence_cropped_train,arousal_entire_train,arousal_cropped_train,dominance_entire_train,dominance_cropped_train), \
+ (x_entire_val, x_cropped_val, valence_entire_val,valence_cropped_val,arousal_entire_val,arousal_cropped_val,dominance_entire_val,dominance_cropped_val), \
+ (x_entire_test, x_cropped_test, valence_entire_test,valence_cropped_test,arousal_entire_test,arousal_cropped_test,dominance_entire_test,dominance_cropped_test)
+
+
+
+def load_data_from_numpy_single_output(main_numpy_dir,
+ verbose=1):
+
+ print ('[INFO] Loading data from numpy arrays...')
+
+ x_image_train = np.load(main_numpy_dir + 'X_train/x_image_train.npy')
+ x_body_train = np.load(main_numpy_dir + 'X_train/x_body_train.npy')
+
+ y_image_train = np.load(main_numpy_dir + 'Y_train/y_train.npy')
+ y_body_train = np.load(main_numpy_dir + 'Y_train/y_train.npy')
+
+
+ x_image_val = np.load(main_numpy_dir + 'X_train/x_image_val.npy')
+ x_body_val = np.load(main_numpy_dir + 'X_train/x_body_val.npy')
+
+ y_image_val = np.load(main_numpy_dir + 'Y_train/y_val.npy')
+ y_body_val = np.load(main_numpy_dir + 'Y_train/y_val.npy')
+
+
+ x_image_test = np.load(main_numpy_dir + 'X_train/x_image_test.npy')
+ x_body_test = np.load(main_numpy_dir + 'X_train/x_body_test.npy')
+
+ y_image_test = np.load(main_numpy_dir + 'Y_train/y_test.npy')
+ y_body_test = np.load(main_numpy_dir + 'Y_train/y_test.npy')
+
+
+
+ print('[INFO] Data have been successfully loaded')
+ print('---------------------------------------------------------------------------------------------------')
+ if verbose == 1:
+ print('x_image_train shape:', x_image_train.shape)
+ print('x_body_train shape:', x_body_train.shape)
+ print('y_image_train shape:', y_image_train.shape)
+ print('y_body_train shape:', y_body_train.shape)
+
+
+ print ('---------------------------------------------------------------------------------------------------')
+
+ print('x_image_val shape:', x_image_val.shape)
+ print('x_body_val shape:', x_body_val.shape)
+ print('y_image_val shape:', y_image_val.shape)
+ print('y_body_val shape:', y_body_val.shape)
+
+
+ print ('---------------------------------------------------------------------------------------------------')
+
+ print('x_image_test shape:', x_image_test.shape)
+ print('x_body_test shape:', x_body_test.shape)
+ print('y_image_test shape:', y_image_test.shape)
+ print('y_body_test shape:', y_body_test.shape)
+
+
+ print ('---------------------------------------------------------------------------------------------------')
+
+ return (x_image_train, x_body_train,y_image_train,y_body_train), \
+ (x_image_val, x_body_val, y_image_val,y_body_val), \
+ (x_image_test, x_body_test,y_image_test,y_body_test)
+
+
+
+
+
+
+
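+# Illustrative usage sketch (the directory is hypothetical; it must contain the X_train/ and
+# Y_train/ numpy arrays referenced above):
+#
+#   (x_image_train, x_body_train, y_image_train, y_body_train), \
+#   (x_image_val, x_body_val, y_image_val, y_body_val), \
+#   (x_image_test, x_body_test, y_image_test, y_body_test) = \
+#       load_data_from_numpy_single_output(main_numpy_dir='/path/to/numpy_matrices/', verbose=1)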
diff --git a/preprocessing/emotic/save_raw_imgs.py b/preprocessing/emotic/save_raw_imgs.py
new file mode 100644
index 0000000..71e92a3
--- /dev/null
+++ b/preprocessing/emotic/save_raw_imgs.py
@@ -0,0 +1,92 @@
+"""Python utility which reads image names (locations on disk) from a csv file,
+ loads them using the basic set of tools for image data provided by Keras (keras/preprocessing/image.py)
+ and saves them in a numpy array.
+"""
+
+import numpy as np
+import pandas
+from utils.generic_utils import progress
+from keras.preprocessing import image
+from keras.applications.vgg16 import preprocess_input
+
+
+# reference https://stackoverflow.com/questions/12984426/python-pil-ioerror-image-file-truncated-with-big-images
+from PIL import ImageFile
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+def save_img_to_numpy(base_img_dir,
+ base_csv_dir,
+ input_size,
+ mode='train',
+ to_file = 'numpy_annotations/x_train'
+ ):
+ """ Saves images loaded from a CSV to numpy array.
+
+ # Arguments
+ base_img_dir: the directory where the raw images are stored.
+ In our setup, we:
+ - created train/ val/ and test/ subfolders inside EMOTIC_database/
+
+ base_csv_dir: the directory where the CSV files are stored.
+ input_size: the default input size for the model (ref https://keras.io/applications/).
+ All models have input size of 224x224 except Xception,InceptionV3 and InceptionResNetV2 which have input size of 299x299.
+ mode: one of `train` (train set), `val` (validation set)
+ or `test` (test set).
+ to_file: the name or path of the numpy array where the images will be saved.
+ """
+
+
+ # Load CSV File With Pandas
+ csv_name = base_csv_dir + mode + '.csv'
+ csv_file = pandas.read_csv(csv_name)
+
+ if mode == 'train':
+ nb_samples = 23706
+ elif mode == 'val':
+ nb_samples = 3332
+ elif mode == 'test':
+ nb_samples = 7280
+
+ field_number = 0
+
+ # pre-allocating the data array, and then loading the data directly into it
+ # ref: https://hjweide.github.io/efficient-image-loading
+ data = np.empty((nb_samples, input_size, input_size, 3), dtype=np.uint8)
+
+ for img_name in csv_file.filename:
+ progress(field_number, nb_samples)
+
+ img_name = base_img_dir + img_name
+ img = image.load_img(img_name, target_size=(input_size, input_size))
+
+ x = image.img_to_array(img) # this is a Numpy array with shape (input_size, input_size, 3)
+ x = np.expand_dims(x, axis=0) # this is a Numpy array with shape (1, input_size, input_size, 3)
+ x = preprocess_input(x)
+
+ data[field_number, ...] = x
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ np.save(to_file, data)
+
+ return data
+
+
+if __name__ == '__main__':
+
+ # x_train= save_img_to_numpy(base_img_dir ='/home/sandbox/Desktop/EMOTIC_database/',
+ # base_csv_dir = '/home/sandbox/Desktop/Keras-EMOTIC/dataset/',
+ # input_size = 299,
+ # mode='test',
+ # to_file = 'x_test')
+
+
+
+ x_train = save_img_to_numpy(base_img_dir='/home/sandbox/Desktop/EMOTIC_resources/v0.3_divisible/entire_multiple_imgs/',
+ base_csv_dir='/home/sandbox/Desktop/EMOTIC_resources/v0.3_divisible/',
+ input_size=224,
+ mode='val',
+ to_file='x_entire_val')
diff --git a/preprocessing/emotic/x_train_csv_to_numpy.py b/preprocessing/emotic/x_train_csv_to_numpy.py
new file mode 100644
index 0000000..46d02b4
--- /dev/null
+++ b/preprocessing/emotic/x_train_csv_to_numpy.py
@@ -0,0 +1,117 @@
+"""Python utility which reads image names (locations on disk) from a csv file,
+ loads them using the basic set of tools for image data provided by Keras (keras/preprocessing/image.py)
+ and saves them in a numpy array.
+"""
+
+import numpy as np
+import pandas
+from utils.generic_utils import progress, imagenet_preprocess_input, places_preprocess_input
+from keras.preprocessing import image
+from keras.applications.vgg16 import preprocess_input
+
+
+# reference https://stackoverflow.com/questions/12984426/python-pil-ioerror-image-file-truncated-with-big-images
+from PIL import ImageFile
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+def save_x_train_to_numpy(base_img_dir,
+ base_csv_dir,
+ input_size,
+ input_img_mode,
+ mode='train',
+ to_file = 'numpy_annotations/x_train'
+ ):
+ """ Saves images loaded from a CSV to numpy array.
+
+ # Arguments
+ base_img_dir: the directory where the raw images are stored.
+ In our setup, we:
+ - created train/ val/ and test/ subfolders inside EMOTIC_database/
+
+ base_csv_dir: the directory where the CSV files are stored.
+ input_size: the default input size for the model (ref https://keras.io/applications/).
+ All models have input size of 224x224 except Xception,InceptionV3 and InceptionResNetV2 which have input size of 299x299.
+        input_img_mode: one of `body` (cropped images) or `image` (entire images).
+ mode: one of `train` (train set), `val` (validation set)
+ or `test` (test set).
+ to_file: the name or path of the numpy array where the images will be saved.
+ """
+
+
+ # Load CSV File With Pandas
+ csv_name = base_csv_dir + mode + '.csv'
+ csv_file = pandas.read_csv(csv_name)
+
+ if mode == 'train':
+ nb_samples = 23706
+ elif mode == 'val':
+ nb_samples = 3332
+ elif mode == 'test':
+ nb_samples = 7280
+
+ field_number = 0
+
+ # pre-allocating the data array, and then loading the data directly into it
+ # ref: https://hjweide.github.io/efficient-image-loading
+ data = np.empty((nb_samples, input_size, input_size, 3), dtype=np.uint8)
+
+ for img_name in csv_file.filename:
+ progress(field_number, nb_samples)
+
+ img_name = base_img_dir + img_name
+ img = image.load_img(img_name, target_size=(input_size, input_size)) # load an image from file
+ x = image.img_to_array(img) # this is a Numpy array with shape (input_size, input_size, 3)
+ x = np.expand_dims(x, axis=0) # this is a Numpy array with shape (1, input_size, input_size, 3)
+
+ # prepare the image for either ImageNet-based or Places-based models
+ if input_img_mode == 'body':
+ x = imagenet_preprocess_input(x)
+ elif input_img_mode == 'image':
+ x = places_preprocess_input(x)
+
+
+ # x = (x / 255.).astype(np.float32)
+
+ # scaling the RGB values to a 0-1.0 range.
+ # reference https://www.linkedin.com/pulse/keras-image-preprocessing-scaling-pixels-training-adwin-jahn/
+ # x /= 255
+ data[field_number, ...] = x
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ np.save(to_file, data)
+
+ return data
+
+
+if __name__ == '__main__':
+
+ # one of `train` (train set), `val` (validation set) or `test` (test set).
+ mode = 'test'
+
+ # one of `body` or `image`
+ input_img_mode = 'image'
+
+
+
+
+
+ if input_img_mode == 'body':
+ base_img_dir = '/home/sandbox/Desktop/EMOTIC_resources/raw_refined_images/cropped_imgs/'
+ to_file = 'x_body_'+mode
+ elif input_img_mode == 'image':
+ base_img_dir = '/home/sandbox/Desktop/EMOTIC_resources/raw_refined_images/entire_multiple_imgs/'
+ to_file = 'x_image_' + mode
+
+
+
+ x_train = save_x_train_to_numpy(base_img_dir=base_img_dir,
+ base_csv_dir='/home/sandbox/Desktop/EMOTIC_resources/VAD-classification/CSV/',
+ input_size=224,
+ input_img_mode = input_img_mode,
+ mode=mode,
+ to_file=to_file)
diff --git a/preprocessing/emotic/y_train_csv_to_numpy.py b/preprocessing/emotic/y_train_csv_to_numpy.py
new file mode 100644
index 0000000..eb6320c
--- /dev/null
+++ b/preprocessing/emotic/y_train_csv_to_numpy.py
@@ -0,0 +1,209 @@
+"""Python utility which reads entries (different annotations) from a csv file,
+ converts them to integer/list of integers(for multilabel-multiclass settings) and saves them in a numpy array.
+"""
+
+import pandas
+import numpy as np
+import math
+from keras.utils import np_utils
+
+
+def multi_output(csv_path,
+ nb_samples,
+ entry_type,
+ to_file):
+ """Reads entries (different annotations) from a csv file,
+    converts them to an integer or a list of integers (for multi-label, multi-class settings) and saves them in a numpy array.
+
+ # Arguments
+        csv_path: Full path of the csv file.
+        nb_samples: Number of entries in the csv file to iterate.
+        entry_type: The header name of the column to process. One of `valence`, `arousal` or `dominance` (the only columns this function handles).
+ to_file: File name of the numpy array that will hold the converted entries.
+ """
+
+ csv_file = pandas.read_csv(csv_path)
+
+ # counter to iterate through all csv entries
+ field_number = 0
+
+ final_list = []
+
+ if entry_type == 'valence':
+ for entry in csv_file.valence:
+
+ # convert the retrieved csv entry (whose type is always str) into integer/list of integers
+ # int_list = map(int, entry.split(','))
+ #
+ # int_list = np_utils.to_categorical(int_list)
+
+ # append the converted integer/list of integers to the final list
+            # subtract 1 to map the paper's 1-10 scale onto the 0-9 range used here
+ final_list.append(entry-1)
+
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ elif entry_type == 'arousal':
+ for entry in csv_file.arousal:
+
+ # convert the retrieved csv entry (whose type is always str) into integer/list of integers
+ # int_list = map(int, entry.split(','))
+ #
+ # int_list = np_utils.to_categorical(int_list)
+
+ # append the converted integer/list of integers to the final list
+            # subtract 1 to map the paper's 1-10 scale onto the 0-9 range used here
+ final_list.append(entry - 1)
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+
+ elif entry_type == 'dominance':
+ for entry in csv_file.dominance:
+
+ # convert the retrieved csv entry (whose type is always str) into integer/list of integers
+ # int_list = map(int, entry.split(','))
+ #
+ # int_list = np_utils.to_categorical(int_list)
+
+ # append the converted integer/list of integers to the final list
+            # subtract 1 to map the paper's 1-10 scale onto the 0-9 range used here
+ final_list.append(entry - 1)
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+ # expand dimensions from (xxxx,) to(xxxx, 1)
+ # final_list = np.expand_dims(final_list, axis=1)
+ np.save(to_file, final_list)
+
+
+def three_neurons_single_output(csv_path,
+ nb_samples,
+ entry_type,
+ to_file):
+ """Reads entries (different annotations) from a csv file,
+    converts them to an integer or a list of integers (for multi-label, multi-class settings) and saves them in a numpy array.
+
+ # Arguments
+        csv_path: Full path of the csv file.
+        nb_samples: Number of entries in the csv file to iterate.
+        entry_type: The header name of the column to process; expected to be `VAD` (the only column this function handles).
+ to_file: File name of the numpy array that will hold the converted entries.
+ """
+
+ csv_file = pandas.read_csv(csv_path)
+
+ # counter to iterate through all csv entries
+ field_number = 0
+
+ final_list = []
+
+ if entry_type == 'VAD':
+ for entry in csv_file.VAD:
+
+ # convert the retrieved csv entry (whose type is always str) into integer/list of integers
+ int_list = map(int, entry.split(','))
+
+ # int_list = np_utils.to_categorical(int_list)
+
+ # append the converted integer/list of integers to the final list
+ final_list.append(int_list)
+
+
+ field_number += 1
+ if field_number > nb_samples - 1:
+ break
+
+
+
+ # expand dimensions from (xxxx,) to(xxxx, 1)
+ # final_list = np.expand_dims(final_list, axis=1)
+ np.save(to_file, final_list)
+
+
+
+
+if __name__ == '__main__':
+ #
+ # # for multi-output model
+ #
+ # mode = 'val'
+ # entry_type = 'valence'
+ #
+ #
+ #
+ # to_file = entry_type + '_'+mode
+ # csv_path= '/home/sandbox/Desktop/EMOTIC_resources/VAD-classification/labels/'+ mode +'.csv'
+ #
+ #
+ # if mode == 'train':
+ # nb_samples = 23706
+ # elif mode == 'val':
+ # nb_samples = 3332
+ # elif mode == 'test':
+ # nb_samples = 7280
+ #
+ #
+ #
+ #
+ # multi_output(csv_path =csv_path,
+ # nb_samples = nb_samples,
+ # entry_type = entry_type,
+ # to_file= to_file)
+ #
+ #
+ # x = np.load(str(to_file)+'.npy')
+ #
+ # print x.shape
+ #
+ # # print x[2]
+ #
+ # from keras.utils import np_utils
+ #
+ # x = np_utils.to_categorical(x)
+ #
+ #
+ # np.save(to_file, x)
+ #
+ # x = np.load(str(to_file) + '.npy')
+ #
+ # print x.shape
+
+
+
+ # for three neurons single output model
+
+ mode = 'test'
+ entry_type = 'VAD'
+
+ to_file = 'y_' + mode +'.npy'
+
+ csv_path= '/home/sandbox/Desktop/EMOTIC_resources/VAD-classification/CSV/'+ mode +'.csv'
+
+
+ if mode == 'train':
+ nb_samples = 23706
+ elif mode == 'val':
+ nb_samples = 3332
+ elif mode == 'test':
+ nb_samples = 7280
+
+
+ three_neurons_single_output(csv_path=csv_path,
+ nb_samples=nb_samples,
+ entry_type=entry_type,
+ to_file=to_file)
+
+
+ x = np.load(str(to_file))
+
+ print x.shape
+
+ print x[0]
\ No newline at end of file
diff --git a/train_emotic_unified.py b/train_emotic_unified.py
new file mode 100644
index 0000000..83006dc
--- /dev/null
+++ b/train_emotic_unified.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+""" Training script for continuous emotion recognition in VAD space.
+
+# Reference
+- [Emotion Recognition in Context](http://sunai.uoc.edu/emotic/pdf/EMOTIC_cvpr2017.pdf)
+- https://stackoverflow.com/questions/43452441/keras-all-layer-names-should-be-unique
+
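+# Example (illustrative values; backbones can be any of `VGG16`, `VGG19`, `ResNet50`, `VGG16_Places365`)
+- python train_emotic_unified.py --body_backbone_CNN VGG16 --image_backbone_CNN VGG16_Places365 --modelCheckpoint_quantity val_loss --earlyStopping_quantity val_loss --nb_of_epochs 100
+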
+"""
+
+from __future__ import print_function
+import argparse
+from engine.human_centric_branch.emotic_vad_model import EMOTIC_VAD
+from applications.emotic_utils import _obtain_weights_CSVLogger_filenames as regression_obtain_weights_CSVLogger_filenames
+
+
+
+
+def get_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--body_backbone_CNN", type = str,help = 'One of `VGG16`, `VGG19`, `ResNet50`, `VGG16_Places365`')
+ parser.add_argument("--image_backbone_CNN", type = str,help = 'One of `VGG16`, `VGG19`, `ResNet50`, `VGG16_Places365`')
+ parser.add_argument("--modelCheckpoint_quantity", type=str, help='Quantity to monitor when `ModelCheckpoint` is enabled')
+ parser.add_argument("--earlyStopping_quantity", type=str, help='Quantity to monitor when `EarlyStopping` is enabled')
+ parser.add_argument("--nb_of_epochs", type=int, help="Total number of iterations on the data")
+
+ args = parser.parse_args()
+ return args
+
+
+
+args = get_args()
+
+hdf5_file = '/home/gkallia/git/emotic-VAD-classification/dataset/EMOTIC-VAD-regression.hdf5'
+
+
+modelCheckpoint_quantity = args.modelCheckpoint_quantity
+earlyStopping_quantity = args.earlyStopping_quantity
+
+weights_filename, CSVLogger_filename = regression_obtain_weights_CSVLogger_filenames(body_backbone_CNN=args.body_backbone_CNN,
+ image_backbone_CNN=args.image_backbone_CNN)
+
+emotic_model = EMOTIC_VAD(hdf5_file=hdf5_file,
+ body_backbone_CNN=args.body_backbone_CNN,
+ image_backbone_CNN=args.image_backbone_CNN,
+ nb_of_epochs=args.nb_of_epochs,
+ weights_to_file=weights_filename,
+ modelCheckpoint_quantity=modelCheckpoint_quantity,
+ earlyStopping_quantity=earlyStopping_quantity,
+ CSVLogger_filename=CSVLogger_filename)
+
+
+
+
+emotic_model.train()
\ No newline at end of file
diff --git a/train_hra_2class_unified.py b/train_hra_2class_unified.py
new file mode 100644
index 0000000..af85ed8
--- /dev/null
+++ b/train_hra_2class_unified.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+""" Top-level (abstract) script for training (fine-tuning) various CNNs on the HRA dataset with 2 classes.
+
+ Example
+ --------
+ >>> python train_hra_2class_unified.py --violation_class cl --pre_trained_model vgg16 --nb_of_conv_layers_to_fine_tune 1 --nb_of_epochs 50
+
+"""
+
+from __future__ import print_function
+import argparse
+import os
+
+from applications.hra_utils import _obtain_weights_CSVLogger_filenames,_obtain_train_mode, _obtain_first_phase_trained_weights
+from wrappers.hra_transfer_cnn_manager import HRA_Transfer_CNN_Manager
+
+def get_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--violation_class", type = str, help = " One of `cl` ([i]'child_labour' & [ii]'no violation') "
+ " or `dp` ([i]'displaced_populations' & [ii]'no violation')")
+ parser.add_argument("--pre_trained_model", type = str,help = 'One of `vgg16`, `vgg19`, `resnet50` or `vgg16_places365`')
+ parser.add_argument("--nb_of_conv_layers_to_fine_tune", type = int, default=None, help = "Number of conv. layers to fine-tune")
+ parser.add_argument("--nb_of_epochs", type = int, help = "Total number of iterations on the data")
+
+ args = parser.parse_args()
+ return args
+
+
+
+# --------- Configure and pass a tensorflow session to Keras to restrict GPU memory fraction --------- #
+import tensorflow as tf
+from keras.backend.tensorflow_backend import set_session
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = 0.50
+set_session(tf.Session(config=config))
+
+
+args = get_args()
+
+# feature extraction case
+if args.nb_of_conv_layers_to_fine_tune is None:
+ first_phase_trained_weights = None
+
+# fine-tune case
+elif args.nb_of_conv_layers_to_fine_tune in {1, 2, 3}:
+ first_phase_trained_weights = _obtain_first_phase_trained_weights(violation_class = args.violation_class, model_name= args.pre_trained_model)
+ # check if the first_phase_trained_weights does exist
+ if os.path.isfile(first_phase_trained_weights) is False:
+ raise IOError("No such weights file: `" + first_phase_trained_weights + "`. ")
+
+train_mode = _obtain_train_mode(nb_of_conv_layers_to_fine_tune=args.nb_of_conv_layers_to_fine_tune)
+
+weights_filename, CSVLogger_filename = _obtain_weights_CSVLogger_filenames(violation_class=args.violation_class,
+ train_mode=train_mode,
+ model_name=args.pre_trained_model,
+ nb_of_conv_layers_to_fine_tune=args.nb_of_conv_layers_to_fine_tune
+ )
+
+modelCheckpoint_quantity = 'val_loss'
+earlyStopping_quantity = 'val_loss'
+
+
+
+transfer_cnn_manager = HRA_Transfer_CNN_Manager(violation_class = args.violation_class,
+ train_mode=train_mode,
+ pre_trained_model = args.pre_trained_model,
+ nb_of_conv_layers_to_fine_tune = args.nb_of_conv_layers_to_fine_tune,
+ weights_to_file = weights_filename,
+ first_phase_trained_weights = first_phase_trained_weights,
+ nb_of_epochs = args.nb_of_epochs,
+ modelCheckpoint_quantity = modelCheckpoint_quantity,
+ earlyStopping_quantity = earlyStopping_quantity,
+ CSVLogger_filename = CSVLogger_filename,
+ )
+
+
+transfer_cnn_manager.train()
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/generic_utils.py b/utils/generic_utils.py
new file mode 100644
index 0000000..f29d9e3
--- /dev/null
+++ b/utils/generic_utils.py
@@ -0,0 +1,342 @@
+# -*- coding: utf-8 -*-
+"""Python utilities required by `AbuseNet`. """
+
+from PIL import Image
+from keras.callbacks import LearningRateScheduler
+from keras import backend as K
+import numpy as np
+import matplotlib.pyplot as plt
+import itertools
+import sys
+import tensorflow as tf
+import keras.backend.tensorflow_backend as tfb
+
+
+def crop(image_path, coords, saved_location):
+ """ Crops an image.
+ # Reference
+ - https://www.blog.pythonlibrary.org/2017/10/03/how-to-crop-a-photo-with-python/
+
+ # Arguments
+ image_path: The path to the image to edit.
+ coords: A tuple of x/y coordinates (x1, y1, x2, y2).
+ saved_location: Path to save the cropped image.
+ """
+ image_obj = Image.open(image_path)
+ cropped_image = image_obj.crop(coords)
+ cropped_image.save(saved_location)
+
+def crop_no_save(image_path, coords):
+ """ Crops an image.
+ # Reference
+ - https://www.blog.pythonlibrary.org/2017/10/03/how-to-crop-a-photo-with-python/
+
+ # Arguments
+ image_path: The path to the image to edit.
+ coords: A tuple of x/y coordinates (x1, y1, x2, y2).
+
+    # Returns
+        The cropped PIL image object.
+ """
+ image_obj = Image.open(image_path)
+ cropped_image = image_obj.crop(coords)
+
+ return cropped_image
+
+
+
+def progress(count, total):
+ """ Command line progress bar.
+ # Reference
+ - https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
+
+ """
+ bar_len = 60
+ filled_len = int(round(bar_len * count / float(total)))
+
+ # percents = round(100.0 * count / float(total), 1)
+
+ tmp = str(count)+'/'+str(total)
+ bar = '=' * filled_len + '.' * (bar_len - filled_len)
+
+ print ('%s [%s]\r' % (tmp, bar))
+ # sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
+ # sys.stdout.flush()
+
+
+def hms_string(sec_elapsed):
+ """ Formats the nb of seconds returned for a process.
+ """
+ h = int(sec_elapsed / (60 * 60))
+ m = int((sec_elapsed % (60 * 60)) / 60)
+ s = sec_elapsed % 60.
+ return "{}:{:>02}:{:>05.2f}".format(h, m, s)
+
+
+
+def preprocess_input(x, v2=True):
+ x = x.astype('float32')
+ x = x / 255.0
+ if v2:
+ x = x - 0.5
+ x = x * 2.0
+ return x
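+
+# Worked example for `preprocess_input` (illustrative): with v2=True a pixel value of 255
+# maps to (255/255 - 0.5) * 2 = 1.0, 0 maps to -1.0 and 127.5 maps to 0.0,
+# i.e. inputs are rescaled to the [-1, 1] range.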
+
+
+
+def step_decay_schedule(initial_lr=1e-3, decay_factor=0.75, step_size=10):
+ '''Wrapper function to create a LearningRateScheduler with step decay schedule.
+ # Reference
+ - https://gist.github.com/jeremyjordan/86398d7c05c02396c24661baa4c88165
+ '''
+ def schedule(epoch):
+ return initial_lr * (decay_factor ** np.floor(epoch / step_size))
+
+ return LearningRateScheduler(schedule)
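+
+# Worked example for `step_decay_schedule` (illustrative, using the default arguments):
+# with initial_lr=1e-3, decay_factor=0.75 and step_size=10, epochs 0-9 train with lr=1e-3,
+# epochs 10-19 with 1e-3 * 0.75 = 7.5e-4, epochs 20-29 with 1e-3 * 0.75**2 ~= 5.6e-4, and so on.
+# The returned LearningRateScheduler can be appended to a model's callbacks list
+# (hypothetical model and data):
+#
+#   lr_sched = step_decay_schedule(initial_lr=1e-4, decay_factor=0.75, step_size=2)
+#   model.fit(X_train, Y_train, callbacks=[lr_sched])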
+
+
+
+def plot_confusion_matrix(cm, classes,
+ normalize=False,
+ title='Confusion matrix',
+ cmap=plt.cm.Blues):
+ """
+ This function prints and plots the confusion matrix.
+ Normalization can be applied by setting `normalize=True`.
+ """
+ if normalize:
+ cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+ print("Normalized confusion matrix")
+ else:
+ print('Confusion matrix, without normalization')
+
+ print(cm)
+
+ plt.imshow(cm, interpolation='nearest', cmap=cmap)
+ plt.title(title)
+ plt.colorbar()
+ tick_marks = np.arange(len(classes))
+ plt.xticks(tick_marks, classes, rotation=45)
+ plt.yticks(tick_marks, classes)
+
+ fmt = '.2f' if normalize else 'd'
+ thresh = cm.max() / 2.
+ for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+ plt.text(j, i, format(cm[i, j], fmt),
+ horizontalalignment="center",
+ color="white" if cm[i, j] > thresh else "black")
+
+ plt.tight_layout()
+ plt.ylabel('True label')
+ plt.xlabel('Predicted label')
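+
+# A minimal usage sketch for `plot_confusion_matrix` (hypothetical class labels; assumes
+# sklearn's confusion_matrix and binary ground truth / predictions are available):
+#
+#   from sklearn.metrics import confusion_matrix
+#   cm = confusion_matrix(y_true, y_pred)
+#   plt.figure()
+#   plot_confusion_matrix(cm, classes=['violation', 'no violation'], normalize=True)
+#   plt.show()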
+
+
+# Print iterations progress
+# reference https://gist.github.com/aubricus/f91fb55dc6ba5557fbab06119420dd6a
+def print_progress(iteration, total, prefix='', suffix='', decimals=1, bar_length=100):
+ """
+ Call in a loop to create terminal progress bar
+ @params:
+ iteration - Required : current iteration (Int)
+ total - Required : total iterations (Int)
+ prefix - Optional : prefix string (Str)
+ suffix - Optional : suffix string (Str)
+ decimals - Optional : positive number of decimals in percent complete (Int)
+ bar_length - Optional : character length of bar (Int)
+ """
+ str_format = "{0:." + str(decimals) + "f}"
+ percents = str_format.format(100 * (iteration / float(total)))
+ filled_length = int(round(bar_length * iteration / float(total)))
+ bar = '█' * filled_length + '-' * (bar_length - filled_length)
+
+    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percents, '%', suffix))
+
+ if iteration == total:
+ sys.stdout.write('\n')
+ sys.stdout.flush()
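+
+# A minimal usage sketch for `print_progress` (illustrative loop and values):
+#
+#   for i, item in enumerate(items):
+#       # ... process item ...
+#       print_progress(i + 1, len(items), prefix='Progress:', suffix='Complete', bar_length=50)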
+
+
+def places_preprocess_input(x, dim_ordering='default'):
+ if dim_ordering == 'default':
+ dim_ordering = K.image_dim_ordering()
+ assert dim_ordering in {'tf', 'th'}
+
+ if dim_ordering == 'th':
+ x[:, 0, :, :] -= 104.006
+ x[:, 1, :, :] -= 116.669
+ x[:, 2, :, :] -= 122.679
+ # 'RGB'->'BGR'
+ x = x[:, ::-1, :, :]
+ else:
+ x[:, :, :, 0] -= 104.006
+ x[:, :, :, 1] -= 116.669
+ x[:, :, :, 2] -= 122.679
+ # 'RGB'->'BGR'
+ x = x[:, :, :, ::-1]
+ return x
+
+
+def imagenet_preprocess_input(x, dim_ordering='default'):
+ if dim_ordering == 'default':
+ dim_ordering = K.image_dim_ordering()
+ assert dim_ordering in {'tf', 'th'}
+
+ if dim_ordering == 'th':
+ x[:, 0, :, :] -= 103.939
+ x[:, 1, :, :] -= 116.779
+ x[:, 2, :, :] -= 123.68
+ # 'RGB'->'BGR'
+ x = x[:, ::-1, :, :]
+ else:
+ x[:, :, :, 0] -= 103.939
+ x[:, :, :, 1] -= 116.779
+ x[:, :, :, 2] -= 123.68
+ # 'RGB'->'BGR'
+ x = x[:, :, :, ::-1]
+ return x
+
+
+def round_number(number):
+
+ rounded_number = int(round(number))
+
+ # make sure there are no values above 10 (which is the maximum value stated in the paper)
+ if rounded_number > 10:
+
+ rounded_number = 10
+
+ return rounded_number
+
+
+
+# -------------------------------------------------------------------------------- #
+# Additional loss functions & metrics
+# -------------------------------------------------------------------------------- #
+
+def euclidean_distance_loss(y_true, y_pred):
+ """
+ Euclidean distance loss
+ https://en.wikipedia.org/wiki/Euclidean_distance
+ http://www.riptutorial.com/keras/example/32022/euclidean-distance-loss
+ :param y_true: TensorFlow/Theano tensor
+ :param y_pred: TensorFlow/Theano tensor of the same shape as y_true
+    :return: loss tensor (per-sample Euclidean distance)
+ """
+ return K.sqrt(K.sum(K.square(y_pred - y_true), axis=-1))
+
+
+def rmse(y_true, y_pred):
+ """
+ Root mean squared error
+ https://en.wikipedia.org/wiki/Euclidean_distance
+ http://www.riptutorial.com/keras/example/32022/euclidean-distance-loss
+ :param y_true: TensorFlow/Theano tensor
+ :param y_pred: TensorFlow/Theano tensor of the same shape as y_true
+    :return: loss tensor (per-sample root mean squared error)
+ """
+ return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))
+
+
+
+
+class WeightedEuclideanDistance(object):
+ """
+ A weighted version of Euclidean distance loss for keras. This lets you apply a weight to unbalanced classes.
+ # reference: implementation based on https://github.com/keras-team/keras/issues/2115#issuecomment-315571824
+
+ Usage:
+        The constructor expects a dictionary with the same structure as the `class_weight` param of `model.fit`.
+ """
+
+ def __init__(self, weights):
+ nb_cl = len(weights)
+ self.weights = np.ones((nb_cl, nb_cl))
+ for class_idx, class_weight in weights.items():
+ self.weights[0][class_idx] = class_weight
+ self.weights[class_idx][0] = class_weight
+ self.__name__ = 'weighted_euclidean_distance'
+
+ def __call__(self, y_true, y_pred):
+ return self.weighted_euclidean_distance(y_true, y_pred)
+
+ def weighted_euclidean_distance(self, y_true, y_pred):
+ nb_cl = len(self.weights)
+ final_mask = K.zeros_like(y_pred[..., 0])
+ y_pred_max = K.max(y_pred, axis=-1)
+ y_pred_max = K.expand_dims(y_pred_max, axis=-1)
+ y_pred_max_mat = K.equal(y_pred, y_pred_max)
+ for c_p, c_t in itertools.product(range(nb_cl), range(nb_cl)):
+ w = K.cast(self.weights[c_t, c_p], K.floatx())
+ y_p = K.cast(y_pred_max_mat[..., c_p], K.floatx())
+ y_t = K.cast(y_pred_max_mat[..., c_t], K.floatx())
+ final_mask += w * y_p * y_t
+ return K.sqrt(K.sum(K.square(y_pred - y_true), axis=-1)) * final_mask
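+
+# A minimal usage sketch for `WeightedEuclideanDistance` (the weights below are hypothetical;
+# the dictionary mirrors the `class_weight` argument of `model.fit`):
+#
+#   wed_loss = WeightedEuclideanDistance({0: 1.0, 1: 1.0, 2: 2.5})
+#   model.compile(optimizer='adam', loss=wed_loss)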
+
+
+class WeightedBinaryCrossEntropy(object):
+ """
+    A weighted version of binary cross-entropy loss for Keras. This lets you apply a weight to unbalanced classes.
+ # reference: implementation based on https://github.com/keras-team/keras/issues/2115#issuecomment-315571824
+
+ Usage:
+        The constructor expects a dictionary with the same structure as the `class_weight` param of `model.fit`.
+ """
+
+ def __init__(self, weights):
+ nb_cl = len(weights)
+ self.weights = np.ones((nb_cl, nb_cl))
+ for class_idx, class_weight in weights.items():
+ self.weights[0][class_idx] = class_weight
+ self.weights[class_idx][0] = class_weight
+ self.__name__ = 'weighted_binary_crossentropy'
+
+ def __call__(self, y_true, y_pred):
+ return self.weighted_binary_crossentropy(y_true, y_pred)
+
+ def weighted_binary_crossentropy(self, y_true, y_pred):
+ nb_cl = len(self.weights)
+ final_mask = K.zeros_like(y_pred[..., 0])
+ y_pred_max = K.max(y_pred, axis=-1)
+ y_pred_max = K.expand_dims(y_pred_max, axis=-1)
+ y_pred_max_mat = K.equal(y_pred, y_pred_max)
+ for c_p, c_t in itertools.product(range(nb_cl), range(nb_cl)):
+ w = K.cast(self.weights[c_t, c_p], K.floatx())
+ y_p = K.cast(y_pred_max_mat[..., c_p], K.floatx())
+ y_t = K.cast(y_pred_max_mat[..., c_t], K.floatx())
+ final_mask += w * y_p * y_t
+ return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1) * final_mask
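+
+# A minimal usage sketch for `WeightedBinaryCrossEntropy` (hypothetical weights, same
+# structure as the `class_weight` dict of `model.fit`):
+#
+#   wbce_loss = WeightedBinaryCrossEntropy({0: 1.0, 1: 3.0})
+#   model.compile(optimizer='adam', loss=wbce_loss, metrics=['accuracy'])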
+
+
+
+def weighted_binary_crossentropy2(target, output):
+ """
+ Weighted binary crossentropy between an output tensor
+ and a target tensor. POS_WEIGHT is used as a multiplier
+ for the positive targets.
+
+ Combination of the following functions:
+ * keras.losses.binary_crossentropy
+ * keras.backend.tensorflow_backend.binary_crossentropy
+ * tf.nn.weighted_cross_entropy_with_logits
+
+ reference: https://stackoverflow.com/a/47313183/979377
+ """
+ # transform back to logits
+
+ POS_WEIGHT = 10 # multiplier for positive targets, needs to be tuned
+
+ _epsilon = tfb._to_tensor(tfb.epsilon(), output.dtype.base_dtype)
+ output = tf.clip_by_value(output, _epsilon, 1 - _epsilon)
+ output = tf.log(output / (1 - output))
+ # compute weighted loss
+ loss = tf.nn.weighted_cross_entropy_with_logits(targets=target,
+ logits=output,
+ pos_weight=POS_WEIGHT)
+ return tf.reduce_mean(loss, axis=-1)
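+
+# A minimal usage sketch for `weighted_binary_crossentropy2` (illustrative; POS_WEIGHT above
+# is hard-coded, so the function can be passed directly as a Keras loss):
+#
+#   model.compile(optimizer='adam', loss=weighted_binary_crossentropy2, metrics=['accuracy'])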
+
+
+
+# reference: https://github.com/yu4u/age-gender-estimation/blob/3c3a0a2681c045264c2c294e548ffb1b84f24b9e/age_estimation/model.py#L8-L12
+def vad_mean_absolute_error(y_true, y_pred):
+ true_vad = K.sum(y_true * K.arange(1, 10, dtype="float32"), axis=-1)
+ pred_vad = K.sum(y_pred * K.arange(1, 10, dtype="float32"), axis=-1)
+ mae = K.mean(K.abs(true_vad - pred_vad))
+ return mae
\ No newline at end of file
diff --git a/utils/inference_utils.py b/utils/inference_utils.py
new file mode 100644
index 0000000..0ccccee
--- /dev/null
+++ b/utils/inference_utils.py
@@ -0,0 +1,58 @@
+from __future__ import print_function
+import numpy as np
+
+def _obtain_emotional_traits_calibrated_predictions(emotional_trait,
+ raw_preds):
+
+
+
+ # print('[INFO] Violation before valence & dominance: ',raw_preds[0][0] )
+ # print('[INFO] No-violation before valence & dominance: ', raw_preds[0][1])
+
+ #Dominance
+ # neutral interval -- the same raw predictions will be returned
+ if 4.5 <= emotional_trait <= 5.5:
+ # print('neutral dominance')
+ violation = raw_preds[0][0]
+ no_violation = raw_preds[0][1]
+
+
+ # positive dominance
+ elif emotional_trait > 5.5:
+ # print('positive dominance')
+ diff_from_neutral = emotional_trait-5.5
+ adjustment = diff_from_neutral * 0.11
+ # adjustment = diff_from_neutral * 0.05
+ violation = raw_preds[0][0]-adjustment
+ no_violation = raw_preds[0][1]+adjustment
+
+ # negative dominance
+ elif emotional_trait < 4.5:
+ # print ('negative dominance')
+ diff_from_neutral = 4.5-emotional_trait
+ adjustment = diff_from_neutral * 0.11
+ # adjustment = diff_from_neutral * 0.05
+ violation = raw_preds[0][0]+adjustment
+ no_violation = raw_preds[0][1]-adjustment
+
+
+ #
+ # print('[INFO] Violation after dominance: ',violation )
+ # print('[INFO] No-violation after dominance: ', no_violation)
+
+
+ calibrated_preds = np.array([[violation, no_violation]])
+
+
+ return calibrated_preds
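+
+# Worked example of the dominance-based calibration above (illustrative numbers):
+# for a dominance score of 7.5 (positive dominance), diff_from_neutral = 7.5 - 5.5 = 2.0 and
+# adjustment = 2.0 * 0.11 = 0.22, so the raw violation probability is lowered by 0.22 and the
+# no-violation probability is raised by 0.22. A score inside the neutral interval [4.5, 5.5]
+# leaves the raw predictions unchanged.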
+
+
+
+
+
+
+
+
+
+
+
diff --git a/wrappers/__init__.py b/wrappers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wrappers/hra_transfer_cnn_manager.py b/wrappers/hra_transfer_cnn_manager.py
new file mode 100644
index 0000000..c0c22d3
--- /dev/null
+++ b/wrappers/hra_transfer_cnn_manager.py
@@ -0,0 +1,334 @@
+# -*- coding: utf-8 -*-
+""" HRA_Transfer_CNN_Manager is a wrapper class which 'encapsulates' the functionalities needed for preparing (class instantiation)
+ and training different CNNs (`train_model`) on the HRA dataset with 2 classes.
+"""
+
+from __future__ import print_function
+import os
+import sys
+import math
+import numpy as np
+import os.path
+import time
+import datetime
+import h5py
+from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
+from keras.preprocessing.image import ImageDataGenerator
+
+from applications.hra_vgg16 import HRA_VGG16
+from applications.hra_vgg19 import HRA_VGG19
+from applications.hra_resnet50 import HRA_ResNet50
+from applications.hra_vgg16_places365 import HRA_VGG16_Places365
+
+from utils.generic_utils import hms_string
+
+
+class HRA_Transfer_CNN_Manager():
+ """Loads the parameters needed for the training process on class instantiation
+ & starts the training process.
+
+ # Arguments
+        violation_class: one of `cl` (HRA dataset with 2 classes - [i]'child_labour' and [ii]'no violation')
+ or `dp` (HRA dataset with 2 classes - [i]'displaced_populations' and [ii]'no violation')
+ train_mode: String to declare the train mode of the model (how many layers will be frozen during training).
+ - `feature_extraction` taking the convolutional base of a previously-trained network,
+ running the new data through it, and training a new classifier on top of the output.
+ - `fine_tuning` unfreezing a few of the top layers of a frozen conv. base used for feature extraction,
+ and jointly training both the newly added part of the model and these top layers.
+ pre_trained_model: One of `vgg16`, `vgg19`, `resnet50` or `vgg16_places365`.
+ nb_of_conv_layers_to_fine_tune: integer to indicate the number of convolutional layers to fine-tune.
+ One of `None` (indicates feature extraction mode), `1`, `2` or `3`.
+ weights_to_file: File name or full path for saving the weights of the current training process.
+ first_phase_trained_weights: Weights of an already trained feature extraction model.
+ Only relevant when using `fine_tuning` as train_mode after `feature_extraction` weights have been saved.
+ nb_of_epochs: Integer, total number of iterations on the data.
+ modelCheckpoint_quantity: Quantity to monitor when saving the model after every epoch is enabled.
+ earlyStopping_quantity: Quantity to monitor when stopping training when a monitored quantity has stopped improving is enabled.
+ CSVLogger_filename: filename of the csv file, where the CSVLogger callback will stream epoch results to.
+
+ # Raises
+ ValueError: in case of invalid argument for `nb_of_conv_layers_to_fine_tune`
+ or invalid argument for `first_phase_trained_weights`.
+ """
+
+ def __init__(self,
+ violation_class,
+ train_mode,
+ pre_trained_model,
+ nb_of_conv_layers_to_fine_tune,
+ weights_to_file,
+ first_phase_trained_weights,
+ nb_of_epochs,
+ modelCheckpoint_quantity,
+ earlyStopping_quantity,
+ CSVLogger_filename,
+ ):
+
+
+ # extra check for the case when fine-tuning is selected without providing the correct first_phase_trained_weights.
+ if nb_of_conv_layers_to_fine_tune in {1, 2, 3} and first_phase_trained_weights is None:
+            raise ValueError('The `first_phase_trained_weights` argument can be set to None only when '
+                             '`nb_of_conv_layers_to_fine_tune` is None (feature extraction). '
+                             'When `nb_of_conv_layers_to_fine_tune` is 1, 2 or 3, '
+                             'the weights of an already trained feature extraction model must be saved prior to fine-tuning the model.')
+
+ # Base directory for saving the trained models
+ self.trained_models_dir = '/home/gkallia/git/Human-Rights-Violations-Conceptron/trained_models'
+ self.feature_extraction_dir = os.path.join(self.trained_models_dir, 'feature_extraction/')
+ self.fine_tuning_dir = os.path.join(self.trained_models_dir, 'fine_tuning/')
+ self.logs_dir = os.path.join(self.trained_models_dir, 'logs/')
+
+ if violation_class == 'cl':
+ self.train_dir = os.path.join('/home/gkallia/git/Human-Rights-Violations-Conceptron/datasets/Two-class-HRV/ChildLabour', 'train')
+ self.val_dir = os.path.join('/home/gkallia/git/Human-Rights-Violations-Conceptron/datasets/Two-class-HRV/ChildLabour', 'val')
+
+ elif violation_class == 'dp':
+ self.train_dir = os.path.join('/home/gkallia/git/Human-Rights-Violations-Conceptron/datasets/Two-class-HRV/DisplacedPopulations', 'train')
+ self.val_dir = os.path.join('/home/gkallia/git/Human-Rights-Violations-Conceptron/datasets/Two-class-HRV/DisplacedPopulations', 'val')
+
+
+
+ # Augmentation configuration with only rescaling.
+ # Rescale is a value by which we will multiply the data before any other processing.
+        # Our original images consist of RGB coefficients in the 0-255 range, but such values would
+ # be too high for our models to process (given a typical learning rate),
+ # so we target values between 0 and 1 instead by scaling with a 1/255. factor.
+ datagen = ImageDataGenerator(rescale=1. / 255)
+
+ img_width, img_height = 224, 224
+
+ self.train_batch_size = 21
+ self.val_batch_size = 10
+
+
+ print('[INFO] Setting up image data generators...')
+
+ self.train_generator = datagen.flow_from_directory(self.train_dir, target_size=(img_width, img_height),
+ class_mode='categorical',
+ shuffle=False,
+ batch_size=self.train_batch_size)
+
+ self.val_generator = datagen.flow_from_directory(self.val_dir, target_size=(img_width, img_height),
+ class_mode='categorical',
+ shuffle=False,
+ batch_size=self.val_batch_size)
+
+
+ num_classes = len(self.train_generator.class_indices)
+
+ print('[INFO] Number of classes: ', num_classes)
+
+ self.nb_train_samples = len(self.train_generator.filenames)
+ # train_labels = self.train_generator.classes
+ # self.train_labels = to_categorical(train_labels, num_classes=num_classes)
+ # self.predict_size_train = int(math.ceil(self.nb_train_samples / self.train_batch_size))
+
+ print ('[INFO] Number of train samples: ', self.nb_train_samples)
+
+ # print('[INFO] Predict size train: ', self.predict_size_train)
+
+        # save the class indices to use later in predictions
+ # np.save('class_indices.npy', self.train_generator.class_indices)
+
+
+ self.nb_val_samples = len(self.val_generator.filenames)
+ # val_labels = self.val_generator.classes
+ # self.val_labels = to_categorical(val_labels, num_classes=num_classes)
+ # self.predict_size_test = int(math.ceil(self.nb_val_samples / self.val_batch_size))
+
+ print ('[INFO] Number of test samples: ', self.nb_val_samples)
+ # print('[INFO] Predict size test: ', self.predict_size_test)
+
+ self.steps_per_epoch = self.nb_train_samples // self.train_batch_size
+ self.val_steps = self.nb_val_samples // self.val_batch_size
+
+
+
+ # -------------------------------------------------------------------------------- #
+ # Usage of callbacks
+ # -------------------------------------------------------------------------------- #
+
+ self.train_mode = train_mode
+ self.pre_trained_model = pre_trained_model
+ self.nb_of_conv_layers_to_fine_tune = nb_of_conv_layers_to_fine_tune
+ self.weights_to_file = weights_to_file
+ self.first_phase_trained_weights = first_phase_trained_weights
+ self.nb_of_epochs = nb_of_epochs
+ # self.modelCheckpoint_quantity = modelCheckpoint_quantity
+ # self.earlyStopping_quantity = earlyStopping_quantity
+ # self.CSVLogger_filename = CSVLogger_filename
+
+ # self.steps_per_epoch = self.nb_train_samples // self.train_batch_size
+ #
+ #
+ # self.val_steps = self.nb_val_samples // self.val_batch_size
+
+
+
+ # CSVLogger
+ model_log = 'trained_models/logs/' + CSVLogger_filename
+ csv_logger = CSVLogger(model_log, append=True, separator=',')
+
+
+ # ModelCheckpoint
+ checkpointer = ModelCheckpoint(filepath=weights_to_file,
+ monitor=modelCheckpoint_quantity,
+ verbose=1,
+ save_best_only=True,
+ mode='auto',
+ period=1,
+ save_weights_only=True)
+
+ early_stop = EarlyStopping(monitor=earlyStopping_quantity, patience=5, mode='auto')
+
+ self.callbacks_list = [checkpointer, early_stop, csv_logger]
+
+
+ def train(self):
+ """Loads the selected model & starts the training process.
+ """
+
+ if self.pre_trained_model == 'vgg16':
+
+ print('[INFO] Instantiating HRA-2CLASS-VGG16...')
+
+ if self.train_mode == 'feature_extraction':
+
+ model = HRA_VGG16(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=None,
+ verbose=1)
+ else:
+
+ if os.path.isfile(self.first_phase_trained_weights) is False:
+ raise IOError("No such weights file: `" + self.first_phase_trained_weights + "`. ")
+
+ model = HRA_VGG16(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=self.first_phase_trained_weights,
+ verbose=1)
+
+ print('[INFO] HRA-2CLASS-VGG16 model loaded')
+
+
+
+ elif self.pre_trained_model == 'vgg19':
+
+ print('[INFO] Instantiating HRA-2CLASS-VGG19...')
+
+ if self.train_mode == 'feature_extraction':
+
+ model = HRA_VGG19(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=None,
+ verbose=1)
+ else:
+
+ if os.path.isfile(self.first_phase_trained_weights) is False:
+ raise IOError("No such weights file: `" + self.first_phase_trained_weights + "`. ")
+
+ model = HRA_VGG19(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=self.first_phase_trained_weights,
+ verbose=1)
+
+ print('[INFO] HRA-2CLASS-VGG19 model loaded')
+
+
+
+
+ elif self.pre_trained_model == 'resnet50':
+
+ print('[INFO] Instantiating HRA-2CLASS-ResNet50...')
+
+ if self.train_mode == 'feature_extraction':
+
+ model = HRA_ResNet50(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=None,
+ verbose=1)
+ else:
+
+ if os.path.isfile(self.first_phase_trained_weights) is False:
+ raise IOError("No such weights file: `" + self.first_phase_trained_weights + "`. ")
+
+ model = HRA_ResNet50(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=self.first_phase_trained_weights,
+ verbose=1)
+
+ print('[INFO] HRA-2CLASS-ResNet50 model loaded')
+
+
+
+
+ elif self.pre_trained_model == 'vgg16_places365':
+
+ print('[INFO] Instantiating HRA-2CLASS-VGG16_Places365...')
+
+ if self.train_mode == 'feature_extraction':
+
+ model = HRA_VGG16_Places365(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=None,
+ verbose=1)
+ else:
+
+ if os.path.isfile(self.first_phase_trained_weights) is False:
+ raise IOError("No such weights file: `" + self.first_phase_trained_weights + "`. ")
+
+ model = HRA_VGG16_Places365(include_top=True, weights=None,
+ input_tensor=None, input_shape=None,
+ nb_of_conv_layers_to_fine_tune=self.nb_of_conv_layers_to_fine_tune,
+ first_phase_trained_weights=self.first_phase_trained_weights,
+ verbose=1)
+
+ print('[INFO] HRA-2CLASS-VGG16-Places365 model loaded')
+
+
+
+ # Finally start fitting the dataset
+
+ if self.train_mode == 'feature_extraction':
+ print('[INFO] Start training the randomly initialised classifier on top of the pre-trained conv. base...')
+
+ start_time = time.time()
+
+ history = model.fit_generator(self.train_generator,
+ epochs=self.nb_of_epochs,
+ steps_per_epoch=self.steps_per_epoch,
+ validation_data=self.val_generator,
+ validation_steps=self.val_steps,
+ callbacks=self.callbacks_list)
+
+ end_time = time.time()
+ print("[INFO] It took {} to train the randomly initialised classifier on top of the pre-trained conv. base".format(
+ hms_string(end_time - start_time)))
+
+ print('[INFO] Saved trained model as: %s ' % self.weights_to_file)
+
+ else:
+ print('[INFO] Start fine-tuning the model...')
+
+ start_time = time.time()
+
+ history = model.fit_generator(self.train_generator,
+ epochs=self.nb_of_epochs,
+ steps_per_epoch=self.steps_per_epoch,
+ validation_data=self.val_generator,
+ validation_steps=self.val_steps,
+ callbacks=self.callbacks_list)
+
+ end_time = time.time()
+ print("[INFO] It took {} to fine-tune the top layers of the frozen conv. base".format(
+ hms_string(end_time - start_time)))
+
+ print('[INFO] Saved trained model as: %s ' % self.weights_to_file)
+