
Commit b2e6625

Merge pull request openvinotoolkit#1417 from Wovchena/interactive_face_detection-update

interactive_face_detection: update

Authored by Roman Donchenko · 2 parents 128677a + c9f0aa3

File tree: 6 files changed, +71 -127 lines


demos/interactive_face_detection_demo/README.md

Lines changed: 2 additions & 4 deletions
````diff
@@ -38,7 +38,6 @@ The new Async API operates with a new notion of the Infer Request that encapsula
 Running the application with the `-h` option yields the following usage message:
 
 ```
-./interactive_face_detection_demo -h
 InferenceEngine:
     API version ............ <version>
     Build .................. <number>
@@ -47,7 +46,8 @@ interactive_face_detection_demo [OPTION]
 Options:
 
     -h                         Print a usage message
-    -i "<path>"                Required. Path to a video file (specify "cam" to work with camera).
+    -i                         Required. An input to process. The input must be a single image, a folder of images or anything that cv::VideoCapture can process.
+    -loop                      Optional. Enable reading the input in a loop.
     -o "<path>"                Optional. Path to an output video file.
     -m "<path>"                Required. Path to an .xml file with a trained Face Detection model.
     -m_ag "<path>"             Optional. Path to an .xml file with a trained Age/Gender Recognition model.
@@ -71,7 +71,6 @@ Options:
     -dyn_em                    Optional. Enable dynamic batch size for Emotions Recognition network
     -dyn_lm                    Optional. Enable dynamic batch size for Facial Landmarks Estimation network
     -async                     Optional. Enable asynchronous mode
-    -no_wait                   Optional. Do not wait for key press in the end.
     -no_show                   Optional. Do not show processed video.
     -pc                        Optional. Enable per-layer performance report
     -r                         Optional. Output inference results as raw values
@@ -80,7 +79,6 @@ Options:
     -dx_coef                   Optional. Coefficient to shift the bounding box around the detected face along the Ox axis
     -dy_coef                   Optional. Coefficient to shift the bounding box around the detected face along the Oy axis
    -fps                       Optional. Maximum FPS for playing video
-    -loop_video                Optional. Enable playing video on a loop
    -no_smooth                 Optional. Do not smooth person attributes
    -no_show_emotion_bar       Optional. Do not show emotion bar
    -u                         Optional. List of monitors to show initially.
````

demos/interactive_face_detection_demo/interactive_face_detection.hpp

Lines changed: 9 additions & 12 deletions
```diff
@@ -10,8 +10,11 @@
 #include <gflags/gflags.h>
 #include <iostream>
 
+#include <samples/default_flags.hpp>
+
+DEFINE_INPUT_FLAGS
+
 static const char help_message[] = "Print a usage message";
-static const char input_video_message[] = "Required. Path to a video file (specify \"cam\" to work with camera).";
 static const char output_video_message[] = "Optional. Path to an output video file.";
 static const char face_detection_model_message[] = "Required. Path to an .xml file with a trained Face Detection model.";
 static const char age_gender_model_message[] = "Optional. Path to an .xml file with a trained Age/Gender Recognition model.";
```
```diff
@@ -34,7 +37,7 @@ static const char target_device_message_em[] = "Optional. Target device for Emot
                                                "The demo will look for a suitable plugin for a specified device.";
 static const char target_device_message_lm[] = "Optional. Target device for Facial Landmarks Estimation network "
                                                "(the list of available devices is shown below). Default value is CPU. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
-                                               "The demo will look for a suitable plugin for device specified.";
+                                               "The demo will look for a suitable plugin for a specified device.";
 static const char num_batch_ag_message[] = "Optional. Number of maximum simultaneously processed faces for Age/Gender Recognition network "
                                            "(by default, it is 16)";
 static const char num_batch_hp_message[] = "Optional. Number of maximum simultaneously processed faces for Head Pose Estimation network "
```
```diff
@@ -55,19 +58,16 @@ static const char custom_cpu_library_message[] = "Required for CPU custom layers
 static const char thresh_output_message[] = "Optional. Probability threshold for detections";
 static const char bb_enlarge_coef_output_message[] = "Optional. Coefficient to enlarge/reduce the size of the bounding box around the detected face";
 static const char raw_output_message[] = "Optional. Output inference results as raw values";
-static const char no_wait_for_keypress_message[] = "Optional. Do not wait for key press in the end.";
 static const char no_show_processed_video[] = "Optional. Do not show processed video.";
 static const char async_message[] = "Optional. Enable asynchronous mode";
 static const char dx_coef_output_message[] = "Optional. Coefficient to shift the bounding box around the detected face along the Ox axis";
 static const char dy_coef_output_message[] = "Optional. Coefficient to shift the bounding box around the detected face along the Oy axis";
 static const char fps_output_message[] = "Optional. Maximum FPS for playing video";
-static const char loop_video_output_message[] = "Optional. Enable playing video on a loop";
 static const char no_smooth_output_message[] = "Optional. Do not smooth person attributes";
 static const char no_show_emotion_bar_message[] = "Optional. Do not show emotion bar";
 static const char utilization_monitors_message[] = "Optional. List of monitors to show initially.";
 
 DEFINE_bool(h, false, help_message);
-DEFINE_string(i, "", input_video_message);
 DEFINE_string(o, "", output_video_message);
 DEFINE_string(m, "", face_detection_model_message);
 DEFINE_string(m_ag, "", age_gender_model_message);
```
```diff
@@ -93,13 +93,11 @@ DEFINE_string(l, "", custom_cpu_library_message);
 DEFINE_bool(r, false, raw_output_message);
 DEFINE_double(t, 0.5, thresh_output_message);
 DEFINE_double(bb_enlarge_coef, 1.2, bb_enlarge_coef_output_message);
-DEFINE_bool(no_wait, false, no_wait_for_keypress_message);
 DEFINE_bool(no_show, false, no_show_processed_video);
 DEFINE_bool(async, false, async_message);
 DEFINE_double(dx_coef, 1, dx_coef_output_message);
 DEFINE_double(dy_coef, 1, dy_coef_output_message);
-DEFINE_double(fps, -1, fps_output_message);
-DEFINE_bool(loop_video, false, loop_video_output_message);
+DEFINE_double(fps, -std::numeric_limits<double>::infinity(), fps_output_message);
 DEFINE_bool(no_smooth, false, no_smooth_output_message);
 DEFINE_bool(no_show_emotion_bar, false, no_show_emotion_bar_message);
 DEFINE_string(u, "", utilization_monitors_message);
```
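Note: the new `-fps` default of negative infinity works because `main.cpp` now computes `msrate = 1000.0 / FLAGS_fps` unconditionally and clamps the resulting frame delay with `std::max(1, ...)`. Dividing by negative infinity yields `-0.0`, so the delay collapses to 1 ms unless a positive `-fps` is given. A minimal sketch of the arithmetic (the 7.5 ms timer reading is an assumed value):

```cpp
#include <algorithm>
#include <iostream>
#include <limits>

int main() {
    double fps = -std::numeric_limits<double>::infinity();  // the new flag default
    double msrate = 1000.0 / fps;                            // IEEE 754: equals -0.0
    double lastCallDuration = 7.5;                           // assumed timer reading, in ms
    // Same clamping expression as in main.cpp's display path:
    int delay = std::max(1, static_cast<int>(msrate - lastCallDuration));
    std::cout << delay << '\n';                              // prints 1: run unthrottled
}
```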
```diff
@@ -111,11 +109,12 @@ DEFINE_string(u, "", utilization_monitors_message);
 
 static void showUsage() {
     std::cout << std::endl;
-    std::cout << "interactive_face_detection [OPTION]" << std::endl;
+    std::cout << "interactive_face_detection_demo [OPTION]" << std::endl;
     std::cout << "Options:" << std::endl;
     std::cout << std::endl;
     std::cout << "    -h                         " << help_message << std::endl;
-    std::cout << "    -i \"<path>\"                " << input_video_message << std::endl;
+    std::cout << "    -i                         " << input_message << std::endl;
+    std::cout << "    -loop                      " << loop_message << std::endl;
     std::cout << "    -o \"<path>\"                " << output_video_message << std::endl;
     std::cout << "    -m \"<path>\"                " << face_detection_model_message<< std::endl;
     std::cout << "    -m_ag \"<path>\"             " << age_gender_model_message << std::endl;
```
```diff
@@ -139,7 +138,6 @@ static void showUsage() {
     std::cout << "    -dyn_em                    " << dyn_batch_em_message << std::endl;
     std::cout << "    -dyn_lm                    " << dyn_batch_lm_message << std::endl;
     std::cout << "    -async                     " << async_message << std::endl;
-    std::cout << "    -no_wait                   " << no_wait_for_keypress_message << std::endl;
     std::cout << "    -no_show                   " << no_show_processed_video << std::endl;
     std::cout << "    -pc                        " << performance_counter_message << std::endl;
     std::cout << "    -r                         " << raw_output_message << std::endl;
```
```diff
@@ -148,7 +146,6 @@ static void showUsage() {
     std::cout << "    -dx_coef                   " << dx_coef_output_message << std::endl;
     std::cout << "    -dy_coef                   " << dy_coef_output_message << std::endl;
     std::cout << "    -fps                       " << fps_output_message << std::endl;
-    std::cout << "    -loop_video                " << loop_video_output_message << std::endl;
     std::cout << "    -no_smooth                 " << no_smooth_output_message << std::endl;
     std::cout << "    -no_show_emotion_bar       " << no_show_emotion_bar_message << std::endl;
     std::cout << "    -u                         " << utilization_monitors_message << std::endl;
```
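Note: `DEFINE_INPUT_FLAGS` comes from the newly included `<samples/default_flags.hpp>`, whose body is not part of this diff. Judging from the identifiers the file now relies on (`FLAGS_i`, `FLAGS_loop`, `input_message`, `loop_message`), a plausible expansion looks roughly like the sketch below; treat the exact names and wording as assumptions:

```cpp
// Hypothetical expansion of DEFINE_INPUT_FLAGS, inferred from usage only.
// Assumes <gflags/gflags.h> is already included, as it is in this header.
#define DEFINE_INPUT_FLAGS                                                    \
    static const char input_message[] =                                       \
        "Required. An input to process. The input must be a single image, "  \
        "a folder of images or anything that cv::VideoCapture can process."; \
    static const char loop_message[] =                                        \
        "Optional. Enable reading the input in a loop.";                      \
    DEFINE_string(i, "", input_message);                                      \
    DEFINE_bool(loop, false, loop_message);
```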

demos/interactive_face_detection_demo/main.cpp

Lines changed: 46 additions & 108 deletions
```diff
@@ -25,6 +25,7 @@
 #include <inference_engine.hpp>
 
 #include <monitors/presenter.h>
+#include <samples/images_capture.h>
 #include <samples/ocv_common.hpp>
 #include <samples/slog.hpp>
 
```
```diff
@@ -61,10 +62,6 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
     if (FLAGS_n_hp < 1) {
         throw std::logic_error("Parameter -n_hp cannot be 0");
     }
-
-    // no need to wait for a key press from a user if an output image/video file is not shown.
-    FLAGS_no_wait |= FLAGS_no_show;
-
     return true;
 }
```
```diff
@@ -77,27 +74,6 @@ int main(int argc, char *argv[]) {
             return 0;
         }
 
-        slog::info << "Reading input" << slog::endl;
-        cv::VideoCapture cap;
-        if (!(FLAGS_i == "cam" ? cap.open(0) : cap.open(FLAGS_i))) {
-            throw std::logic_error("Cannot open input file or camera: " + FLAGS_i);
-        }
-
-        Timer timer;
-        // read input (video) frame
-        cv::Mat frame;
-        if (!cap.read(frame)) {
-            throw std::logic_error("Failed to get frame from cv::VideoCapture");
-        }
-
-        const size_t width = static_cast<size_t>(frame.cols);
-        const size_t height = static_cast<size_t>(frame.rows);
-
-        cv::VideoWriter videoWriter;
-        if (!FLAGS_o.empty()) {
-            videoWriter.open(FLAGS_o, cv::VideoWriter::fourcc('I', 'Y', 'U', 'V'), 25, cv::Size(width, height));
-        }
-        // ---------------------------------------------------------------------------------------------------
         // --------------------------- 1. Loading Inference Engine -----------------------------
 
         Core ie;
```
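Note: the deleted `cv::VideoCapture` setup is replaced by the `ImagesCapture` abstraction from the newly included `<samples/images_capture.h>`. Its declarations are not shown in this diff; the interface below is reconstructed from how the rewritten `main.cpp` calls it (`openImagesCapture(FLAGS_i, FLAGS_loop)`, `cap->read()`, `cap->fps()`), so treat it as an illustrative assumption rather than the verbatim header:

```cpp
#include <memory>
#include <string>
#include <opencv2/core.hpp>

// Illustrative reconstruction of the capture interface used by main.cpp.
class ImagesCapture {
public:
    virtual cv::Mat read() = 0;      // an empty cv::Mat signals end of input
    virtual double fps() const = 0;  // native FPS, used as the VideoWriter fallback
    virtual ~ImagesCapture() = default;
};

// Expected to pick a backend for a single image, a folder of images,
// or anything cv::VideoCapture can open; wraps around when loop is true.
std::unique_ptr<ImagesCapture> openImagesCapture(const std::string& input, bool loop);
```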
```diff
@@ -163,73 +139,75 @@ int main(int argc, char *argv[]) {
         Load(facialLandmarksDetector).into(ie, FLAGS_d_lm, FLAGS_dyn_lm);
         // ----------------------------------------------------------------------------------------------------
 
-        // --------------------------- 3. Doing inference -----------------------------------------------------
-        // Starting inference & calculating performance
-        slog::info << "Start inference " << slog::endl;
-
         bool isFaceAnalyticsEnabled = ageGenderDetector.enabled() || headPoseDetector.enabled() ||
                                       emotionsDetector.enabled() || facialLandmarksDetector.enabled();
 
+        Timer timer;
         std::ostringstream out;
         size_t framesCounter = 0;
-        int delay = 1;
-        double msrate = -1;
-        cv::Mat prev_frame, next_frame;
+        double msrate = 1000.0 / FLAGS_fps;
         std::list<Face::Ptr> faces;
         size_t id = 0;
 
-        if (FLAGS_fps > 0) {
-            msrate = 1000.f / FLAGS_fps;
+        std::unique_ptr<ImagesCapture> cap = openImagesCapture(FLAGS_i, FLAGS_loop);
+        cv::Mat frame = cap->read();
+        if (!frame.data) {
+            throw std::runtime_error("Can't read an image from the input");
         }
 
-        Visualizer::Ptr visualizer;
-        if (!FLAGS_no_show || !FLAGS_o.empty()) {
-            visualizer = std::make_shared<Visualizer>(cv::Size(width, height));
-            if (!FLAGS_no_show_emotion_bar && emotionsDetector.enabled()) {
-                visualizer->enableEmotionBar(emotionsDetector.emotionsVec);
+        const cv::Point THROUGHPUT_METRIC_POSITION{10, 45};
+        Presenter presenter(FLAGS_u, THROUGHPUT_METRIC_POSITION.y + 15, {frame.cols / 4, 60});
+
+        Visualizer visualizer{frame.size()};
+        if (!FLAGS_no_show_emotion_bar && emotionsDetector.enabled()) {
+            visualizer.enableEmotionBar(emotionsDetector.emotionsVec);
+        }
+
+        cv::VideoWriter videoWriter;
+        if (!FLAGS_o.empty()) {
+            videoWriter.open(FLAGS_o, cv::VideoWriter::fourcc('I', 'Y', 'U', 'V'),
+                             !FLAGS_no_show && FLAGS_fps > 0.0 ? FLAGS_fps : cap->fps(), frame.size());
+            if (!videoWriter.isOpened()) {
+                throw std::runtime_error("Can't open video writer");
             }
         }
 
         // Detecting all faces on the first frame and reading the next one
         faceDetector.enqueue(frame);
         faceDetector.submitRequest();
 
-        prev_frame = frame.clone();
-
-        // Reading the next frame
-        bool frameReadStatus = cap.read(frame);
+        cv::Mat next_frame = cap->read();
 
         std::cout << "To close the application, press 'CTRL+C' here";
         if (!FLAGS_no_show) {
             std::cout << " or switch to the output window and press Q or Esc";
         }
         std::cout << std::endl;
 
-        const cv::Point THROUGHPUT_METRIC_POSITION{10, 45};
-
-        cv::Size graphSize{static_cast<int>(cap.get(cv::CAP_PROP_FRAME_WIDTH) / 4), 60};
-        Presenter presenter(FLAGS_u, THROUGHPUT_METRIC_POSITION.y + 15, graphSize);
-
-        while (true) {
+        while (frame.data) {
             timer.start("total");
+            cv::Mat prev_frame = std::move(frame);
+            frame = std::move(next_frame);
             framesCounter++;
-            bool isLastFrame = !frameReadStatus;
 
             // Retrieving face detection results for the previous frame
             faceDetector.wait();
             faceDetector.fetchResults();
             auto prev_detection_results = faceDetector.results;
 
             // No valid frame to infer if previous frame is the last
-            if (!isLastFrame) {
+            if (frame.data) {
+                if (frame.size() != prev_frame.size()) {
+                    throw std::runtime_error("Images of different size are not supported");
+                }
                 faceDetector.enqueue(frame);
                 faceDetector.submitRequest();
             }
 
             // Filling inputs of face analytics networks
             for (auto &&face : prev_detection_results) {
                 if (isFaceAnalyticsEnabled) {
-                    auto clippedRect = face.location & cv::Rect(0, 0, width, height);
+                    cv::Rect clippedRect = face.location & cv::Rect({0, 0}, prev_frame.size());
                     cv::Mat face = prev_frame(clippedRect);
                     ageGenderDetector.enqueue(face);
                     headPoseDetector.enqueue(face);
```
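Note: the rewritten loop keeps the old code's two-frame pipeline but makes the hand-off explicit with `std::move`: face detection for the newly grabbed `frame` is submitted right after the results for `prev_frame` are fetched, so drawing and post-processing of `prev_frame` overlap with inference on `frame`. A self-contained sketch of the same read-ahead pattern, with `std::async` standing in for the demo's detector requests (all names here are illustrative):

```cpp
#include <future>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for submitting one frame to faceDetector and waiting on it.
std::string infer(int frameId) {
    return "results for frame " + std::to_string(frameId);
}

int main() {
    std::vector<int> frames{0, 1, 2, 3};  // stand-in for cap->read()
    // Submit the first frame before entering the loop, as the demo does.
    auto pending = std::async(std::launch::async, infer, frames[0]);
    for (size_t i = 0; i < frames.size(); ++i) {
        std::string results = pending.get();  // wait for the previous submission
        if (i + 1 < frames.size()) {
            // Submit the next frame immediately, then post-process the current
            // results while that inference runs in the background.
            pending = std::async(std::launch::async, infer, frames[i + 1]);
        }
        std::cout << results << '\n';
    }
}
```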
```diff
@@ -246,16 +224,8 @@ int main(int argc, char *argv[]) {
                 facialLandmarksDetector.submitRequest();
             }
 
-            // Reading the next frame if the current one is not the last
-            if (!isLastFrame) {
-                frameReadStatus = cap.read(next_frame);
-                if (FLAGS_loop_video && !frameReadStatus) {
-                    if (!(FLAGS_i == "cam" ? cap.open(0) : cap.open(FLAGS_i))) {
-                        throw std::logic_error("Cannot open input file or camera: " + FLAGS_i);
-                    }
-                    frameReadStatus = cap.read(next_frame);
-                }
-            }
+            // Read the next frame while waiting for inference results
+            next_frame = cap->read();
 
             if (isFaceAnalyticsEnabled) {
                 ageGenderDetector.wait();
```
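Note: the deleted reopen-on-EOF block is exactly the part of the old `-loop_video` behavior that moved behind the capture abstraction; with `-loop`, wrapping around the input is now the capture's job rather than the pipeline's. A minimal sketch of how a capture can absorb that logic (a hypothetical class assuming plain `cv::VideoCapture` underneath; the real ImagesCapture implementation may differ):

```cpp
#include <stdexcept>
#include <string>
#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>

class LoopingVideoCapture {
    cv::VideoCapture cap;
    std::string input;
    bool loop;

public:
    LoopingVideoCapture(const std::string& input, bool loop) : input(input), loop(loop) {
        if (!cap.open(input)) {
            throw std::runtime_error("Can't open input: " + input);
        }
    }

    cv::Mat read() {
        cv::Mat frame;
        if (!cap.read(frame) && loop && cap.open(input)) {
            cap.read(frame);  // reopen and restart from the first frame
        }
        return frame;  // an empty Mat signals end of stream
    }
};
```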
```diff
@@ -276,7 +246,7 @@ int main(int argc, char *argv[]) {
             // For every detected face
             for (size_t i = 0; i < prev_detection_results.size(); i++) {
                 auto& result = prev_detection_results[i];
-                cv::Rect rect = result.location & cv::Rect(0, 0, width, height);
+                cv::Rect rect = result.location & cv::Rect({0, 0}, prev_frame.size());
 
                 Face::Ptr face;
                 if (!FLAGS_no_smooth) {
```
```diff
@@ -327,44 +297,23 @@ int main(int argc, char *argv[]) {
 
             presenter.drawGraphs(prev_frame);
 
-            // Visualizing results
-            if (!FLAGS_no_show || !FLAGS_o.empty()) {
-                out.str("");
-                out << "Total image throughput: " << std::fixed << std::setprecision(2)
-                    << 1000.f / (timer["total"].getSmoothedDuration()) << " fps";
-                cv::putText(prev_frame, out.str(), THROUGHPUT_METRIC_POSITION, cv::FONT_HERSHEY_TRIPLEX, 1,
-                            cv::Scalar(255, 0, 0), 2);
-
-                // drawing faces
-                visualizer->draw(prev_frame, faces);
-
-                if (!FLAGS_no_show) {
-                    cv::imshow("Detection results", prev_frame);
-                }
-            }
-
-            if (!FLAGS_o.empty()) {
-                videoWriter.write(prev_frame);
-            }
-
-            prev_frame = frame;
-            frame = next_frame;
-            next_frame = cv::Mat();
+            // drawing faces
+            visualizer.draw(prev_frame, faces);
 
             timer.finish("total");
+            out.str("");
+            out << "Total image throughput: " << std::fixed << std::setprecision(1)
+                << 1000.0 / (timer["total"].getSmoothedDuration()) << " fps";
+            cv::putText(prev_frame, out.str(), THROUGHPUT_METRIC_POSITION, cv::FONT_HERSHEY_TRIPLEX, 1,
+                        cv::Scalar(255, 0, 0), 2);
 
-            if (FLAGS_fps > 0) {
-                delay = std::max(1, static_cast<int>(msrate - timer["total"].getLastCallDuration()));
+            if (videoWriter.isOpened()) {
+                videoWriter.write(prev_frame);
             }
 
-            // End of file (or a single frame file like an image). The last frame is displayed to let you check what is shown
-            if (isLastFrame) {
-                if (!FLAGS_no_wait) {
-                    std::cout << "No more frames to process!" << std::endl;
-                    cv::waitKey(0);
-                }
-                break;
-            } else if (!FLAGS_no_show) {
+            int delay = std::max(1, static_cast<int>(msrate - timer["total"].getLastCallDuration()));
+            if (!FLAGS_no_show) {
+                cv::imshow("Detection results", prev_frame);
                 int key = cv::waitKey(delay);
                 if (27 == key || 'Q' == key || 'q' == key) {
                     break;
```
```diff
@@ -374,7 +323,7 @@ int main(int argc, char *argv[]) {
         }
 
         slog::info << "Number of processed frames: " << framesCounter << slog::endl;
-        slog::info << "Total image throughput: " << framesCounter * (1000.f / timer["total"].getTotalDuration()) << " fps" << slog::endl;
+        slog::info << "Total image throughput: " << framesCounter * (1000.0 / timer["total"].getTotalDuration()) << " fps" << slog::endl;
 
         // Showing performance results
         if (FLAGS_pc) {
```
```diff
@@ -386,17 +335,6 @@ int main(int argc, char *argv[]) {
         }
 
         std::cout << presenter.reportMeans() << '\n';
-        // ---------------------------------------------------------------------------------------------------
-
-        if (!FLAGS_o.empty()) {
-            videoWriter.release();
-        }
-
-        // release input video stream
-        cap.release();
-
-        // close windows
-        cv::destroyAllWindows();
     }
     catch (const std::exception& error) {
         slog::err << error.what() << slog::endl;
```
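Note: dropping the explicit `release()` calls is safe because the OpenCV handles are RAII types: `cv::VideoCapture` and `cv::VideoWriter` free their resources in their destructors when they leave the `try` block, and any HighGUI windows are torn down at process exit. A small illustration (file name and codec are chosen for the example):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>

void writeOneFrame(const cv::Mat& frame) {
    cv::VideoWriter writer("out.avi", cv::VideoWriter::fourcc('I', 'Y', 'U', 'V'),
                           25.0, frame.size());
    writer.write(frame);
}  // writer.release() runs implicitly in the destructor here
```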

demos/interactive_face_detection_demo/visualizer.hpp

Lines changed: 0 additions & 2 deletions
```diff
@@ -80,8 +80,6 @@ class HeadPoseVisualizer {
 // Drawing detected faces on the frame
 class Visualizer {
 public:
-    using Ptr = std::shared_ptr<Visualizer>;
-
     enum AnchorType {
         TL = 0,
         TR,
```
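Note: removing the `Ptr` alias matches the switch in `main.cpp` from `std::make_shared<Visualizer>(...)` to a plain `Visualizer visualizer{frame.size()};`. A stand-in comparison of the two ownership styles (the `Widget` class is hypothetical, since `Visualizer` itself needs the demo's headers):

```cpp
#include <memory>
#include <opencv2/core.hpp>

struct Widget {
    explicit Widget(cv::Size size) : size(size) {}
    cv::Size size;
};

int main() {
    // Before: shared ownership and a heap allocation for a single-owner object.
    std::shared_ptr<Widget> p = std::make_shared<Widget>(cv::Size{640, 480});
    // After: a plain value with automatic storage; no reference counting.
    Widget w{cv::Size{640, 480}};
    return (w.size == p->size) ? 0 : 1;
}
```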
