Skip to content

Commit 3f545dc

Browse files
author
Roman Donchenko
authored
Merge pull request openvinotoolkit#1415 from Wovchena/gaze_estimation-add-images_capture
gaze_estimation: add images_capture
2 parents b2e6625 + 5018c49 commit 3f545dc

File tree

9 files changed: +116 additions, -103 deletions

demos/common/include/samples/args_helper.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
#include <string>
1515
#include <vector>
1616

17+
#include <opencv2/core/types.hpp>
18+
1719
/**
1820
* @brief This function checks input args and existence of specified files in a given folder
1921
* @param arg path to a file to be checked for existence
@@ -34,3 +36,5 @@ std::vector<std::string> parseDevices(const std::string& device_string);
3436

3537
std::map<std::string, uint32_t> parseValuePerDevice(const std::set<std::string>& devices,
3638
const std::string& values_string);
39+
40+
cv::Size stringToSize(const std::string& str);

demos/common/include/samples/images_capture.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,5 @@ class ImagesCapture {
2929
// }
3030
std::unique_ptr<ImagesCapture> openImagesCapture(const std::string &input,
3131
bool loop, size_t initialImageId=0, // Non camera options
32-
size_t readLengthLimit=std::numeric_limits<size_t>::max() // General option
33-
);
32+
size_t readLengthLimit=std::numeric_limits<size_t>::max(), // General option
33+
cv::Size cameraResolution={1280, 720});

demos/common/monitors/src/presenter.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,11 @@ void Presenter::drawGraphs(cv::Mat& frame) {
141141

142142
if (cpuMonitor.getHistorySize() > 1 && possibleHistorySize > 1 && --numberOfEnabledMonitors >= 0) {
143143
std::deque<std::vector<double>> lastHistory = cpuMonitor.getLastHistory();
144-
cv::Mat graph = frame(cv::Rect{cv::Point{graphPos, yPos}, graphSize} & cv::Rect(0, 0, frame.cols, frame.rows));
144+
cv::Rect intersection = cv::Rect{cv::Point(graphPos, yPos), graphSize} & cv::Rect{0, 0, frame.cols, frame.rows};
145+
if (!intersection.area()) {
146+
return;
147+
}
148+
cv::Mat graph = frame(intersection);
145149
graph = graph / 2 + cv::Scalar{127, 127, 127};
146150

147151
int lineXPos = graph.cols - 1;
@@ -182,7 +186,11 @@ void Presenter::drawGraphs(cv::Mat& frame) {
182186

183187
if (distributionCpuEnabled && --numberOfEnabledMonitors >= 0) {
184188
std::deque<std::vector<double>> lastHistory = cpuMonitor.getLastHistory();
185-
cv::Mat graph = frame(cv::Rect{cv::Point{graphPos, yPos}, graphSize} & cv::Rect(0, 0, frame.cols, frame.rows));
189+
cv::Rect intersection = cv::Rect{cv::Point(graphPos, yPos), graphSize} & cv::Rect{0, 0, frame.cols, frame.rows};
190+
if (!intersection.area()) {
191+
return;
192+
}
193+
cv::Mat graph = frame(intersection);
186194
graph = graph / 2 + cv::Scalar{127, 127, 127};
187195

188196
if (!lastHistory.empty()) {
@@ -227,7 +235,11 @@ void Presenter::drawGraphs(cv::Mat& frame) {
227235

228236
if (memoryMonitor.getHistorySize() > 1 && possibleHistorySize > 1 && --numberOfEnabledMonitors >= 0) {
229237
std::deque<std::pair<double, double>> lastHistory = memoryMonitor.getLastHistory();
230-
cv::Mat graph = frame(cv::Rect{cv::Point{graphPos, yPos}, graphSize} & cv::Rect(0, 0, frame.cols, frame.rows));
238+
cv::Rect intersection = cv::Rect{cv::Point(graphPos, yPos), graphSize} & cv::Rect{0, 0, frame.cols, frame.rows};
239+
if (!intersection.area()) {
240+
return;
241+
}
242+
cv::Mat graph = frame(intersection);
231243
graph = graph / 2 + cv::Scalar{127, 127, 127};
232244
int histxPos = graph.cols - 1;
233245
double range = std::min(memoryMonitor.getMaxMemTotal() + memoryMonitor.getMaxSwap(),

demos/common/src/args_helper.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,11 @@ std::map<std::string, uint32_t> parseValuePerDevice(const std::set<std::string>&
128128
}
129129
return result;
130130
}
131+
132+
cv::Size stringToSize(const std::string& str) {
133+
std::vector<std::string> strings = split(str, 'x');
134+
if (strings.size() != 2) {
135+
throw std::invalid_argument("Can't convert std::string to cv::Size. The string must contain exactly one x");
136+
}
137+
return {std::stoi(strings[0]), std::stoi(strings[1])};
138+
}

demos/common/src/images_capture.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,14 +112,15 @@ class VideoCapWrapper : public ImagesCapture {
112112
size_t readLengthLimit;
113113

114114
public:
115-
VideoCapWrapper(const std::string &input, bool loop, size_t initialImageId, size_t readLengthLimit)
115+
VideoCapWrapper(const std::string &input, bool loop, size_t initialImageId, size_t readLengthLimit,
116+
cv::Size cameraResolution)
116117
: ImagesCapture{loop}, nextImgId{0}, initialImageId{static_cast<double>(initialImageId)} {
117118
try {
118119
cap.open(std::stoi(input));
119120
this->readLengthLimit = loop ? std::numeric_limits<size_t>::max() : readLengthLimit;
120121
cap.set(cv::CAP_PROP_BUFFERSIZE, 1);
121-
cap.set(cv::CAP_PROP_FRAME_WIDTH, 1280);
122-
cap.set(cv::CAP_PROP_FRAME_HEIGHT, 720);
122+
cap.set(cv::CAP_PROP_FRAME_WIDTH, cameraResolution.width);
123+
cap.set(cv::CAP_PROP_FRAME_HEIGHT, cameraResolution.height);
123124
cap.set(cv::CAP_PROP_AUTOFOCUS, true);
124125
cap.set(cv::CAP_PROP_FOURCC, cv::VideoWriter::fourcc('M', 'J', 'P', 'G'));
125126
} catch (const std::invalid_argument&) {
@@ -162,7 +163,7 @@ class VideoCapWrapper : public ImagesCapture {
162163
};
163164

164165
std::unique_ptr<ImagesCapture> openImagesCapture(const std::string &input, bool loop, size_t initialImageId,
165-
size_t readLengthLimit) {
166+
size_t readLengthLimit, cv::Size cameraResolution) {
166167
if (readLengthLimit == 0) throw std::runtime_error{"Read length limit must be positive"};
167168
try {
168169
return std::unique_ptr<ImagesCapture>(new ImreadWrapper{input, loop});
@@ -171,7 +172,8 @@ std::unique_ptr<ImagesCapture> openImagesCapture(const std::string &input, bool
171172
return std::unique_ptr<ImagesCapture>(new DirReader{input, loop, initialImageId, readLengthLimit});
172173
} catch (const InvalidInput &) {}
173174
try {
174-
return std::unique_ptr<ImagesCapture>(new VideoCapWrapper{input, loop, initialImageId, readLengthLimit});
175+
return std::unique_ptr<ImagesCapture>(new VideoCapWrapper{input, loop, initialImageId, readLengthLimit,
176+
cameraResolution});
175177
} catch (const InvalidInput &) {}
176178
throw std::runtime_error{"Can't read " + input};
177179
}

demos/gaze_estimation_demo/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ Other demo objectives are:
2929

3030
Running the application with the `-h` option yields the following usage message:
3131
```
32-
./gaze_estimation_demo -h
3332
InferenceEngine:
3433
API version ............ <version>
3534
Build .................. <number>
@@ -38,7 +37,9 @@ gaze_estimation_demo [OPTION]
3837
Options:
3938
4039
-h Print a usage message.
41-
-i "<path>" Optional. Path to a video file. Default value is "cam" to work with camera.
40+
-i Required. An input to process. The input must be a single image, a folder of images or anything that cv::VideoCapture can process.
41+
-loop Optional. Enable reading the input in a loop.
42+
-res "<WxH>" Optional. Set camera resolution in format WxH.
4243
-m "<path>" Required. Path to an .xml file with a trained Gaze Estimation model.
4344
-m_fd "<path>" Required. Path to an .xml file with a trained Face Detection model.
4445
-m_hp "<path>" Required. Path to an .xml file with a trained Head Pose Estimation model.
@@ -48,8 +49,7 @@ Options:
4849
-d_fd "<device>" Optional. Target device for Face Detection network (the list of available devices is shown below). Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin. The demo will look for a suitable plugin for a specified device. Default value is "CPU".
4950
-d_hp "<device>" Optional. Target device for Head Pose Estimation network (the list of available devices is shown below). Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin. The demo will look for a suitable plugin for a specified device. Default value is "CPU".
5051
-d_lm "<device>" Optional. Target device for Facial Landmarks Estimation network (the list of available devices is shown below). Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin. The demo will look for a suitable plugin for a specified device. Default value is "CPU".
51-
-d_es "<device>" Optional. Target device for Open/Closed Eye Estimation network (the list of available devices is shown below). Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin. The demo will look for a suitable plugin for a specified device. Default value is "CPU".
52-
-res "<WxH>" Optional. Set camera resolution in format WxH.
52+
-d_es "<device>" Optional. Target device for Open/Closed Eye network (the list of available devices is shown below). Use "-d HETERO:<comma-separated_devices_list>" format to specify HETERO plugin. The demo will look for a suitable plugin for a specified device. Default value is "CPU".
5353
-fd_reshape Optional. Reshape Face Detector network so that its input resolution has the same aspect ratio as the input frame.
5454
-no_show Optional. Do not show processed video.
5555
-pc Optional. Enable per-layer performance report.

demos/gaze_estimation_demo/gaze_estimation_demo.hpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,12 @@
1010
#include <gflags/gflags.h>
1111
#include <iostream>
1212

13+
#include <samples/default_flags.hpp>
14+
15+
DEFINE_INPUT_FLAGS
16+
1317
static const char help_message[] = "Print a usage message.";
14-
static const char video_message[] = "Optional. Path to a video file. Default value is \"cam\" to work with camera.";
18+
static const char camera_resolution_message[] = "Optional. Set camera resolution in format WxH.";
1519
static const char gaze_estimation_model_message[] = "Required. Path to an .xml file with a trained Gaze Estimation model.";
1620
static const char face_detection_model_message[] = "Required. Path to an .xml file with a trained Face Detection model.";
1721
static const char head_pose_model_message[] = "Required. Path to an .xml file with a trained Head Pose Estimation model.";
@@ -30,11 +34,10 @@ static const char target_device_message_hp[] = "Optional. Target device for Head
3034
"The demo will look for a suitable plugin for a specified device. Default value is \"CPU\".";
3135
static const char target_device_message_lm[] = "Optional. Target device for Facial Landmarks Estimation network "
3236
"(the list of available devices is shown below). Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
33-
"The demo will look for a suitable plugin for device specified. Default value is \"CPU\".";
37+
"The demo will look for a suitable plugin for a specified device. Default value is \"CPU\".";
3438
static const char target_device_message_es[] = "Optional. Target device for Open/Closed Eye network "
3539
"(the list of available devices is shown below). Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
36-
"The demo will look for a suitable plugin for device specified. Default value is \"CPU\".";
37-
static const char camera_resolution_message[] = "Optional. Set camera resolution in format WxH.";
40+
"The demo will look for a suitable plugin for a specified device. Default value is \"CPU\".";
3841
static const char performance_counter_message[] = "Optional. Enable per-layer performance report.";
3942
static const char thresh_output_message[] = "Optional. Probability threshold for Face Detector. The default value is 0.5.";
4043
static const char raw_output_message[] = "Optional. Output inference results as raw values.";
@@ -43,7 +46,7 @@ static const char no_show_processed_video[] = "Optional. Do not show processed v
4346
static const char utilization_monitors_message[] = "Optional. List of monitors to show initially.";
4447

4548
DEFINE_bool(h, false, help_message);
46-
DEFINE_string(i, "cam", video_message);
49+
DEFINE_string(res, "1280x720", camera_resolution_message);
4750
DEFINE_string(m, "", gaze_estimation_model_message);
4851
DEFINE_string(m_fd, "", face_detection_model_message);
4952
DEFINE_string(m_hp, "", head_pose_model_message);
@@ -54,7 +57,6 @@ DEFINE_string(d_fd, "CPU", target_device_message_fd);
5457
DEFINE_string(d_hp, "CPU", target_device_message_hp);
5558
DEFINE_string(d_lm, "CPU", target_device_message_lm);
5659
DEFINE_string(d_es, "CPU", target_device_message_es);
57-
DEFINE_string(res, "", camera_resolution_message);
5860
DEFINE_bool(fd_reshape, false, fd_reshape_message);
5961
DEFINE_bool(pc, false, performance_counter_message);
6062
DEFINE_bool(r, false, raw_output_message);
@@ -72,7 +74,9 @@ static void showUsage() {
7274
std::cout << "Options:" << std::endl;
7375
std::cout << std::endl;
7476
std::cout << " -h " << help_message << std::endl;
75-
std::cout << " -i \"<path>\" " << video_message << std::endl;
77+
std::cout << " -i " << input_message << std::endl;
78+
std::cout << " -loop " << loop_message << std::endl;
79+
std::cout << " -res \"<WxH>\" " << camera_resolution_message << std::endl;
7680
std::cout << " -m \"<path>\" " << gaze_estimation_model_message << std::endl;
7781
std::cout << " -m_fd \"<path>\" " << face_detection_model_message << std::endl;
7882
std::cout << " -m_hp \"<path>\" " << head_pose_model_message << std::endl;
@@ -83,7 +87,6 @@ static void showUsage() {
8387
std::cout << " -d_hp \"<device>\" " << target_device_message_hp << std::endl;
8488
std::cout << " -d_lm \"<device>\" " << target_device_message_lm << std::endl;
8589
std::cout << " -d_es \"<device>\" " << target_device_message_es << std::endl;
86-
std::cout << " -res \"<WxH>\" " << camera_resolution_message << std::endl;
8790
std::cout << " -fd_reshape " << fd_reshape_message << std::endl;
8891
std::cout << " -no_show " << no_show_processed_video << std::endl;
8992
std::cout << " -pc " << performance_counter_message << std::endl;

demos/gaze_estimation_demo/main.cpp

Lines changed: 32 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include <inference_engine.hpp>
2929

3030
#include <monitors/presenter.h>
31+
#include <samples/args_helper.hpp>
32+
#include <samples/images_capture.h>
3133
#include <samples/ocv_common.hpp>
3234
#include <samples/slog.hpp>
3335

@@ -87,36 +89,6 @@ int main(int argc, char *argv[]) {
8789
return 0;
8890
}
8991

90-
slog::info << "Reading input" << slog::endl;
91-
cv::VideoCapture cap;
92-
93-
if (!(FLAGS_i == "cam" ? cap.open(0) : cap.open(FLAGS_i))) {
94-
throw std::logic_error("Cannot open input file or camera: " + FLAGS_i);
95-
}
96-
97-
// Parse camera resolution parameter and set camera resolution
98-
if (FLAGS_i == "cam" && FLAGS_res != "") {
99-
auto xPos = FLAGS_res.find("x");
100-
if (xPos == std::string::npos)
101-
throw std::runtime_error("Incorrect -res parameter format, please use 'x' to separate width and height");
102-
int frameWidth, frameHeight;
103-
std::stringstream widthStream(FLAGS_res.substr(0, xPos));
104-
widthStream >> frameWidth;
105-
std::stringstream heightStream(FLAGS_res.substr(xPos + 1));
106-
heightStream >> frameHeight;
107-
cap.set(cv::CAP_PROP_FRAME_WIDTH, frameWidth);
108-
cap.set(cv::CAP_PROP_FRAME_HEIGHT, frameHeight);
109-
}
110-
111-
// read input (video) frame
112-
cv::Mat frame;
113-
if (!cap.read(frame)) {
114-
throw std::logic_error("Failed to get frame from cv::VideoCapture");
115-
}
116-
117-
bool flipImage = false;
118-
ResultsMarker resultsMarker(false, false, false, true, true);
119-
12092
// Loading Inference Engine
12193
std::vector<std::pair<std::string, std::string>> cmdOptions = {
12294
{FLAGS_d, FLAGS_m}, {FLAGS_d_fd, FLAGS_m_fd},
@@ -150,10 +122,21 @@ int main(int argc, char *argv[]) {
150122
ExponentialAverager overallTimeAverager(smoothingFactor, 30.);
151123
ExponentialAverager inferenceTimeAverager(smoothingFactor, 30.);
152124

125+
bool flipImage = false;
126+
ResultsMarker resultsMarker(false, false, false, true, true);
153127
int delay = 1;
154128
std::string windowName = "Gaze estimation demo";
155-
cv::Size graphSize{static_cast<int>(cap.get(cv::CAP_PROP_FRAME_WIDTH) / 4), 60};
156-
Presenter presenter(FLAGS_u, static_cast<int>(cap.get(cv::CAP_PROP_FRAME_HEIGHT)) - graphSize.height - 10, graphSize);
129+
130+
std::unique_ptr<ImagesCapture> cap = openImagesCapture(FLAGS_i, FLAGS_loop, 0,
131+
std::numeric_limits<size_t>::max(), stringToSize(FLAGS_res));
132+
cv::Mat frame = cap->read();
133+
if (!frame.data) {
134+
throw std::runtime_error("Can't read an image from the input");
135+
}
136+
137+
cv::Size graphSize{frame.cols / 4, 60};
138+
Presenter presenter(FLAGS_u, frame.rows - graphSize.height - 10, graphSize);
139+
157140
auto tIterationBegins = cv::getTickCount();
158141
do {
159142
if (flipImage) {
@@ -192,10 +175,6 @@ int main(int argc, char *argv[]) {
192175
}
193176
}
194177

195-
if (FLAGS_no_show) {
196-
continue;
197-
}
198-
199178
presenter.drawGraphs(frame);
200179

201180
// Display the results
@@ -204,20 +183,23 @@ int main(int argc, char *argv[]) {
204183
}
205184
putTimingInfoOnFrame(frame, overallTimeAverager.getAveragedValue(),
206185
inferenceTimeAverager.getAveragedValue());
207-
cv::imshow(windowName, frame);
208-
209-
// Controls the information being displayed while demo runs
210-
int key = cv::waitKey(delay);
211-
resultsMarker.toggle(key);
212-
213-
// Press 'Esc' to quit, 'f' to flip the video horizontally
214-
if (key == 27)
215-
break;
216-
else if (key == 'f')
217-
flipImage = !flipImage;
218-
else
219-
presenter.handleKey(key);
220-
} while (cap.read(frame));
186+
if (!FLAGS_no_show) {
187+
cv::imshow(windowName, frame);
188+
189+
// Controls the information being displayed while demo runs
190+
int key = cv::waitKey(delay);
191+
resultsMarker.toggle(key);
192+
193+
// Press 'Esc' to quit, 'f' to flip the video horizontally
194+
if (key == 27)
195+
break;
196+
if (key == 'f')
197+
flipImage = !flipImage;
198+
else
199+
presenter.handleKey(key);
200+
}
201+
frame = cap->read();
202+
} while (frame.data);
221203
std::cout << presenter.reportMeans() << '\n';
222204
}
223205
catch (const std::exception& error) {

0 commit comments

Comments (0)