
Commit b2e6625

Merge pull request openvinotoolkit#1417 from Wovchena/interactive_face_detection-update

interactive_face_detection: update

Authored by Roman Donchenko · 2 parents 128677a + c9f0aa3

File tree: 6 files changed, +71 -127 lines


demos/interactive_face_detection_demo/README.md

Lines changed: 2 additions & 4 deletions
````diff
@@ -38,7 +38,6 @@ The new Async API operates with a new notion of the Infer Request that encapsula
 Running the application with the `-h` option yields the following usage message:
 
 ```
-./interactive_face_detection_demo -h
 InferenceEngine:
     API version ............ <version>
     Build .................. <number>
@@ -47,7 +46,8 @@ interactive_face_detection_demo [OPTION]
 Options:
 
     -h                         Print a usage message
-    -i "<path>"                Required. Path to a video file (specify "cam" to work with camera).
+    -i                         Required. An input to process. The input must be a single image, a folder of images or anything that cv::VideoCapture can process.
+    -loop                      Optional. Enable reading the input in a loop.
     -o "<path>"                Optional. Path to an output video file.
     -m "<path>"                Required. Path to an .xml file with a trained Face Detection model.
     -m_ag "<path>"             Optional. Path to an .xml file with a trained Age/Gender Recognition model.
@@ -71,7 +71,6 @@ Options:
     -dyn_em                    Optional. Enable dynamic batch size for Emotions Recognition network
     -dyn_lm                    Optional. Enable dynamic batch size for Facial Landmarks Estimation network
     -async                     Optional. Enable asynchronous mode
-    -no_wait                   Optional. Do not wait for key press in the end.
     -no_show                   Optional. Do not show processed video.
     -pc                        Optional. Enable per-layer performance report
     -r                         Optional. Output inference results as raw values
@@ -80,7 +79,6 @@ Options:
     -dx_coef                   Optional. Coefficient to shift the bounding box around the detected face along the Ox axis
     -dy_coef                   Optional. Coefficient to shift the bounding box around the detected face along the Oy axis
    -fps                       Optional. Maximum FPS for playing video
-    -loop_video                Optional. Enable playing video on a loop
    -no_smooth                 Optional. Do not smooth person attributes
    -no_show_emotion_bar       Optional. Do not show emotion bar
    -u                         Optional. List of monitors to show initially.
````

demos/interactive_face_detection_demo/interactive_face_detection.hpp

Lines changed: 9 additions & 12 deletions
```diff
@@ -10,8 +10,11 @@
 #include <gflags/gflags.h>
 #include <iostream>
 
+#include <samples/default_flags.hpp>
+
+DEFINE_INPUT_FLAGS
+
 static const char help_message[] = "Print a usage message";
-static const char input_video_message[] = "Required. Path to a video file (specify \"cam\" to work with camera).";
 static const char output_video_message[] = "Optional. Path to an output video file.";
 static const char face_detection_model_message[] = "Required. Path to an .xml file with a trained Face Detection model.";
 static const char age_gender_model_message[] = "Optional. Path to an .xml file with a trained Age/Gender Recognition model.";
```
```diff
@@ -34,7 +37,7 @@ static const char target_device_message_em[] = "Optional. Target device for Emot
                                                "The demo will look for a suitable plugin for a specified device.";
 static const char target_device_message_lm[] = "Optional. Target device for Facial Landmarks Estimation network "
                                                "(the list of available devices is shown below). Default value is CPU. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
-                                               "The demo will look for a suitable plugin for device specified.";
+                                               "The demo will look for a suitable plugin for a specified device.";
 static const char num_batch_ag_message[] = "Optional. Number of maximum simultaneously processed faces for Age/Gender Recognition network "
                                            "(by default, it is 16)";
 static const char num_batch_hp_message[] = "Optional. Number of maximum simultaneously processed faces for Head Pose Estimation network "
```
```diff
@@ -55,19 +58,16 @@ static const char custom_cpu_library_message[] = "Required for CPU custom layers
 static const char thresh_output_message[] = "Optional. Probability threshold for detections";
 static const char bb_enlarge_coef_output_message[] = "Optional. Coefficient to enlarge/reduce the size of the bounding box around the detected face";
 static const char raw_output_message[] = "Optional. Output inference results as raw values";
-static const char no_wait_for_keypress_message[] = "Optional. Do not wait for key press in the end.";
 static const char no_show_processed_video[] = "Optional. Do not show processed video.";
 static const char async_message[] = "Optional. Enable asynchronous mode";
 static const char dx_coef_output_message[] = "Optional. Coefficient to shift the bounding box around the detected face along the Ox axis";
 static const char dy_coef_output_message[] = "Optional. Coefficient to shift the bounding box around the detected face along the Oy axis";
 static const char fps_output_message[] = "Optional. Maximum FPS for playing video";
-static const char loop_video_output_message[] = "Optional. Enable playing video on a loop";
 static const char no_smooth_output_message[] = "Optional. Do not smooth person attributes";
 static const char no_show_emotion_bar_message[] = "Optional. Do not show emotion bar";
 static const char utilization_monitors_message[] = "Optional. List of monitors to show initially.";
 
 DEFINE_bool(h, false, help_message);
-DEFINE_string(i, "", input_video_message);
 DEFINE_string(o, "", output_video_message);
 DEFINE_string(m, "", face_detection_model_message);
 DEFINE_string(m_ag, "", age_gender_model_message);
```
```diff
@@ -93,13 +93,11 @@ DEFINE_string(l, "", custom_cpu_library_message);
 DEFINE_bool(r, false, raw_output_message);
 DEFINE_double(t, 0.5, thresh_output_message);
 DEFINE_double(bb_enlarge_coef, 1.2, bb_enlarge_coef_output_message);
-DEFINE_bool(no_wait, false, no_wait_for_keypress_message);
 DEFINE_bool(no_show, false, no_show_processed_video);
 DEFINE_bool(async, false, async_message);
 DEFINE_double(dx_coef, 1, dx_coef_output_message);
 DEFINE_double(dy_coef, 1, dy_coef_output_message);
-DEFINE_double(fps, -1, fps_output_message);
-DEFINE_bool(loop_video, false, loop_video_output_message);
+DEFINE_double(fps, -std::numeric_limits<double>::infinity(), fps_output_message);
 DEFINE_bool(no_smooth, false, no_smooth_output_message);
 DEFINE_bool(no_show_emotion_bar, false, no_show_emotion_bar_message);
 DEFINE_string(u, "", utilization_monitors_message);
```
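Note: the new `-fps` default of negative infinity works because `main.cpp` now computes `msrate = 1000.0 / FLAGS_fps` unconditionally and clamps the resulting frame delay with `std::max(1, ...)`. Dividing by negative infinity yields `-0.0`, so the delay collapses to 1 ms unless a positive `-fps` is given. A minimal sketch of the arithmetic (the 7.5 ms timer reading is an assumed value):

```cpp
#include <algorithm>
#include <iostream>
#include <limits>

int main() {
    double fps = -std::numeric_limits<double>::infinity();  // the new flag default
    double msrate = 1000.0 / fps;                            // IEEE 754: equals -0.0
    double lastCallDuration = 7.5;                           // assumed timer reading, in ms
    // Same clamping expression as in main.cpp's display path:
    int delay = std::max(1, static_cast<int>(msrate - lastCallDuration));
    std::cout << delay << '\n';                              // prints 1: run unthrottled
}
```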
```diff
@@ -111,11 +109,12 @@ DEFINE_string(u, "", utilization_monitors_message);
 
 static void showUsage() {
     std::cout << std::endl;
-    std::cout << "interactive_face_detection [OPTION]" << std::endl;
+    std::cout << "interactive_face_detection_demo [OPTION]" << std::endl;
     std::cout << "Options:" << std::endl;
     std::cout << std::endl;
     std::cout << "    -h                         " << help_message << std::endl;
-    std::cout << "    -i \"<path>\"                " << input_video_message << std::endl;
+    std::cout << "    -i                         " << input_message << std::endl;
+    std::cout << "    -loop                      " << loop_message << std::endl;
     std::cout << "    -o \"<path>\"                " << output_video_message << std::endl;
     std::cout << "    -m \"<path>\"                " << face_detection_model_message<< std::endl;
     std::cout << "    -m_ag \"<path>\"             " << age_gender_model_message << std::endl;
```
```diff
@@ -139,7 +138,6 @@ static void showUsage() {
     std::cout << "    -dyn_em                    " << dyn_batch_em_message << std::endl;
     std::cout << "    -dyn_lm                    " << dyn_batch_lm_message << std::endl;
     std::cout << "    -async                     " << async_message << std::endl;
-    std::cout << "    -no_wait                   " << no_wait_for_keypress_message << std::endl;
     std::cout << "    -no_show                   " << no_show_processed_video << std::endl;
     std::cout << "    -pc                        " << performance_counter_message << std::endl;
     std::cout << "    -r                         " << raw_output_message << std::endl;
```
```diff
@@ -148,7 +146,6 @@ static void showUsage() {
     std::cout << "    -dx_coef                   " << dx_coef_output_message << std::endl;
     std::cout << "    -dy_coef                   " << dy_coef_output_message << std::endl;
     std::cout << "    -fps                       " << fps_output_message << std::endl;
-    std::cout << "    -loop_video                " << loop_video_output_message << std::endl;
     std::cout << "    -no_smooth                 " << no_smooth_output_message << std::endl;
     std::cout << "    -no_show_emotion_bar       " << no_show_emotion_bar_message << std::endl;
     std::cout << "    -u                         " << utilization_monitors_message << std::endl;
```
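Note: `DEFINE_INPUT_FLAGS` comes from the newly included `<samples/default_flags.hpp>`, whose body is not part of this diff. Judging from the identifiers the file now relies on (`FLAGS_i`, `FLAGS_loop`, `input_message`, `loop_message`), a plausible expansion looks roughly like the sketch below; treat the exact names and wording as assumptions:

```cpp
// Hypothetical expansion of DEFINE_INPUT_FLAGS, inferred from usage only.
// Assumes <gflags/gflags.h> is already included, as it is in this header.
#define DEFINE_INPUT_FLAGS                                                    \
    static const char input_message[] =                                       \
        "Required. An input to process. The input must be a single image, "  \
        "a folder of images or anything that cv::VideoCapture can process."; \
    static const char loop_message[] =                                        \
        "Optional. Enable reading the input in a loop.";                      \
    DEFINE_string(i, "", input_message);                                      \
    DEFINE_bool(loop, false, loop_message);
```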

demos/interactive_face_detection_demo/main.cpp

Lines changed: 46 additions & 108 deletions
```diff
@@ -25,6 +25,7 @@
 #include <inference_engine.hpp>
 
 #include <monitors/presenter.h>
+#include <samples/images_capture.h>
 #include <samples/ocv_common.hpp>
 #include <samples/slog.hpp>
 
```
```diff
@@ -61,10 +62,6 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
     if (FLAGS_n_hp < 1) {
         throw std::logic_error("Parameter -n_hp cannot be 0");
     }
-
-    // no need to wait for a key press from a user if an output image/video file is not shown.
-    FLAGS_no_wait |= FLAGS_no_show;
-
     return true;
 }
```
```diff
@@ -77,27 +74,6 @@ int main(int argc, char *argv[]) {
             return 0;
         }
 
-        slog::info << "Reading input" << slog::endl;
-        cv::VideoCapture cap;
-        if (!(FLAGS_i == "cam" ? cap.open(0) : cap.open(FLAGS_i))) {
-            throw std::logic_error("Cannot open input file or camera: " + FLAGS_i);
-        }
-
-        Timer timer;
-        // read input (video) frame
-        cv::Mat frame;
-        if (!cap.read(frame)) {
-            throw std::logic_error("Failed to get frame from cv::VideoCapture");
-        }
-
-        const size_t width = static_cast<size_t>(frame.cols);
-        const size_t height = static_cast<size_t>(frame.rows);
-
-        cv::VideoWriter videoWriter;
-        if (!FLAGS_o.empty()) {
-            videoWriter.open(FLAGS_o, cv::VideoWriter::fourcc('I', 'Y', 'U', 'V'), 25, cv::Size(width, height));
-        }
-        // ---------------------------------------------------------------------------------------------------
         // --------------------------- 1. Loading Inference Engine -----------------------------
 
         Core ie;
```
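Note: the deleted `cv::VideoCapture` setup is replaced by the `ImagesCapture` abstraction from the newly included `<samples/images_capture.h>`. Its declarations are not shown in this diff; the interface below is reconstructed from how the rewritten `main.cpp` calls it (`openImagesCapture(FLAGS_i, FLAGS_loop)`, `cap->read()`, `cap->fps()`), so treat it as an illustrative assumption rather than the verbatim header:

```cpp
#include <memory>
#include <string>
#include <opencv2/core.hpp>

// Illustrative reconstruction of the capture interface used by main.cpp.
class ImagesCapture {
public:
    virtual cv::Mat read() = 0;      // an empty cv::Mat signals end of input
    virtual double fps() const = 0;  // native FPS, used as the VideoWriter fallback
    virtual ~ImagesCapture() = default;
};

// Expected to pick a backend for a single image, a folder of images,
// or anything cv::VideoCapture can open; wraps around when loop is true.
std::unique_ptr<ImagesCapture> openImagesCapture(const std::string& input, bool loop);
```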
```diff
@@ -163,73 +139,75 @@ int main(int argc, char *argv[]) {
         Load(facialLandmarksDetector).into(ie, FLAGS_d_lm, FLAGS_dyn_lm);
         // ----------------------------------------------------------------------------------------------------
 
-        // --------------------------- 3. Doing inference -----------------------------------------------------
-        // Starting inference & calculating performance
-        slog::info << "Start inference " << slog::endl;
-
         bool isFaceAnalyticsEnabled = ageGenderDetector.enabled() || headPoseDetector.enabled() ||
                                       emotionsDetector.enabled() || facialLandmarksDetector.enabled();
 
+        Timer timer;
         std::ostringstream out;
         size_t framesCounter = 0;
-        int delay = 1;
-        double msrate = -1;
-        cv::Mat prev_frame, next_frame;
+        double msrate = 1000.0 / FLAGS_fps;
         std::list<Face::Ptr> faces;
         size_t id = 0;
 
-        if (FLAGS_fps > 0) {
-            msrate = 1000.f / FLAGS_fps;
+        std::unique_ptr<ImagesCapture> cap = openImagesCapture(FLAGS_i, FLAGS_loop);
+        cv::Mat frame = cap->read();
+        if (!frame.data) {
+            throw std::runtime_error("Can't read an image from the input");
         }
 
-        Visualizer::Ptr visualizer;
-        if (!FLAGS_no_show || !FLAGS_o.empty()) {
-            visualizer = std::make_shared<Visualizer>(cv::Size(width, height));
-            if (!FLAGS_no_show_emotion_bar && emotionsDetector.enabled()) {
-                visualizer->enableEmotionBar(emotionsDetector.emotionsVec);
+        const cv::Point THROUGHPUT_METRIC_POSITION{10, 45};
+        Presenter presenter(FLAGS_u, THROUGHPUT_METRIC_POSITION.y + 15, {frame.cols / 4, 60});
+
+        Visualizer visualizer{frame.size()};
+        if (!FLAGS_no_show_emotion_bar && emotionsDetector.enabled()) {
+            visualizer.enableEmotionBar(emotionsDetector.emotionsVec);
+        }
+
+        cv::VideoWriter videoWriter;
+        if (!FLAGS_o.empty()) {
+            videoWriter.open(FLAGS_o, cv::VideoWriter::fourcc('I', 'Y', 'U', 'V'),
+                             !FLAGS_no_show && FLAGS_fps > 0.0 ? FLAGS_fps : cap->fps(), frame.size());
+            if (!videoWriter.isOpened()) {
+                throw std::runtime_error("Can't open video writer");
             }
         }
 
         // Detecting all faces on the first frame and reading the next one
         faceDetector.enqueue(frame);
         faceDetector.submitRequest();
 
-        prev_frame = frame.clone();
-
-        // Reading the next frame
-        bool frameReadStatus = cap.read(frame);
+        cv::Mat next_frame = cap->read();
 
         std::cout << "To close the application, press 'CTRL+C' here";
         if (!FLAGS_no_show) {
             std::cout << " or switch to the output window and press Q or Esc";
         }
         std::cout << std::endl;
 
-        const cv::Point THROUGHPUT_METRIC_POSITION{10, 45};
-
-        cv::Size graphSize{static_cast<int>(cap.get(cv::CAP_PROP_FRAME_WIDTH) / 4), 60};
-        Presenter presenter(FLAGS_u, THROUGHPUT_METRIC_POSITION.y + 15, graphSize);
-
-        while (true) {
+        while (frame.data) {
             timer.start("total");
+            cv::Mat prev_frame = std::move(frame);
+            frame = std::move(next_frame);
             framesCounter++;
-            bool isLastFrame = !frameReadStatus;
 
             // Retrieving face detection results for the previous frame
             faceDetector.wait();
             faceDetector.fetchResults();
             auto prev_detection_results = faceDetector.results;
 
             // No valid frame to infer if previous frame is the last
-            if (!isLastFrame) {
+            if (frame.data) {
+                if (frame.size() != prev_frame.size()) {
+                    throw std::runtime_error("Images of different size are not supported");
+                }
                 faceDetector.enqueue(frame);
                 faceDetector.submitRequest();
             }
 
             // Filling inputs of face analytics networks
             for (auto &&face : prev_detection_results) {
                 if (isFaceAnalyticsEnabled) {
-                    auto clippedRect = face.location & cv::Rect(0, 0, width, height);
+                    cv::Rect clippedRect = face.location & cv::Rect({0, 0}, prev_frame.size());
                     cv::Mat face = prev_frame(clippedRect);
                     ageGenderDetector.enqueue(face);
                     headPoseDetector.enqueue(face);
```
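Note: the rewritten loop keeps the old code's two-frame pipeline but makes the hand-off explicit with `std::move`: face detection for the newly grabbed `frame` is submitted right after the results for `prev_frame` are fetched, so drawing and post-processing of `prev_frame` overlap with inference on `frame`. A self-contained sketch of the same read-ahead pattern, with `std::async` standing in for the demo's detector requests (all names here are illustrative):

```cpp
#include <future>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for submitting one frame to faceDetector and waiting on it.
std::string infer(int frameId) {
    return "results for frame " + std::to_string(frameId);
}

int main() {
    std::vector<int> frames{0, 1, 2, 3};  // stand-in for cap->read()
    // Submit the first frame before entering the loop, as the demo does.
    auto pending = std::async(std::launch::async, infer, frames[0]);
    for (size_t i = 0; i < frames.size(); ++i) {
        std::string results = pending.get();  // wait for the previous submission
        if (i + 1 < frames.size()) {
            // Submit the next frame immediately, then post-process the current
            // results while that inference runs in the background.
            pending = std::async(std::launch::async, infer, frames[i + 1]);
        }
        std::cout << results << '\n';
    }
}
```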
```diff
@@ -246,16 +224,8 @@ int main(int argc, char *argv[]) {
                 facialLandmarksDetector.submitRequest();
             }
 
-            // Reading the next frame if the current one is not the last
-            if (!isLastFrame) {
-                frameReadStatus = cap.read(next_frame);
-                if (FLAGS_loop_video && !frameReadStatus) {
-                    if (!(FLAGS_i == "cam" ? cap.open(0) : cap.open(FLAGS_i))) {
-                        throw std::logic_error("Cannot open input file or camera: " + FLAGS_i);
-                    }
-                    frameReadStatus = cap.read(next_frame);
-                }
-            }
+            // Read the next frame while waiting for inference results
+            next_frame = cap->read();
 
             if (isFaceAnalyticsEnabled) {
                 ageGenderDetector.wait();
```
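Note: the deleted reopen-on-EOF block is exactly the part of the old `-loop_video` behavior that moved behind the capture abstraction; with `-loop`, wrapping around the input is now the capture's job rather than the pipeline's. A minimal sketch of how a capture can absorb that logic (a hypothetical class assuming plain `cv::VideoCapture` underneath; the real ImagesCapture implementation may differ):

```cpp
#include <stdexcept>
#include <string>
#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>

class LoopingVideoCapture {
    cv::VideoCapture cap;
    std::string input;
    bool loop;

public:
    LoopingVideoCapture(const std::string& input, bool loop) : input(input), loop(loop) {
        if (!cap.open(input)) {
            throw std::runtime_error("Can't open input: " + input);
        }
    }

    cv::Mat read() {
        cv::Mat frame;
        if (!cap.read(frame) && loop && cap.open(input)) {
            cap.read(frame);  // reopen and restart from the first frame
        }
        return frame;  // an empty Mat signals end of stream
    }
};
```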
```diff
@@ -276,7 +246,7 @@ int main(int argc, char *argv[]) {
             // For every detected face
             for (size_t i = 0; i < prev_detection_results.size(); i++) {
                 auto& result = prev_detection_results[i];
-                cv::Rect rect = result.location & cv::Rect(0, 0, width, height);
+                cv::Rect rect = result.location & cv::Rect({0, 0}, prev_frame.size());
 
                 Face::Ptr face;
                 if (!FLAGS_no_smooth) {
```
```diff
@@ -327,44 +297,23 @@ int main(int argc, char *argv[]) {
 
             presenter.drawGraphs(prev_frame);
 
-            // Visualizing results
-            if (!FLAGS_no_show || !FLAGS_o.empty()) {
-                out.str("");
-                out << "Total image throughput: " << std::fixed << std::setprecision(2)
-                    << 1000.f / (timer["total"].getSmoothedDuration()) << " fps";
-                cv::putText(prev_frame, out.str(), THROUGHPUT_METRIC_POSITION, cv::FONT_HERSHEY_TRIPLEX, 1,
-                            cv::Scalar(255, 0, 0), 2);
-
-                // drawing faces
-                visualizer->draw(prev_frame, faces);
-
-                if (!FLAGS_no_show) {
-                    cv::imshow("Detection results", prev_frame);
-                }
-            }
-
-            if (!FLAGS_o.empty()) {
-                videoWriter.write(prev_frame);
-            }
-
-            prev_frame = frame;
-            frame = next_frame;
-            next_frame = cv::Mat();
+            // drawing faces
+            visualizer.draw(prev_frame, faces);
 
             timer.finish("total");
+            out.str("");
+            out << "Total image throughput: " << std::fixed << std::setprecision(1)
+                << 1000.0 / (timer["total"].getSmoothedDuration()) << " fps";
+            cv::putText(prev_frame, out.str(), THROUGHPUT_METRIC_POSITION, cv::FONT_HERSHEY_TRIPLEX, 1,
+                        cv::Scalar(255, 0, 0), 2);
 
-            if (FLAGS_fps > 0) {
-                delay = std::max(1, static_cast<int>(msrate - timer["total"].getLastCallDuration()));
+            if (videoWriter.isOpened()) {
+                videoWriter.write(prev_frame);
             }
 
-            // End of file (or a single frame file like an image). The last frame is displayed to let you check what is shown
-            if (isLastFrame) {
-                if (!FLAGS_no_wait) {
-                    std::cout << "No more frames to process!" << std::endl;
-                    cv::waitKey(0);
-                }
-                break;
-            } else if (!FLAGS_no_show) {
+            int delay = std::max(1, static_cast<int>(msrate - timer["total"].getLastCallDuration()));
+            if (!FLAGS_no_show) {
+                cv::imshow("Detection results", prev_frame);
                 int key = cv::waitKey(delay);
                 if (27 == key || 'Q' == key || 'q' == key) {
                     break;
```
```diff
@@ -374,7 +323,7 @@ int main(int argc, char *argv[]) {
         }
 
         slog::info << "Number of processed frames: " << framesCounter << slog::endl;
-        slog::info << "Total image throughput: " << framesCounter * (1000.f / timer["total"].getTotalDuration()) << " fps" << slog::endl;
+        slog::info << "Total image throughput: " << framesCounter * (1000.0 / timer["total"].getTotalDuration()) << " fps" << slog::endl;
 
         // Showing performance results
         if (FLAGS_pc) {
```
```diff
@@ -386,17 +335,6 @@ int main(int argc, char *argv[]) {
         }
 
         std::cout << presenter.reportMeans() << '\n';
-        // ---------------------------------------------------------------------------------------------------
-
-        if (!FLAGS_o.empty()) {
-            videoWriter.release();
-        }
-
-        // release input video stream
-        cap.release();
-
-        // close windows
-        cv::destroyAllWindows();
     }
     catch (const std::exception& error) {
         slog::err << error.what() << slog::endl;
```
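Note: dropping the explicit `release()` calls is safe because the OpenCV handles are RAII types: `cv::VideoCapture` and `cv::VideoWriter` free their resources in their destructors when they leave the `try` block, and any HighGUI windows are torn down at process exit. A small illustration (file name and codec are chosen for the example):

```cpp
#include <opencv2/core.hpp>
#include <opencv2/videoio.hpp>

void writeOneFrame(const cv::Mat& frame) {
    cv::VideoWriter writer("out.avi", cv::VideoWriter::fourcc('I', 'Y', 'U', 'V'),
                           25.0, frame.size());
    writer.write(frame);
}  // writer.release() runs implicitly in the destructor here
```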

demos/interactive_face_detection_demo/visualizer.hpp

Lines changed: 0 additions & 2 deletions
```diff
@@ -80,8 +80,6 @@ class HeadPoseVisualizer {
 // Drawing detected faces on the frame
 class Visualizer {
 public:
-    using Ptr = std::shared_ptr<Visualizer>;
-
     enum AnchorType {
         TL = 0,
         TR,
```
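Note: removing the `Ptr` alias matches the switch in `main.cpp` from `std::make_shared<Visualizer>(...)` to a plain `Visualizer visualizer{frame.size()};`. A stand-in comparison of the two ownership styles (the `Widget` class is hypothetical, since `Visualizer` itself needs the demo's headers):

```cpp
#include <memory>
#include <opencv2/core.hpp>

struct Widget {
    explicit Widget(cv::Size size) : size(size) {}
    cv::Size size;
};

int main() {
    // Before: shared ownership and a heap allocation for a single-owner object.
    std::shared_ptr<Widget> p = std::make_shared<Widget>(cv::Size{640, 480});
    // After: a plain value with automatic storage; no reference counting.
    Widget w{cv::Size{640, 480}};
    return (w.size == p->size) ? 0 : 1;
}
```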
