1
+ #include " detector_yolov8face.h"
2
+ using namespace std ;
3
+
4
+ Detector_Yolov8Face::Detector_Yolov8Face ()
5
+ {
6
+ }
7
+ Detector_Yolov8Face::~Detector_Yolov8Face ()
8
+ {
9
+ }
10
+ HZFLAG Detector_Yolov8Face::InitDetector_Yolov8Face (Config& config)
11
+ {
12
+
13
+ this ->conf_thresh =config.yolov8face_confidence_thresh ;
14
+ this ->nms_thresh =config.yolov8face_nms_thresh ;
15
+ this ->batch_size =config.yolov8face_detect_bs ;
16
+
17
+ this ->NUM_CLASSES =1 ;
18
+ this ->CKPT_NUM =5 ;
19
+ this ->NUM_BOX_ELEMENT =7 +CKPT_NUM*2 ;
20
+
21
+ this ->INPUT_BLOB_NAME = " images" ;
22
+ this ->OUTPUT_BLOB_NAME = " output0" ;
23
+ cudaSetDevice (config.gpu_id );
24
+ std::string directory;
25
+ const size_t last_slash_idx=config.Yolov8FactDetectModelPath .rfind (" .onnx" );
26
+ if (std::string::npos != last_slash_idx)
27
+ {
28
+ directory = config.Yolov8FactDetectModelPath .substr (0 , last_slash_idx);
29
+ }
30
+ std::string out_engine=directory+" _batch=" +std::to_string (config.yolov8face_detect_bs )+" .engine" ;
31
+ bool enginemodel=model_exists (out_engine);
32
+ if (!enginemodel)
33
+ {
34
+ std::cout << " Building engine, please wait for a while..." << std::endl;
35
+ bool wts_model=model_exists (config.Yolov8FactDetectModelPath );
36
+ if (!wts_model)
37
+ {
38
+ std::cout<<" yolov8s-face.onnx is not Exist!!!Please Check!" <<std::endl;
39
+ return HZ_WITHOUTMODEL;
40
+ }
41
+ Onnx2Ttr onnx2trt;
42
+ // IHostMemory* modelStream{ nullptr };
43
+ onnx2trt.onnxToTRTModel (gLogger ,config.Yolov8FactDetectModelPath .c_str (),config.yolov8face_detect_bs ,out_engine.c_str ());
44
+ }
45
+ size_t size{0 };
46
+ std::ifstream file (out_engine, std::ios::binary);// out_engine"/home/pcb/FaceRecognition_Linux_Release/yolov8face_test/yolov8-face-tensorrt/yolov8s-face_batch=1.engine"
47
+ if (file.good ())
48
+ {
49
+ file.seekg (0 , file.end );
50
+ size = file.tellg ();
51
+ file.seekg (0 , file.beg );
52
+ trtModelStream = new char [size];
53
+ assert (trtModelStream);
54
+ file.read (trtModelStream, size);
55
+ file.close ();
56
+ }
57
+ else
58
+ {
59
+ std::cout<<" yolov8s-face.engine model file not exist!" <<std::endl;
60
+ return HZ_WITHOUTMODEL;
61
+ }
62
+
63
+ this ->runtime = createInferRuntime (gLogger );
64
+ assert (runtime != nullptr );
65
+ this ->engine = runtime->deserializeCudaEngine (trtModelStream, size);
66
+ assert (engine != nullptr );
67
+ this ->context = engine->createExecutionContext ();
68
+ assert (context != nullptr );
69
+ delete[] trtModelStream;
70
+ assert (engine->getNbBindings () == 2 );
71
+ this ->inputIndex = engine->getBindingIndex (INPUT_BLOB_NAME);
72
+ this ->outputIndex = engine->getBindingIndex (OUTPUT_BLOB_NAME);
73
+ assert (inputIndex == 0 );
74
+ assert (outputIndex == 1 );
75
+
76
+ // input nchw
77
+ auto input_dims = engine->getBindingDimensions (0 );
78
+ this ->INPUT_W = input_dims.d [3 ];
79
+ this ->INPUT_H = input_dims.d [2 ];
80
+
81
+ // 1*20*8400
82
+ auto output_dims = engine->getBindingDimensions (1 );
83
+ this ->OUTPUT_ELEMENT =output_dims.d [1 ];
84
+ this ->OUTPUT_CANDIDATES = output_dims.d [2 ];
85
+ this ->OUTPUT_SIZE =this ->OUTPUT_ELEMENT *this ->OUTPUT_CANDIDATES ;
86
+ // 1*20*8400
87
+
88
+
89
+ // Create GPU buffers on device
90
+ CHECK (cudaMalloc (&this ->buffers [inputIndex], config.yolov8face_detect_bs * 3 * INPUT_H * INPUT_W * sizeof (float )));
91
+ CHECK (cudaMalloc (&this ->buffers [outputIndex], config.yolov8face_detect_bs * OUTPUT_SIZE * sizeof (float )));
92
+ // Create stream
93
+ CHECK (cudaStreamCreate (&stream));
94
+ // prepare input data cache in pinned memory
95
+ CHECK (cudaMallocHost ((void **)&img_host, config.yolov8face_detect_bs *MAX_IMAGE_INPUT_SIZE_THRESH * 3 *sizeof (uint8_t )));
96
+ // prepare input data cache in device memory
97
+ CHECK (cudaMalloc ((void **)&img_device, config.yolov8face_detect_bs *MAX_IMAGE_INPUT_SIZE_THRESH * 3 *sizeof (uint8_t )));
98
+
99
+ // postprocess input data cache in device memory
100
+ CHECK (cudaMalloc (&decode_ptr_device,sizeof (float )*(1 +MAX_OBJECTS*NUM_BOX_ELEMENT)));
101
+
102
+ CHECK (cudaMalloc ((void **)&pre_predict, OUTPUT_SIZE * sizeof (float )));
103
+
104
+ CHECK (cudaMallocHost (&affine_matrix_d2i_host,sizeof (float )*6 ));
105
+
106
+ CHECK (cudaMalloc (&transpose_device, OUTPUT_SIZE * sizeof (float )));
107
+
108
+ this ->affine_matrix_d2i_device =new float *[batch_size];
109
+ this ->decode_ptr_host =new float *[batch_size];
110
+ for (size_t i = 0 ; i < batch_size; i++)
111
+ {
112
+ this ->decode_ptr_host [i]= new float [(1 +MAX_OBJECTS*NUM_BOX_ELEMENT)];
113
+ CHECK (cudaMalloc (&this ->affine_matrix_d2i_device [i],sizeof (float )*6 ));
114
+ }
115
+ return HZ_SUCCESS;
116
+ }
117
+
118
+ HZFLAG Detector_Yolov8Face::Detect_Yolov8Face (std::vector<cv::Mat>&ImgVec,std::vector<std::vector<Det>>& dets)
119
+ {
120
+ // prepare input data ---------------------------
121
+ int detector_batchsize=ImgVec.size ();
122
+ float * buffer_idx = (float *)this ->buffers [inputIndex];
123
+ for (int b = 0 ; b < detector_batchsize; b++)
124
+ {
125
+ if (ImgVec[b].empty ()||ImgVec[b].data ==NULL )
126
+ {
127
+ continue ;
128
+ }
129
+ // proprecess
130
+ affineMatrix afmt;
131
+ getd2i (afmt,cv::Size (INPUT_W,INPUT_H),cv::Size (ImgVec[b].cols ,ImgVec[b].rows ));
132
+ size_t size_image = ImgVec[b].cols * ImgVec[b].rows * 3 *sizeof (uint8_t );
133
+ size_t size_image_dst = INPUT_H * INPUT_W * 3 *sizeof (uint8_t );
134
+ memcpy (affine_matrix_d2i_host,afmt.d2i ,sizeof (afmt.d2i ));
135
+ memcpy (img_host, ImgVec[b].data , size_image);
136
+ CHECK (cudaMemcpy (img_device, img_host, size_image, cudaMemcpyHostToDevice));
137
+ CHECK (cudaMemcpy (affine_matrix_d2i_device[b],affine_matrix_d2i_host,sizeof (afmt.d2i ),cudaMemcpyHostToDevice));
138
+ yolov8face_preprocess_kernel_img (img_device, ImgVec[b].cols , ImgVec[b].rows , buffer_idx, INPUT_W, INPUT_H,affine_matrix_d2i_device[b], stream);
139
+ buffer_idx += size_image_dst;
140
+ }
141
+ // inference
142
+ // (*context).enqueue(detector_batchsize,(void**)this->buffers, stream, nullptr);
143
+ (*context).enqueueV2 ((void **)this ->buffers , stream, nullptr );
144
+
145
+ // postprocess
146
+ float *predict = (float *)this ->buffers [outputIndex];
147
+ for (size_t i = 0 ; i < detector_batchsize; i++)
148
+ {
149
+ CHECK (cudaMemsetAsync (decode_ptr_device,0 ,sizeof (int ),stream));
150
+
151
+ CHECK (cudaMemcpyAsync (pre_predict,predict,OUTPUT_SIZE * sizeof (float ),cudaMemcpyDeviceToDevice, stream));
152
+ // transpose [1 20 8400] convert to [1 8400 0]
153
+ yolov8_transpose (pre_predict, this ->OUTPUT_CANDIDATES ,this ->OUTPUT_ELEMENT ,transpose_device, stream);
154
+
155
+ yolov8face_decode_kernel_invoker (transpose_device,NUM_BOX_ELEMENT,OUTPUT_CANDIDATES,NUM_CLASSES,CKPT_NUM,
156
+ this ->conf_thresh ,affine_matrix_d2i_device[i],decode_ptr_device,MAX_OBJECTS,stream); // cuda decode
157
+ yolov8face_nms_kernel_invoker (decode_ptr_device,this ->nms_thresh , MAX_OBJECTS, stream,NUM_BOX_ELEMENT); // cuda nms
158
+ CHECK (cudaMemcpyAsync (decode_ptr_host[i],decode_ptr_device,sizeof (float )*(1 +MAX_OBJECTS*NUM_BOX_ELEMENT),cudaMemcpyDeviceToHost,stream));
159
+ predict+=OUTPUT_SIZE;
160
+ }
161
+ cudaStreamSynchronize (stream);
162
+ for (size_t k = 0 ; k < detector_batchsize; k++)
163
+ {
164
+ std::vector<Det>det;
165
+ int count = std::min ((int )*decode_ptr_host[k],MAX_OBJECTS);
166
+ for (int i = 0 ; i<count;i++)
167
+ {
168
+ int basic_pos = 1 +i*NUM_BOX_ELEMENT;
169
+ int keep_flag= decode_ptr_host[k][basic_pos+6 ];
170
+ if (keep_flag==1 )
171
+ {
172
+ Det det_temp;
173
+ det_temp.bbox .xmin = decode_ptr_host[k][basic_pos+0 ];
174
+ det_temp.bbox .ymin = decode_ptr_host[k][basic_pos+1 ];
175
+ det_temp.bbox .xmax = decode_ptr_host[k][basic_pos+2 ];
176
+ det_temp.bbox .ymax = decode_ptr_host[k][basic_pos+3 ];
177
+ det_temp.confidence = decode_ptr_host[k][basic_pos+4 ];
178
+ int landmark_pos = basic_pos+7 ;
179
+ for (int id = 0 ; id<CKPT_NUM; id+=1 )
180
+ {
181
+ det_temp.key_points .push_back (decode_ptr_host[k][landmark_pos+2 *id]);
182
+ det_temp.key_points .push_back (decode_ptr_host[k][landmark_pos+2 *id+1 ]);
183
+ }
184
+ det.push_back (det_temp);
185
+ }
186
+ }
187
+ dets.push_back (det);
188
+ }
189
+ return HZ_SUCCESS;
190
+ }
191
+ HZFLAG Detector_Yolov8Face::ReleaseDetector_Yolov8Face ()
192
+ {
193
+ context->destroy ();
194
+ engine->destroy ();
195
+ runtime->destroy ();
196
+ for (size_t i = 0 ; i < batch_size; i++)
197
+ {
198
+ CHECK (cudaFree (affine_matrix_d2i_device[i]));
199
+ delete decode_ptr_host[i];
200
+ }
201
+ delete [] decode_ptr_host;
202
+ delete [] affine_matrix_d2i_device;
203
+ CHECK (cudaFreeHost (affine_matrix_d2i_host));
204
+ CHECK (cudaFree (img_device));
205
+ CHECK (cudaFreeHost (img_host));
206
+ CHECK (cudaFree (buffers[inputIndex]));
207
+ CHECK (cudaFree (buffers[outputIndex]));
208
+ CHECK (cudaFree (decode_ptr_device));
209
+ CHECK (cudaFree (pre_predict));
210
+ CHECK (cudaFree (transpose_device));
211
+ return HZ_SUCCESS;
212
+ }
213
+
214
+ void Detector_Yolov8Face::affine_project (float *d2i,float x,float y,float *ox,float *oy) // 通过仿射变换逆矩阵,恢复成原图的坐标
215
+ {
216
+ *ox = d2i[0 ]*x+d2i[1 ]*y+d2i[2 ];
217
+ *oy = d2i[3 ]*x+d2i[4 ]*y+d2i[5 ];
218
+ }
219
+
220
+ void Detector_Yolov8Face::getd2i (affineMatrix &afmt,cv::Size to,cv::Size from) // 计算仿射变换的矩阵和逆矩阵
221
+ {
222
+ float scale = std::min (1.0 *to.width /from.width , 1.0 *to.height /from.height );
223
+ afmt.i2d [0 ]=scale;
224
+ afmt.i2d [1 ]=0 ;
225
+ afmt.i2d [2 ]=-scale*from.width *0.5 +to.width *0.5 ;
226
+ afmt.i2d [3 ]=0 ;
227
+ afmt.i2d [4 ]=scale;
228
+ afmt.i2d [5 ]=-scale*from.height *0.5 +to.height *0.5 ;
229
+ cv::Mat i2d_mat (2 ,3 ,CV_32F,afmt.i2d );
230
+ cv::Mat d2i_mat (2 ,3 ,CV_32F,afmt.d2i );
231
+ cv::invertAffineTransform (i2d_mat,d2i_mat);
232
+ memcpy (afmt.d2i , d2i_mat.ptr <float >(0 ), sizeof (afmt.d2i ));
233
+ }
0 commit comments