From c15f92ea9ad61fb6c944a90a1a87577d2b68f843 Mon Sep 17 00:00:00 2001 From: vekkt0r Date: Mon, 24 May 2021 21:09:38 +0200 Subject: [PATCH] Normalize input correctly for segm_ / selfie (#77) Expected input is [0.0, 1.0] for both segm_ / selfie models, while deepseg requires [-1.0, 1.0] performance with negative inputs is really bad with segm_ model. --- deepseg.cc | 139 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 43 deletions(-) diff --git a/deepseg.cc b/deepseg.cc index 4622037..1faa261 100644 --- a/deepseg.cc +++ b/deepseg.cc @@ -148,8 +148,23 @@ typedef struct { pthread_mutex_t lock; } capinfo_t; +enum class modeltype_t { + Unknown, + BodyPix, + DeepLab, + GoogleMeetSegmentation, + MLKitSelfie, +}; + +struct normalization_t { + float scaling; + float offset; +}; + typedef struct { const char *modelname; + modeltype_t modeltype; + normalization_t norm; size_t threads; size_t width; size_t height; @@ -190,6 +205,36 @@ void *grab_thread(void *arg) { return NULL; } +modeltype_t get_modeltype(const char* modelname) { + if (strstr(modelname, "body-pix")) { + return modeltype_t::BodyPix; + } + else if (strstr(modelname, "deeplab")) { + return modeltype_t::DeepLab; + } + else if (strstr(modelname, "segm_")) { + return modeltype_t::GoogleMeetSegmentation; + } + else if (strstr(modelname, "selfie")) { + return modeltype_t::MLKitSelfie; + } + return modeltype_t::Unknown; +} + +normalization_t get_normalization(modeltype_t type) { + // TODO: This should be read out from actual mode metadata instead + switch (type) { + case modeltype_t::DeepLab: + return normalization_t{.scaling = 1/127.5, .offset = -1}; + case modeltype_t::BodyPix: + case modeltype_t::GoogleMeetSegmentation: + case modeltype_t::MLKitSelfie: + case modeltype_t::Unknown: + default: + return normalization_t{.scaling = 1/255.0, .offset = 0}; + } +} + void init_tensorflow(calcinfo_t &info) { // Load model info.model = tflite::FlatBufferModel::BuildFromFile(info.modelname); @@ -244,11 +289,10 @@ void calc_mask(calcinfo_t &info, timinginfo_t &ti) { in_u8_rgb = filtered; } - // convert to float and normalize values to [-1;1] - in_u8_rgb.convertTo(info.input,CV_32FC3,1.0/128.0,-1.0); + // convert to float and normalize to values expected by model + in_u8_rgb.convertTo(info.input,CV_32FC3,info.norm.scaling,info.norm.offset); ti.openns=timestamp(); - // Run inference TFLITE_MINIMAL_CHECK(interpreter->Invoke() == kTfLiteOk); ti.tfltns=timestamp(); @@ -256,47 +300,50 @@ void calc_mask(calcinfo_t &info, timinginfo_t &ti) { float* tmp = (float*)info.output.data; uint8_t* out = (uint8_t*)info.ofinal.data; - // find class with maximum probability - if (strstr(info.modelname,"deeplab")) { - for (unsigned int n = 0; n < info.output.total(); n++) { - float maxval = -10000; size_t maxpos = 0; - for (size_t i = 0; i < cnum; i++) { - if (tmp[n*cnum+i] > maxval) { - maxval = tmp[n*cnum+i]; - maxpos = i; + switch (info.modeltype) { + case modeltype_t::DeepLab: + // find class with maximum probability + for (unsigned int n = 0; n < info.output.total(); n++) { + float maxval = -10000; size_t maxpos = 0; + for (size_t i = 0; i < cnum; i++) { + if (tmp[n*cnum+i] > maxval) { + maxval = tmp[n*cnum+i]; + maxpos = i; + } } + // set mask to 0 where class == person + uint8_t val = (maxpos==pers ? 0 : 255); + out[n] = (val & 0xE0) | (out[n] >> 3); } - // set mask to 0 where class == person - uint8_t val = (maxpos==pers ? 0 : 255); - out[n] = (val & 0xE0) | (out[n] >> 3); - } - } - - // threshold probability - if (strstr(info.modelname,"body-pix") || strstr(info.modelname,"selfie")) { - for (unsigned int n = 0; n < info.output.total(); n++) { - // FIXME: hardcoded threshold - uint8_t val = (tmp[n] > 0.65 ? 0 : 255); - out[n] = (val & 0xE0) | (out[n] >> 3); - } - } - - // Google Meet segmentation network - if (strstr(info.modelname,"segm_")) { - /* 256 x 144 x 2 tensor for the full model or 160 x 96 x 2 - * tensor for the light model with masks for background - * (channel 0) and person (channel 1) where values are in - * range [MIN_FLOAT, MAX_FLOAT] and user has to apply - * softmax across both channels to yield foreground - * probability in [0.0, 1.0]. */ - for (unsigned int n = 0; n < info.output.total(); n++) { - float exp0 = expf(tmp[2*n ]); - float exp1 = expf(tmp[2*n+1]); - float p0 = exp0 / (exp0+exp1); - float p1 = exp1 / (exp0+exp1); - uint8_t val = (p0 < p1 ? 0 : 255); - out[n] = (val & 0xE0) | (out[n] >> 3); - } + break; + case modeltype_t::BodyPix: + case modeltype_t::MLKitSelfie: + // threshold probability + for (unsigned int n = 0; n < info.output.total(); n++) { + // FIXME: hardcoded threshold + uint8_t val = (tmp[n] > 0.65 ? 0 : 255); + out[n] = (val & 0xE0) | (out[n] >> 3); + } + break; + case modeltype_t::GoogleMeetSegmentation: + /* 256 x 144 x 2 tensor for the full model or 160 x 96 x 2 + * tensor for the light model with masks for background + * (channel 0) and person (channel 1) where values are in + * range [MIN_FLOAT, MAX_FLOAT] and user has to apply + * softmax across both channels to yield foreground + * probability in [0.0, 1.0]. */ + for (unsigned int n = 0; n < info.output.total(); n++) { + float exp0 = expf(tmp[2*n ]); + float exp1 = expf(tmp[2*n+1]); + float p0 = exp0 / (exp0+exp1); + float p1 = exp1 / (exp0+exp1); + uint8_t val = (p0 < p1 ? 0 : 255); + out[n] = (val & 0xE0) | (out[n] >> 3); + } + break; + case modeltype_t::Unknown: + fprintf(stderr, "Unknown model type\n"); + break; } ti.maskns=timestamp(); @@ -463,7 +510,13 @@ int main(int argc, char* argv[]) { cap.set(CV_CAP_PROP_FOURCC, fourcc); cap.set(CV_CAP_PROP_CONVERT_RGB, true); - calcinfo_t calcinfo = { modelname, threads, width, height, debug }; + auto modeltype = get_modeltype(modelname); + auto norm = get_normalization(modeltype); + if (modeltype_t::Unknown == modeltype) { + fprintf(stderr, "Unknown model type '%s'.\n", modelname); + exit(1); + } + calcinfo_t calcinfo = { modelname, modeltype, norm, threads, width, height, debug }; init_tensorflow(calcinfo); // kick off separate grabber thread to keep OpenCV/FFMpeg happy (or it lags badly)