
Commit 95ab5ad

Support non-spatial mode in BatchNormalization (microsoft#2092)
* Initial commit
* Update
* Update
* Fix build break
* Update
* More changes
* Update type
* Exclude Nuphar for non-spatial tests
* Update
* Resolve PR comments
1 parent 2536553 commit 95ab5ad

File tree: 8 files changed, +181 −48 lines


csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs

Lines changed: 1 addition & 1 deletion
@@ -319,7 +319,7 @@ private void TestMultiThreads()
     private static Dictionary<string, string> GetSkippedModels()
     {
       var skipModels = new Dictionary<string, string>() {
-        { "mxnet_arcface", "Model not supported by CPU execution provider" } ,
+        { "mxnet_arcface", "Model is an invalid ONNX model"},
         { "tf_inception_v2", "TODO: Debug failing model, skipping for now" },
         { "fp16_inception_v1", "16-bit float not supported type in C#." },
         { "fp16_shufflenet", "16-bit float not supported type in C#." },

onnxruntime/core/providers/cpu/nn/batch_norm.cc

Lines changed: 24 additions & 11 deletions
@@ -52,7 +52,7 @@ Status BatchNorm<float>::Compute(OpKernelContext* p_op_kernel_context) const {
   const auto* mean = p_op_kernel_context->Input<Tensor>(3);
   const auto* var = p_op_kernel_context->Input<Tensor>(4);
 
-  ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var));
+  ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, is_spatial_));
 
   const TensorShape& x_shape = X->Shape();
   Tensor* Y = p_op_kernel_context->Output(0, x_shape);
@@ -61,33 +61,46 @@ Status BatchNorm<float>::Compute(OpKernelContext* p_op_kernel_context) const {
   const size_t N = dims_vec[0];
   const size_t C = dims_vec[1];  // assume NCHW as per the spec
 
-  // calculate sample_size
+  // calculate sample_size (per individual channel)
   size_t sample_size = 1;
   for (size_t i = 2; i < dims_vec.size(); ++i) {
     sample_size *= dims_vec[i];
   }
 
-  ConstEigenVectorArrayMap<float> scale_arr(scale->template Data<float>(), C);
-  ConstEigenVectorArrayMap<float> bias_arr(B->template Data<float>(), C);
+  // calculate sample_size (including all channels)
+  size_t sample_size_incl_all_channels = sample_size * C;
+
+  ConstEigenVectorArrayMap<float> scale_arr(scale->template Data<float>(), is_spatial_ ? C : sample_size_incl_all_channels);
+  ConstEigenVectorArrayMap<float> bias_arr(B->template Data<float>(), is_spatial_ ? C : sample_size_incl_all_channels);
 
   // Regardless of training or testing, we will apply the estimated mean
   // and standard deviation to the input. For testing, they are
   // specified directly by the input, and for training, they are computed
   // by the op.
-  Eigen::Array<float, Eigen::Dynamic, 1> inv_std(C);
-  ConstEigenVectorArrayMap<float> var_arr(var->template Data<float>(), C);
+  Eigen::Array<float, Eigen::Dynamic, 1> inv_std(is_spatial_ ? C : sample_size_incl_all_channels);
+  ConstEigenVectorArrayMap<float> var_arr(var->template Data<float>(), is_spatial_ ? C : sample_size_incl_all_channels);
   inv_std = (var_arr + epsilon_).sqrt().inverse();
-  ConstEigenVectorArrayMap<float> mean_arr(mean->template Data<float>(), C);
+  ConstEigenVectorArrayMap<float> mean_arr(mean->template Data<float>(), is_spatial_ ? C : sample_size_incl_all_channels);
   // We can fuse the output computation as follows:
   // ((x - est_mean) * (inv_var) * scale + bias
   // to
   // (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
   Eigen::Array<float, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
   Eigen::Array<float, Eigen::Dynamic, 1> new_bias = bias_arr - mean_arr * new_scale;
-  EigenArrayMap<float> Y_arr(Y->template MutableData<float>(), sample_size, N * C);
-  ConstEigenArrayMap<float> X_arr(X->template Data<float>(), sample_size, N * C);
-  for (size_t nc = 0; nc < N * C; ++nc) {
-    Y_arr.col(nc) = X_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
+  EigenArrayMap<float> Y_arr(Y->template MutableData<float>(),
+                             is_spatial_ ? sample_size : sample_size_incl_all_channels,
+                             is_spatial_ ? N * C : N);
+  ConstEigenArrayMap<float> X_arr(X->template Data<float>(),
+                                  is_spatial_ ? sample_size : sample_size_incl_all_channels,
+                                  is_spatial_ ? N * C : N);
+  if (is_spatial_) {  // spatial == 1
+    for (size_t nc = 0; nc < N * C; ++nc) {
+      Y_arr.col(nc) = X_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
+    }
+  } else {  // spatial == 0
+    for (size_t n = 0; n < N; ++n) {
+      Y_arr.col(n) = X_arr.col(n) * new_scale.col(0) + new_bias.col(0);
+    }
   }
 
   return Status::OK();
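
For orientation: in spatial mode the statistics have shape [C] and are shared across each channel's sample_size elements, while in non-spatial mode there is one statistic per activation, i.e. sample_size_incl_all_channels = C * D1 * ... * Dn entries shared only across the batch dimension. A minimal plain-loop sketch of the computation the fused Eigen kernel above performs (illustrative only; batch_norm_ref and its signature are not part of the commit):

#include <cmath>
#include <cstddef>

// Reference loop: y = (x - mean) / sqrt(var + epsilon) * scale + bias.
// S is the per-channel sample size (D1 * ... * Dn); x and y hold
// N * C * S floats in NC[D...] layout.
void batch_norm_ref(const float* x, float* y,
                    std::size_t N, std::size_t C, std::size_t S,
                    const float* scale, const float* bias,
                    const float* mean, const float* var,
                    float epsilon, bool is_spatial) {
  for (std::size_t n = 0; n < N; ++n) {
    for (std::size_t c = 0; c < C; ++c) {
      for (std::size_t s = 0; s < S; ++s) {
        // spatial: one statistic per channel c (C entries);
        // non-spatial: one statistic per activation (c, s) (C * S entries).
        const std::size_t stat = is_spatial ? c : c * S + s;
        const std::size_t idx = (n * C + c) * S + s;
        y[idx] = (x[idx] - mean[stat]) / std::sqrt(var[stat] + epsilon) * scale[stat] + bias[stat];
      }
    }
  }
}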

onnxruntime/core/providers/cpu/nn/batch_norm.h

Lines changed: 8 additions & 9 deletions
@@ -32,20 +32,19 @@ class BatchNorm : public OpKernel {
   explicit BatchNorm(const OpKernelInfo& op_kernel_info) : OpKernel(op_kernel_info) {
     auto st = op_kernel_info.GetAttr<float>("epsilon", &epsilon_);
     ORT_ENFORCE(st.IsOK(), st.ErrorMessage());
-
-    // opset 6-8
-    int64_t spatial;
-    if (op_kernel_info.GetAttr<int64_t>("spatial", &spatial).IsOK()) {
-      ORT_ENFORCE(spatial == 1, "BatchNormalization kernel for CPU provider does not support non-spatial cases");
-    }
+
+    // For opset 6-8, if spatial attribute exists, pick up the value (by default spatial == 1)
+    // From opset 9 onwards, by default, only the spatial case (spatial == 1) is defined per spec
+    is_spatial_ = op_kernel_info.GetAttrOrDefault<int64_t>("spatial", 1) == 1 ? true : false;
 
     //TODO: momentum
   }
 
   Status Compute(OpKernelContext* p_op_kernel_context) const override;
 
- protected:
-  float epsilon_;
-  //int64_t is_test_; ignored in this implementation since we're doing inferencing only.
+ protected:
+  float epsilon_;
+  bool is_spatial_;
+  //int64_t is_test_; ignored in this implementation since we're doing inferencing only.
 };
 }  // namespace onnxruntime

onnxruntime/core/providers/cpu/nn/batch_norm_helper.h

Lines changed: 63 additions & 19 deletions
@@ -14,48 +14,92 @@ class BatchNormHelper {
                                const Tensor* scale,
                                const Tensor* B,
                                const Tensor* mean,
-                               const Tensor* var) {
+                               const Tensor* var,
+                               bool is_spatial = true) {
+    const auto& x_dims = X->Shape().GetDims();
+    if (x_dims.size() < 2) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Invalid input X: The rank of input X must be at least 2. Got rank: ", x_dims.size());
+    }
+
+    int64_t num_channels = x_dims[1];
+    int num_feature_dims = static_cast<int>(X->Shape().NumDimensions() - 2);  // the first 2 are respectively N and C
+
     // defined as per spec and used for validation
-    constexpr int kNumInputScaleDimensions = 1;
-    constexpr int kNumInputBiasDimensions = 1;
-    constexpr int kNumInputMeanDimensions = 1;
-    constexpr int kNumInputVarianceDimensions = 1;
+    int kNumInputScaleDimensions = (is_spatial ? 1 : num_feature_dims + 1);
+    int kNumInputBiasDimensions = (is_spatial ? 1 : num_feature_dims + 1);
+    int kNumInputMeanDimensions = (is_spatial ? 1 : num_feature_dims + 1);
+    int kNumInputVarianceDimensions = (is_spatial ? 1 : num_feature_dims + 1);
     //constexpr int kMinCudaNumDims = 4;
     //constexpr int kMaxCudaNumDims = 5;
 
-    if (X->Shape().GetDims().empty()) {
-      return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Invalid input X: Empty dimensions");
-    }
-
-    int64_t num_channels = X->Shape().GetDims()[1];
-
-    if (scale->Shape().NumDimensions() != kNumInputScaleDimensions) {
+    // validate 'scale' shape
+    const auto& scale_dims = scale->Shape().GetDims();
+    if (static_cast<int>(scale_dims.size()) != kNumInputScaleDimensions) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input scale: NumDimensions() != ", kNumInputScaleDimensions);
     }
-    if (scale->Shape().GetDims()[0] != num_channels) {
+    if (scale_dims[0] != num_channels) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input scale: 0th dimension != ", num_channels);
     }
+    // in non-spatial cases, the other dims of 'scale' must be validated
+    if (!is_spatial) {
+      for (int feature = 0; feature < num_feature_dims; ++feature) {
+        if (scale_dims[1 + feature] != x_dims[2 + feature]) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input scale: ", (1 + feature), " dimension != ", x_dims[2 + feature]);
+        }
+      }
+    }
 
-    if (B->Shape().NumDimensions() != kNumInputBiasDimensions) {
+    // validate 'B' shape
+    const auto& B_dims = B->Shape().GetDims();
+    if (static_cast<int>(B_dims.size()) != kNumInputBiasDimensions) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input B: NumDimensions() != ", kNumInputBiasDimensions);
     }
-    if (B->Shape().GetDims()[0] != num_channels) {
+    if (B_dims[0] != num_channels) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input B: 0th dimension != ", num_channels);
     }
+    // in non-spatial cases, the other dims of 'B' must be validated
+    if (!is_spatial) {
+      for (int feature = 0; feature < num_feature_dims; ++feature) {
+        if (B_dims[1 + feature] != x_dims[2 + feature]) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input B: ", (1 + feature), " dimension != ", x_dims[2 + feature]);
+        }
+      }
+    }
 
-    if (mean->Shape().NumDimensions() != kNumInputMeanDimensions) {
+    // validate 'mean' shape
+    const auto& mean_dims = mean->Shape().GetDims();
+    if (static_cast<int>(mean_dims.size()) != kNumInputMeanDimensions) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input mean: NumDimensions() != ", kNumInputMeanDimensions);
     }
-    if (mean->Shape().GetDims()[0] != num_channels) {
+    if (mean_dims[0] != num_channels) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input mean: 0th dimension != ", num_channels);
     }
+    // in non-spatial cases, the other dims of 'mean' must be validated
+    if (!is_spatial) {
+      for (int feature = 0; feature < num_feature_dims; ++feature) {
+        if (mean_dims[1 + feature] != x_dims[2 + feature]) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input mean: ", (1 + feature), " dimension != ", x_dims[2 + feature]);
+        }
+      }
+    }
 
-    if (var->Shape().NumDimensions() != kNumInputVarianceDimensions) {
+    // validate 'var' shape
+    const auto& var_dims = var->Shape().GetDims();
+    if (static_cast<int>(var_dims.size()) != kNumInputVarianceDimensions) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input var: NumDimensions() != ", kNumInputVarianceDimensions);
     }
-    if (var->Shape().GetDims()[0] != num_channels) {
+    if (var_dims[0] != num_channels) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input var: 0th dimension != ", num_channels);
     }
+    // in non-spatial cases, the other dims of 'var' must be validated
+    if (!is_spatial) {
+      for (int feature = 0; feature < num_feature_dims; ++feature) {
+        if (var_dims[1 + feature] != x_dims[2 + feature]) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid input var: ", (1 + feature), " dimension != ", x_dims[2 + feature]);
+        }
+      }
+    }
 
     return common::Status::OK();
   }
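
The shape contract this validation enforces: for X of shape [N, C, D1, ..., Dn], spatial mode expects scale, B, mean, and var each of shape [C], while non-spatial mode expects each of shape [C, D1, ..., Dn], with every trailing dimension matching X. A small hypothetical helper (not part of the commit) stating the rule directly:

#include <cstdint>
#include <vector>

// Illustration only: the shape that scale/B/mean/var must have for an
// input X of shape [N, C, D1, ..., Dn] under each mode.
std::vector<int64_t> ExpectedStatsShape(const std::vector<int64_t>& x_dims, bool is_spatial) {
  if (is_spatial) {
    return {x_dims[1]};  // spatial: per-channel stats, shape [C]
  }
  // non-spatial: per-activation stats, shape [C, D1, ..., Dn]
  return std::vector<int64_t>(x_dims.begin() + 1, x_dims.end());
}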

onnxruntime/core/providers/cuda/nn/batch_norm.cc

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ Status BatchNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const
   const Tensor* mean = p_op_kernel_context->Input<Tensor>(3);
   const Tensor* var = p_op_kernel_context->Input<Tensor>(4);
 
-  ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var));
+  ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1));
 
   const TensorShape& x_shape = X->Shape();
   Tensor* Y = p_op_kernel_context->Output(0, x_shape);

onnxruntime/core/providers/cuda/nn/batch_norm.h

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ class BatchNorm final : public CudaKernel {
     }
 
     if (spatial_ == 0) {
-      cudnn_batch_norm_mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;  // TODO add test case for this when implemented in CPU as well.
+      cudnn_batch_norm_mode_ = CUDNN_BATCHNORM_PER_ACTIVATION;
     }
   }
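
Context for the dropped TODO: CUDNN_BATCHNORM_PER_ACTIVATION is cuDNN's per-activation mode (statistics of shape [1, C, D1, ..., Dn]), which corresponds to the ONNX spatial == 0 case; the comment goes away now that this commit adds the CPU implementation and test coverage it was waiting on.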

onnxruntime/test/onnx/main.cc

Lines changed: 1 addition & 2 deletions
@@ -398,7 +398,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
       {"shrink", "test case is wrong", {"onnx141"}},
       {"maxpool_with_argmax_2d_precomputed_strides", "ShapeInferenceError"},
       {"tf_inception_v2", "result mismatch"},
-      {"mxnet_arcface", "result mismatch"},
+      {"mxnet_arcface", "Model is an invalid ONNX model"},
       {"unique_not_sorted_without_axis", "Expected data for 'Y' is incorrect and in sorted order."},
       {"cumsum_1d_reverse_exclusive", "only failing linux GPU CI. Likely build error."},
       {"det_2d", "not implemented yet"},
@@ -508,7 +508,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
 #endif
 
 #ifdef USE_CUDA
-  broken_tests.insert({"mxnet_arcface", "result mismatch"});
   broken_tests.insert({"mask_rcnn_keras", "result mismatch"});
   broken_tests.insert({"mlperf_ssd_mobilenet_300", "unknown error"});
   broken_tests.insert({"mlperf_ssd_resnet34_1200", "unknown error"});

onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc

Lines changed: 82 additions & 4 deletions
@@ -23,19 +23,27 @@ void TestBatchNorm(const InputDataMap& input_data_map,
                    const vector<int64_t>& expected_output_shape,
                    int64_t spatial_mode = 1,
                    OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess,
-                   const std::string& err_str = "") {
-  OpTester test("BatchNormalization");
+                   const std::string& err_str = "",
+                   int opset_version = 9) {
+  OpTester test("BatchNormalization", opset_version);
   if (epsilon.has_value()) {
     test.AddAttribute("epsilon", epsilon.value());
   }
-  test.AddAttribute("spatial", spatial_mode);
+  if (opset_version < 9) {  // spatial is only defined for opset-8 and below in the spec
+    test.AddAttribute("spatial", spatial_mode);
+  }
   test.AddInput<float>("X", input_shapes_map.at("X"), input_data_map.at("X"));
   test.AddInput<float>("scale", input_shapes_map.at("scale"), input_data_map.at("scale"));
   test.AddInput<float>("B", input_shapes_map.at("B"), input_data_map.at("B"));
   test.AddInput<float>("mean", input_shapes_map.at("mean"), input_data_map.at("mean"));
   test.AddInput<float>("var", input_shapes_map.at("var"), input_data_map.at("var"));
   test.AddOutput<float>("output", expected_output_shape, expected_output);
-  test.Run(expect_result, err_str, {kTensorrtExecutionProvider});  // Weight as input is not supported by TensorRT
+  // Weight as input is not supported by TensorRT and spatial == 0 is not supported by Nuphar
+  std::unordered_set<std::string> excluded_eps = {kTensorrtExecutionProvider};
+  if (spatial_mode == 0) {
+    excluded_eps.insert(kNGraphExecutionProvider);
+  }
+  test.Run(expect_result, err_str, excluded_eps);
 }
 
 TEST(BatchNormTest, PositiveTestCase) {
@@ -513,6 +521,76 @@ TEST(BatchNormTest, InvalidVarDim) {
                 "Invalid input var");
 }
 
+TEST(BatchNormTest, NonSpatial_Simple) {
+  vector<float> X{1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f};
+  vector<float> scale{1.f, 1.f, 1.f, 1.f};
+  vector<float> B{1.f, 0.f, 0.f, 1.f};
+  vector<float> mean{0.f, 0.f, 0.f, 0.f};
+  vector<float> var{1.f, 1.f, 1.f, 1.f};
+
+  InputDataMap input_data_map;
+  input_data_map.insert({"X", X});
+  input_data_map.insert({"scale", scale});
+  input_data_map.insert({"B", B});
+  input_data_map.insert({"mean", mean});
+  input_data_map.insert({"var", var});
+
+  InputShapesMap input_shapes_map;
+  input_shapes_map.insert({"X", {2, 2, 2}});
+  input_shapes_map.insert({"scale", {2, 2}});
+  input_shapes_map.insert({"B", {2, 2}});
+  input_shapes_map.insert({"mean", {2, 2}});
+  input_shapes_map.insert({"var", {2, 2}});
+
+  vector<int64_t> expected_output_shape{2, 2, 2};
+  auto expected_output = {2.f, 2.f, 3.f, 5.f, 2.f, 2.f, 3.f, 5.f};
+  float epsilon = 0.f;
+  TestBatchNorm(input_data_map,
+                input_shapes_map,
+                epsilon,
+                expected_output,
+                expected_output_shape,
+                0,
+                OpTester::ExpectResult::kExpectSuccess,
+                "",
+                7);  // opset-7
+}
+
+TEST(BatchNormTest, NonSpatial_Complicated) {
+  vector<float> X{0.2134f, 0.32434f, 0.5644f, 0.3234f, 0.4545f, 0.3445f};
+  vector<float> scale{0.5f, 0.6f};
+  vector<float> B{0.2f, 0.1f};
+  vector<float> mean{0.034f, 0.342f};
+  vector<float> var{1.f, 1.f};
+
+  InputDataMap input_data_map;
+  input_data_map.insert({"X", X});
+  input_data_map.insert({"scale", scale});
+  input_data_map.insert({"B", B});
+  input_data_map.insert({"mean", mean});
+  input_data_map.insert({"var", var});
+
+  InputShapesMap input_shapes_map;
+  input_shapes_map.insert({"X", {3, 1, 2}});
+  input_shapes_map.insert({"scale", {1, 2}});
+  input_shapes_map.insert({"B", {1, 2}});
+  input_shapes_map.insert({"mean", {1, 2}});
+  input_shapes_map.insert({"var", {1, 2}});
+
+  vector<int64_t> expected_output_shape{3, 1, 2};
+  auto expected_output = {0.2897f, 0.089404f, 0.4652f, 0.08884f, 0.41025f, 0.1015f};
+  float epsilon = 1e-05f;
+  TestBatchNorm(input_data_map,
+                input_shapes_map,
+                epsilon,
+                expected_output,
+                expected_output_shape,
+                0,
+                OpTester::ExpectResult::kExpectSuccess,
+                "",
+                8);  // opset-8
+}
+
 // Only CUDA kernel has float 16 support
 #ifdef USE_CUDA
 TEST(BatchNormTest, BatchNorm2d_fp16) {
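
Sanity check for NonSpatial_Simple: with mean = 0, var = 1, and epsilon = 0, the kernel reduces to y = x * scale + B per (channel, feature) position, so the first sample {1, 2, 3, 4} maps to {1*1+1, 2*1+0, 3*1+0, 4*1+1} = {2, 2, 3, 5}, matching the first half of expected_output.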
