Skip to content

Commit 6dd6ef9

Browse files
[Native WebGPU] Added ReduceMax and ReduceSum (#23934)
### Description Added ReduceMax and ReduceSum. ### Motivation and Context Extends the operator coverage of the native WebGPU execution provider by implementing the ReduceMax and ReduceSum reduction kernels.
1 parent 9891eb3 commit 6dd6ef9

File tree

3 files changed

+112
-37
lines changed

3 files changed

+112
-37
lines changed

onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,28 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
3434
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
3535
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);
3636

37+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10);
38+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11);
39+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12);
40+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17);
41+
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 18);
42+
43+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10);
44+
REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12);
45+
REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13);
46+
3747
Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
38-
const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
3948
const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
49+
if (is_input_empty_) {
50+
shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
51+
<< code_[0]
52+
<< code_[2]
53+
<< output.SetByOffset("global_idx", "output_value");
54+
return Status::OK();
55+
}
56+
const auto& input = shader.AddInput("input", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
4057
bool reduce_on_all_axes = no_op_with_empty_axes_ == false && axes_.empty();
41-
std::string loop_header = code_[0];
58+
std::string loop_header = code_[0].find("first_element") == std::string::npos ? code_[0] : "let first_element = " + input.GetByIndices("input_indices") + ";\n" + code_[0] + "\n";
4259
std::string loop_body = "let current_element: input_value_t = " + input.GetByIndices("input_indices") + ";\n" + code_[1];
4360
std::string loop_footer = code_[2];
4461
const auto input_rank = input.Rank();
@@ -56,10 +73,10 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
5673
loop_body = ss.str();
5774
} else {
5875
std::stringstream ss;
59-
ss << loop_header << "\n";
6076
std::string index = "i" + std::to_string(i);
6177
ss << "let " << index << " = " << output.IndicesGet("output_indices", l) << ";\n";
6278
ss << input.IndicesSet("input_indices", i, index) << ";\n";
79+
ss << loop_header << "\n";
6380
loop_header = ss.str();
6481
l++;
6582
}
@@ -80,6 +97,7 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
8097
template <bool allow_multi_axes>
8198
Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context) const {
8299
const auto* input_tensor = context.Input(0);
100+
ORT_RETURN_IF_ERROR(CheckInput(input_tensor));
83101
InlinedVector<uint32_t> input_axes;
84102
auto rank = input_tensor->Shape().NumDimensions();
85103
auto transform_axis = [rank](int64_t axis) {
@@ -95,10 +113,12 @@ Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context)
95113
if (context.InputCount() > 1) {
96114
ORT_ENFORCE(axes_.empty(), "Axes attribute may not be specified when axes input is also provided.");
97115
const Tensor* axes_tensor = context.Input<Tensor>(1);
98-
auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
99-
const auto* data = axes_tensor->Data<int64_t>();
100-
input_axes.reserve(size);
101-
std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
116+
if (nullptr != axes_tensor) {
117+
auto size = static_cast<size_t>(axes_tensor->Shape()[0]);
118+
const auto* data = axes_tensor->Data<int64_t>();
119+
input_axes.reserve(size);
120+
std::transform(data, data + size, std::back_inserter(input_axes), transform_axis);
121+
}
102122
} else {
103123
input_axes.reserve(axes_.size());
104124
std::transform(axes_.begin(), axes_.end(), std::back_inserter(input_axes), transform_axis);
@@ -120,10 +140,12 @@ Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context)
120140
std::iota(input_axes.begin(), input_axes.end(), 0);
121141
}
122142
}
123-
const auto code = GetOpSpecificCode(input_tensor, input_axes.size());
143+
const auto code = GetOpSpecificCode(input_tensor);
124144
// Compute output shape
125145
std::vector<int64_t> output_shape;
146+
bool is_input_empty = false;
126147
for (size_t i = 0; i < input_tensor->Shape().NumDimensions(); ++i) {
148+
is_input_empty |= input_tensor->Shape()[i] == 0;
127149
if (std::find(input_axes.begin(), input_axes.end(), i) != input_axes.end()) {
128150
if (keepdims_) {
129151
output_shape.push_back(1);
@@ -134,34 +156,68 @@ Status ReduceKernel<allow_multi_axes>::ComputeInternal(ComputeContext& context)
134156
}
135157
TensorShape output_tensor_shape(output_shape);
136158
int64_t output_size = output_tensor_shape.Size();
137-
ReduceKernelProgram program("ReduceMean", keepdims_, noop_with_empty_axes_, input_axes, code);
138-
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
159+
if (output_size == 0) {
160+
ORT_IGNORE_RETURN_VALUE(context.Output(0, output_tensor_shape));
161+
return Status::OK();
162+
}
163+
164+
auto input_rank = input_tensor->Shape().NumDimensions();
165+
// reduce_axes element is either 1 or 0 depending on whether the axis is reduced or not
166+
std::vector<uint32_t> reduce_axes;
167+
reduce_axes.resize(input_rank, 0);
168+
for (auto axis : input_axes) {
169+
reduce_axes[axis] = 1;
170+
}
171+
172+
ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, is_input_empty);
173+
if (!is_input_empty) {
174+
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank});
175+
}
176+
177+
program.CacheHint(is_input_empty)
139178
.AddOutput({context.Output(0, output_shape), ProgramTensorMetadataDependency::TypeAndRank})
140179
.SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
141180
.AddUniformVariables({{static_cast<uint32_t>(output_size)},
142181
{static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)},
143-
{input_axes},
144-
{static_cast<uint32_t>(input_axes.size())}});
182+
{reduce_axes}});
145183

146184
return context.RunProgram(program);
147185
}
148186

149-
ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const {
187+
ReduceOpSpecificCode ReduceMean::GetOpSpecificCode(const Tensor* input_tensor) const {
150188
const TensorShape& input_shape = input_tensor->Shape();
151189
size_t input_rank = input_shape.NumDimensions();
190+
std::string loop_header = "var sum = f32(0);";
191+
std::string loop_body = "sum += f32(current_element);";
152192
std::stringstream ss;
153193
ss << "var size: u32 = 1;\n"
154-
<< "for (var i: u32 = 0; i < uniforms.axes_size; i += 1) { \n"
155-
<< " let index = " << GetElementAt("uniforms.axes", "i", axes_size) << ";\n"
156-
<< " size = size * " << GetElementAt("uniforms.input_shape", "index", input_rank) << ";\n"
194+
<< "for (var i: u32 = 0; i < " << input_rank << "; i += 1) { \n"
195+
<< " let index_reduced_or_not = " << GetElementAt("uniforms.reduce_axes", "i", input_rank) << ";\n"
196+
<< " if (index_reduced_or_not == 1) { \n"
197+
<< " size = size * " << GetElementAt("uniforms.input_shape", "i", input_rank) << ";\n"
198+
<< " }\n"
157199
<< "}\n"
158200
<< "let output_value = output_value_t(sum / f32(size));";
159-
ReduceOpSpecificCode code({"var sum = f32(0);", "sum += f32(current_element);", ss.str()});
201+
std::string loop_footer = ss.str();
202+
ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
160203
return code;
161204
}
162205

163-
Status ReduceMean::ComputeInternal(ComputeContext& ctx) const {
164-
return ReduceKernel<true>::ComputeInternal(ctx);
206+
ReduceOpSpecificCode ReduceMax::GetOpSpecificCode(const Tensor* input_tensor) const {
207+
ORT_UNUSED_PARAMETER(input_tensor);
208+
std::string loop_header = "var max_element = first_element;";
209+
std::string loop_body = "max_element = max(max_element, current_element);";
210+
std::string loop_footer = "let output_value = output_value_t(max_element);";
211+
ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
212+
return code;
213+
}
214+
ReduceOpSpecificCode ReduceSum::GetOpSpecificCode(const Tensor* input_tensor) const {
215+
ORT_UNUSED_PARAMETER(input_tensor);
216+
std::string loop_header = "var sum = f32(0);";
217+
std::string loop_body = "sum += f32(current_element);";
218+
std::string loop_footer = "let output_value = output_value_t(sum);";
219+
ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
220+
return code;
165221
}
166222

167223
} // namespace webgpu

onnxruntime/core/providers/webgpu/reduction/reduction_ops.h

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,23 @@ namespace webgpu {
1313
// reduceOpSpecificCode is a 3-element array of strings that represent the op specific code for the reduce operation.
1414
// The first element is the loop header, the second element is the loop body, and the third element is the loop footer.
1515
// The loop header is the code that is executed before the loop starts. The loop body is the code that is executed for each element in the loop.
16-
// The loop footer is the code that is executed after the loop ends.
16+
// The loop footer is the code that is executed after the loop ends. The loop body should contain the code that accumulates the result of the reduction and
17+
// the loop footer should contain the code that assigns the result of the reduction to output_value.
1718
typedef std::array<std::string, 3> ReduceOpSpecificCode;
1819
class ReduceKernelProgram final : public Program<ReduceKernelProgram> {
1920
public:
20-
ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code) : Program{name}, keepdims_(keepdims), no_op_with_empty_axes_(no_op_with_empty_axes), axes_(axes.begin(), axes.end()), code_(code) {}
21+
ReduceKernelProgram(std::string name, bool keepdims, bool no_op_with_empty_axes, const InlinedVector<uint32_t>& axes, ReduceOpSpecificCode code, bool is_input_empty) : Program{name}, keepdims_(keepdims), no_op_with_empty_axes_(no_op_with_empty_axes), axes_(axes.begin(), axes.end()), code_(code), is_input_empty_(is_input_empty) {}
2122
Status GenerateShaderCode(ShaderHelper& wgpuShaderModuleAddRef) const override;
2223
WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
2324
{"no_op_with_empty_axes", ProgramUniformVariableDataType::Uint32},
24-
{"axes", ProgramUniformVariableDataType::Uint32},
25-
{"axes_size", ProgramUniformVariableDataType::Uint32});
25+
{"reduce_axes", ProgramUniformVariableDataType::Uint32});
2626

2727
private:
2828
const bool keepdims_;
2929
const bool no_op_with_empty_axes_;
3030
InlinedVector<uint32_t> axes_;
3131
ReduceOpSpecificCode code_;
32+
bool is_input_empty_;
3233
};
3334

3435
template <bool allow_multi_axes = true>
@@ -39,23 +40,41 @@ class ReduceKernel : public WebGpuKernel, public ReduceKernelBase<allow_multi_ax
3940
using ReduceKernelBase<allow_multi_axes>::keepdims_;
4041
using ReduceKernelBase<allow_multi_axes>::select_last_index_;
4142

42-
ReduceKernel(const OpKernelInfo& info, std::string name, optional<int64_t> keepdims_override = {})
43+
ReduceKernel(const OpKernelInfo& info, std::string name, bool allow_empty_input = false, optional<int64_t> keepdims_override = {})
4344
: WebGpuKernel(info),
4445
ReduceKernelBase<allow_multi_axes>(info, keepdims_override),
45-
name_(name) {
46+
name_(name),
47+
allow_empty_input_(allow_empty_input) {
4648
}
4749
Status ComputeInternal(ComputeContext& ctx) const;
48-
virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const = 0;
50+
virtual ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const = 0;
51+
52+
Status CheckInput(const Tensor* input_tensor) const {
53+
ORT_ENFORCE(input_tensor != nullptr && (input_tensor->Shape().Size() > 0 || allow_empty_input_), "Input tensor cannot be null or empty");
54+
return Status::OK();
55+
}
4956

5057
private:
5158
std::string name_;
59+
bool allow_empty_input_;
5260
};
5361

5462
class ReduceMean final : public ReduceKernel<true> {
5563
public:
56-
ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMean") {}
57-
ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor, size_t axes_size) const override;
58-
Status ComputeInternal(ComputeContext& ctx) const override;
64+
ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMean", true) {}
65+
ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
66+
};
67+
68+
class ReduceMax final : public ReduceKernel<true> {
69+
public:
70+
ReduceMax(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceMax") {}
71+
ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
72+
};
73+
74+
class ReduceSum final : public ReduceKernel<true> {
75+
public:
76+
ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info, "ReduceSum", true) {}
77+
ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
5978
};
6079

6180
} // namespace webgpu

onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -513,11 +513,11 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
513513
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Squeeze)>,
514514
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Squeeze)>,
515515
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Squeeze)>,
516-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
517-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
518-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
519-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
520-
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
516+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMax)>,
517+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 11, ReduceMax)>,
518+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 12, 12, ReduceMax)>,
519+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceMax)>,
520+
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceMax)>,
521521

522522
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceMean)>,
523523
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceMean)>,
@@ -539,9 +539,9 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
539539
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 17, ReduceProd)>,
540540
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 18, ReduceProd)>,
541541

542-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
543-
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
544-
// BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,
542+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceSum)>,
543+
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceSum)>,
544+
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ReduceSum)>,
545545

546546
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ReduceL1)>,
547547
// BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ReduceL1)>,

0 commit comments

Comments
 (0)