Optimize WebGPU Gemm with a tiled workgroup shared-memory shader

Xiaofei Han · Xiaofei Han · commit 99b07eb5d57c · 2025-03-18T15:35:57.000+08:00
diff --git a/onnxruntime/core/providers/webgpu/math/gemm.cc b/onnxruntime/core/providers/webgpu/math/gemm.cc
@@ -36,36 +36,107 @@ WEBGPU_GEMM_VERSIONED_KERNEL(11, 12)
 WEBGPU_GEMM_KERNEL(13)
 
 Status GemmProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  const uint32_t TILE_SIZE = 16;
+
+  // Add shared memory arrays
+  shader.AdditionalImplementation() << "var<workgroup> tile_a: array<array<output_value_t, " << TILE_SIZE << ">, " << TILE_SIZE << ">;\n"
+                                    << "var<workgroup> tile_b: array<array<output_value_t, " << TILE_SIZE << ">, " << TILE_SIZE << ">;\n\n";
+
   const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
 
-  shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
-                            << "  let m = global_idx / uniforms.N;\n"
-                            << "  let n = global_idx % uniforms.N;\n"
-                            << "  var value = output_value_t(0);\n"
-                            << "\n";
+  shader.MainFunctionBody() << "  var value = output_value_t(0);\n\n"
+                            << "  let tile_col_start = (workgroup_id.x % uniforms.num_tile_n) * " << TILE_SIZE << "u;\n"
+                            << "  let tile_row_start = (workgroup_id.x / uniforms.num_tile_n) * " << TILE_SIZE << "u;\n";
 
-  // When K == 0, we don't bind A and B. Because WebGPU doesn't support binding a zero-sized buffer,
   if (need_handle_matmul_) {
     const ShaderVariableHelper& A = shader.AddInput("A", ShaderUsage::UseUniform);
     const ShaderVariableHelper& B = shader.AddInput("B", ShaderUsage::UseUniform);
 
-    shader.MainFunctionBody() << "  for (var k = 0u; k < uniforms.K; k = k + 1u) {\n";
+    shader.MainFunctionBody()
+        << "  let num_tiles = (uniforms.K - 1u) / " << TILE_SIZE << "u + 1u;\n"
+        << "  var k_start = 0u;\n"
+        << "  for (var t = 0u; t < num_tiles; t = t + 1u) {\n";
 
+    // Fill workgroup shared memory
     if (transA_ && transB_) {
-      shader.MainFunctionBody() << "    value = value + " << A.GetByOffset("k * uniforms.M + m")
-                                << " * " << B.GetByOffset("n * uniforms.K + k") << ";\n";
+      shader.MainFunctionBody() << "    var col = tile_row_start + local_id.x;\n"
+                                << "    var row = k_start + local_id.y;\n"
+                                << "    if (col < uniforms.M && row < uniforms.K) {\n"
+                                << "      tile_a[local_id.y][local_id.x] = " << A.GetByOffset("row * uniforms.M + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_a[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n\n"
+                                << "    col = k_start + local_id.x;\n"
+                                << "    row = tile_col_start + local_id.y;\n"
+                                << "    if (col < uniforms.K && row < uniforms.N) {\n"
+                                << "      tile_b[local_id.y][local_id.x] = " << B.GetByOffset("row * uniforms.K + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_b[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n";
     } else if (transA_ && !transB_) {
-      shader.MainFunctionBody() << "    value = value + " << A.GetByOffset("k * uniforms.M + m")
-                                << " * " << B.GetByOffset("k * uniforms.N + n") << ";\n";
+      shader.MainFunctionBody() << "    var col = tile_row_start + local_id.x;\n"
+                                << "    var row = k_start + local_id.y;\n"
+                                << "    if (col < uniforms.M && row < uniforms.K) {\n"
+                                << "      tile_a[local_id.y][local_id.x] = " << A.GetByOffset("row * uniforms.M + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_a[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n\n"
+                                << "    col = tile_col_start + local_id.x;\n"
+                                << "    row = k_start + local_id.y;\n"
+                                << "    if (col < uniforms.N && row < uniforms.K) {\n"
+                                << "      tile_b[local_id.y][local_id.x] = " << B.GetByOffset("row * uniforms.N + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_b[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n";
     } else if (!transA_ && transB_) {
-      shader.MainFunctionBody() << "    value = value + " << A.GetByOffset("m * uniforms.K + k")
-                                << " * " << B.GetByOffset("n * uniforms.K + k") << ";\n";
+      shader.MainFunctionBody() << "    var col = k_start + local_id.x;\n"
+                                << "    var row = tile_row_start + local_id.y;\n"
+                                << "    if (col < uniforms.K && row < uniforms.M) {\n"
+                                << "      tile_a[local_id.y][local_id.x] = " << A.GetByOffset("row * uniforms.K + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_a[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n\n"
+                                << "    col = k_start + local_id.x;\n"
+                                << "    row = tile_col_start + local_id.y;\n"
+                                << "    if (col < uniforms.K && row < uniforms.N) {\n"
+                                << "      tile_b[local_id.y][local_id.x] = " << B.GetByOffset("row * uniforms.K + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_b[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n";
     } else {
-      shader.MainFunctionBody() << "    value = value + " << A.GetByOffset("m * uniforms.K + k")
-                                << " * " << B.GetByOffset("k * uniforms.N + n") << ";\n";
+      shader.MainFunctionBody() << "    var col = k_start + local_id.x;\n"
+                                << "    var row = tile_row_start + local_id.y;\n"
+                                << "    if (col < uniforms.K && row < uniforms.M) {\n"
+                                << "      tile_a[local_id.y][local_id.x] = " << A.GetByOffset("row * uniforms.K + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_a[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n\n"
+                                << "    col = tile_col_start + local_id.x;\n"
+                                << "    row = k_start + local_id.y;\n"
+                                << "    if (col < uniforms.N && row < uniforms.K) {\n"
+                                << "      tile_b[local_id.y][local_id.x] = " << B.GetByOffset("row * uniforms.N + col") << ";\n"
+                                << "    } else {\n"
+                                << "      tile_b[local_id.y][local_id.x] = output_value_t(0);\n"
+                                << "    }\n";
     }
-    shader.MainFunctionBody() << "  }\n"
-                              << "\n";
+
+    shader.MainFunctionBody() << "    k_start = k_start + " << TILE_SIZE << "u;\n"
+                              << "    workgroupBarrier();\n\n"
+                              << "    for (var k = 0u; k < " << TILE_SIZE << "u; k = k + 1u) {\n";
+
+    if (transA_ && transB_) {
+      shader.MainFunctionBody() << "      value = value + tile_a[k][local_id.y] * tile_b[local_id.x][k];\n";
+    } else if (transA_ && !transB_) {
+      shader.MainFunctionBody() << "      value = value + tile_a[k][local_id.y] * tile_b[k][local_id.x];\n";
+    } else if (!transA_ && transB_) {
+      shader.MainFunctionBody() << "      value = value + tile_a[local_id.y][k] * tile_b[local_id.x][k];\n";
+    } else {
+      shader.MainFunctionBody() << "      value = value + tile_a[local_id.y][k] * tile_b[k][local_id.x];\n";
+    }
+
+    shader.MainFunctionBody() << "    }\n"
+                              << "    workgroupBarrier();\n"
+                              << "  }\n\n";
   }
 
   // Calculate Alpha
@@ -76,11 +147,19 @@ Status GemmProgram::GenerateShaderCode(ShaderHelper& shader) const {
   // Calculate Bias
   if (need_handle_bias_) {
     const ShaderVariableHelper& C = shader.AddInput("C", ShaderUsage::UseUniform);
-    shader.MainFunctionBody() << "  value = value + output_value_t(uniforms.beta) * "
+    shader.MainFunctionBody() << "  let m = tile_row_start + local_id.y;\n"
+                              << "  let n = tile_col_start + local_id.x;\n"
+                              << "  value = value + output_value_t(uniforms.beta) * "
                               << C.GetByOffset(C.BroadcastedIndicesToOffset("vec2(m, n)", output)) << ";\n";
+  } else {
+    shader.MainFunctionBody() << "  let m = tile_row_start + local_id.y;\n"
+                              << "  let n = tile_col_start + local_id.x;\n";
   }
 
-  shader.MainFunctionBody() << output.SetByOffset("global_idx", "value") << "\n";
+  // Write output
+  shader.MainFunctionBody() << "  if (m < uniforms.M && n < uniforms.N) {\n"
+                            << "    " << output.SetByOffset("m * uniforms.N + n", "value") << "\n"
+                            << "  }\n";
 
   return Status::OK();
 }
@@ -132,16 +211,20 @@ Status Gemm::ComputeInternal(ComputeContext& context) const {
     program.AddInput({C, ProgramTensorMetadataDependency::Rank});
   }
 
+  const uint32_t TILE_SIZE = 16;
+  const uint32_t num_tile_n = (N + TILE_SIZE - 1) / TILE_SIZE;
+  const uint32_t num_tile_m = (M + TILE_SIZE - 1) / TILE_SIZE;
+
   program.AddOutputs({{Y, ProgramTensorMetadataDependency::Type}})
-      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
-      .SetWorkgroupSize(WORKGROUP_SIZE)
+      .SetDispatchGroupSize(num_tile_n * num_tile_m)
+      .SetWorkgroupSize(TILE_SIZE, TILE_SIZE)
       .AddUniformVariables({
-          {static_cast<uint32_t>(output_size)},  // output_size
-          {static_cast<uint32_t>(M)},            // M
-          {static_cast<uint32_t>(N)},            // N
-          {static_cast<uint32_t>(K)},            // K
-          {alpha_},                              // alpha
-          {beta_}                                // beta
+          {static_cast<uint32_t>(num_tile_n)},  // num_tile_n
+          {static_cast<uint32_t>(M)},           // M
+          {static_cast<uint32_t>(N)},           // N
+          {static_cast<uint32_t>(K)},           // K
+          {alpha_},                             // alpha
+          {beta_}                               // beta
       });
 
   return context.RunProgram(program);
diff --git a/onnxruntime/core/providers/webgpu/math/gemm.h b/onnxruntime/core/providers/webgpu/math/gemm.h
@@ -23,7 +23,7 @@ class GemmProgram final : public Program<GemmProgram> {
   Status GenerateShaderCode(ShaderHelper& sh) const override;
 
   WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES(
-      {"output_size", ProgramUniformVariableDataType::Uint32},
+      {"num_tile_n", ProgramUniformVariableDataType::Uint32},
       {"M", ProgramUniformVariableDataType::Uint32},
       {"N", ProgramUniformVariableDataType::Uint32},
       {"K", ProgramUniformVariableDataType::Uint32},