Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion backends/webgpu/runtime/WebGPUBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@

#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/platform/log.h>

#include <vector>

#include <new>

namespace executorch {
Expand All @@ -35,6 +38,7 @@ using executorch::runtime::Error;
using executorch::runtime::EValue;
using executorch::runtime::FreeableBuffer;
using executorch::runtime::register_backend;
using executorch::runtime::resize_tensor;
using executorch::runtime::Result;
using executorch::runtime::Span;

Expand Down Expand Up @@ -108,11 +112,32 @@ Error WebGPUBackend::execute(
}
// Fail loud as a runtime Error so a throw never crosses the backend boundary.
try {
// Dynamic shapes: shrink each input to its live sizes before upload
// (mirrors Vulkan maybe_resize_input). No-op when unchanged, so a static
// graph is byte-identical.
for (size_t i = 0; i < num_inputs; i++) {
const auto sizes = args[i]->toTensor().sizes();
std::vector<int64_t> new_dims(sizes.begin(), sizes.end());
graph->resize_input(graph->input_ids()[i], new_dims);
}
graph->copy_inputs(inputs);
graph->update_symints_from_inputs(inputs);
graph->propagate_resize();
// Resize each output EValue to its live shape so the readback length is
// correct (mirrors Vulkan maybe_resize_output).
for (size_t i = 0; i < num_outputs; i++) {
const auto& cd = graph->cur_dims(graph->output_ids()[i]);
std::vector<executorch::aten::SizesType> osizes(cd.begin(), cd.end());
Error e = resize_tensor(
args[num_inputs + i]->toTensor(),
ArrayRef<executorch::aten::SizesType>(osizes.data(), osizes.size()));
if (e != Error::Ok) {
ET_LOG(Error, "WebGPU: output %zu resize failed", i);
return Error::Internal;
}
}
} catch (const std::exception& e) {
ET_LOG(Error, "WebGPU input copy / symint refresh failed: %s", e.what());
ET_LOG(Error, "WebGPU input/output resize / copy failed: %s", e.what());
return Error::Internal;
}

Expand Down
117 changes: 103 additions & 14 deletions backends/webgpu/runtime/WebGPUGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
Expand Down Expand Up @@ -62,6 +63,18 @@ bool vk_datatype_is_int(vkgraph::VkDataType dtype) {
}
}

// Normalize a possibly-negative dim against rank; throws (fail-loud) if OOR.
int normalize_dim(int dim, int rank, const char* op) {
if (dim < 0) {
dim += rank;
}
if (dim < 0 || dim >= rank) {
throw std::runtime_error(
std::string("WebGPU ") + op + ": dim out of range");
}
return dim;
}

} // namespace

WebGPUGraph::WebGPUGraph() = default;
Expand Down Expand Up @@ -104,11 +117,10 @@ void WebGPUGraph::update_symints_from_inputs(
throw std::runtime_error(
"select_as_symint: source tensor is not a graph input");
}
const auto& dims = tensors_[src.input_tensor_id].dims;
int dim = src.dim < 0 ? src.dim + static_cast<int>(dims.size()) : src.dim;
if (dim < 0 || dim >= static_cast<int>(dims.size())) {
throw std::runtime_error("select_as_symint: dim out of range");
}
// Live cur_dims: the source may be a dynamic-shape input.
const auto& dims = tensors_[src.input_tensor_id].cur_dims;
int dim = normalize_dim(
src.dim, static_cast<int>(dims.size()), "select_as_symint");
int index = src.index;
if (index < 0) {
index += static_cast<int>(dims[dim]);
Expand All @@ -129,9 +141,9 @@ void WebGPUGraph::update_symints_from_inputs(
}
// Reads the [0,..,index,..,0] element; symint sources are scalar-ish.
const int64_t offset = static_cast<int64_t>(index) * stride;
// elem_size back-derived from build-time numel (sources are static-shaped).
const void* host = inputs[pos].data;
const size_t elem_size = inputs[pos].nbytes / static_cast<size_t>(numel);
// Stored elem_size (live nbytes/numel mis-derives for a dynamic source).
const size_t elem_size = tensors_[src.input_tensor_id].elem_size;
int32_t val;
if (elem_size == sizeof(int64_t)) {
val = static_cast<int32_t>(static_cast<const int64_t*>(host)[offset]);
Expand All @@ -143,6 +155,14 @@ void WebGPUGraph::update_symints_from_inputs(
}
set_symint(src.symint_id, val);
}
// sym_size.int: SymInt = a tensor's live dim (cur_dims). Usually unused (ops
// read cur_dims directly); for an intermediate source cur_dims is the build
// max here (hooks run later in propagate_resize), which is fine while unused.
for (const auto& s : symint_dim_sources_) {
const auto& d = tensors_[s.tensor_id].cur_dims;
int dim = normalize_dim(s.dim, static_cast<int>(d.size()), "sym_size");
set_symint(s.symint_id, static_cast<int32_t>(d[dim]));
}
}

void WebGPUGraph::set_symint(int id, int32_t val) {
Expand All @@ -158,16 +178,78 @@ void WebGPUGraph::set_symint(int id, int32_t val) {
}
}

void WebGPUGraph::set_cur_dims(
int value_id,
const std::vector<int64_t>& new_dims) {
auto& t = tensors_[value_id];
if (new_dims.size() != t.dims.size()) {
throw std::runtime_error("WebGPU resize: tensor rank changed");
}
size_t numel = 1;
for (size_t d = 0; d < new_dims.size(); d++) {
// 0-sized dims unsupported: live shapes are always in [1, max] per dim.
if (new_dims[d] <= 0) {
throw std::runtime_error("WebGPU resize: new dim must be positive");
}
if (new_dims[d] > t.dims[d]) {
throw std::runtime_error(
"WebGPU resize: new dim exceeds the max (serialized) allocation");
}
numel *= static_cast<size_t>(new_dims[d]);
}
const size_t new_nbytes = numel * t.elem_size;
if (t.cur_dims != new_dims) {
t.cur_dims = new_dims;
t.cur_nbytes = new_nbytes;
dirty_tensors_.insert(value_id);
}
}

void WebGPUGraph::resize_input(
int value_id,
const std::vector<int64_t>& new_dims) {
if (std::find(input_ids_.begin(), input_ids_.end(), value_id) ==
input_ids_.end()) {
throw std::runtime_error(
"WebGPUGraph::resize_input: value_id is not a graph input");
}
set_cur_dims(value_id, new_dims);
}

void WebGPUGraph::propagate_resize() {
if (dirty_symints_.empty()) {
if (dirty_symints_.empty() && dirty_tensors_.empty()) {
return;
}
// Hooks fire in registration (topological) order: operands update first.
for (auto& hook : resize_hooks_) {
if (dirty_symints_.count(hook.symint_id) != 0) {
hook.fn(*this);
}
}
dirty_symints_.clear();
// Tensor hooks: bounded fixpoint. A hook may dirty its output (cascading to a
// consumer); each pass handles the currently-dirty set. A forward DAG
// converges in <= depth passes (set_cur_dims re-dirties only on a change).
for (size_t pass = 0;
!dirty_tensors_.empty() && pass <= tensor_resize_hooks_.size();
pass++) {
std::unordered_set<int> processing;
processing.swap(dirty_tensors_);
for (auto& hook : tensor_resize_hooks_) {
if (processing.count(hook.trigger_tensor_id) != 0) {
hook.fn(*this);
}
}
}
if (!dirty_tensors_.empty()) {
throw std::runtime_error(
"WebGPU resize: tensor resize hooks did not converge");
}
// Tensor hooks must not set_symint (dirty_symints_ already drained above).
if (!dirty_symints_.empty()) {
throw std::runtime_error(
"WebGPU resize: a tensor resize hook set a SymInt; not supported");
}
}

WebGPUGraph::~WebGPUGraph() {
Expand Down Expand Up @@ -322,6 +404,10 @@ void WebGPUGraph::build(
tensor.elem_size = vk_datatype_size(vk_tensor->datatype());
tensor.is_int = vk_datatype_is_int(vk_tensor->datatype());
tensor.nbytes = numel * tensor.elem_size;
// Live dims start == max (serialized upper bound); resize_input shrinks
// them per call. Static graphs keep cur == max forever.
tensor.cur_dims = tensor.dims;
tensor.cur_nbytes = tensor.nbytes;

int constant_id = vk_tensor->constant_id();
int mem_obj_id = vk_tensor->mem_obj_id();
Expand Down Expand Up @@ -624,17 +710,20 @@ void WebGPUGraph::copy_inputs(const std::vector<InputData>& inputs) {
}
int tid = input_ids_[i];
const auto& tensor = tensors_[tid];
// Upload only the live (cur) bytes, not the max allocation; cur_nbytes ==
// nbytes on a static graph, so this is byte-identical there.
const size_t live_nbytes = tensor.cur_nbytes;

// Fast path: host and GPU element types match byte-for-byte.
if (in.nbytes == tensor.nbytes) {
wgpuQueueWriteBuffer(queue_, tensor.buffer, 0, in.data, tensor.nbytes);
if (in.nbytes == live_nbytes) {
wgpuQueueWriteBuffer(queue_, tensor.buffer, 0, in.data, live_nbytes);
continue;
}

// Narrow int64 host indices into the int32 buffer (mirrors Vulkan).
const bool buffer_is_int32 = tensor.is_int && tensor.elem_size == 4;
if (in.host_is_int64 && buffer_is_int32 && in.nbytes == tensor.nbytes * 2) {
const size_t numel = tensor.nbytes / 4;
if (in.host_is_int64 && buffer_is_int32 && in.nbytes == live_nbytes * 2) {
const size_t numel = live_nbytes / 4;
const int64_t* src = static_cast<const int64_t*>(in.data);
std::vector<int32_t> narrowed(numel);
for (size_t e = 0; e < numel; e++) {
Expand All @@ -648,15 +737,15 @@ void WebGPUGraph::copy_inputs(const std::vector<InputData>& inputs) {
narrowed[e] = static_cast<int32_t>(src[e]);
}
wgpuQueueWriteBuffer(
queue_, tensor.buffer, 0, narrowed.data(), tensor.nbytes);
queue_, tensor.buffer, 0, narrowed.data(), live_nbytes);
continue;
}

throw std::runtime_error(
"WebGPU: unsupported input copy for input " + std::to_string(i) +
" (host " + std::to_string(in.nbytes) + " bytes" +
(in.host_is_int64 ? " int64" : "") + " vs buffer " +
std::to_string(tensor.nbytes) + " bytes)");
std::to_string(live_nbytes) + " bytes)");
}
}

Expand Down
56 changes: 52 additions & 4 deletions backends/webgpu/runtime/WebGPUGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,14 @@ namespace executorch::backends::webgpu {

struct WebGPUTensor {
WGPUBuffer buffer = nullptr;
// Max (allocation) dims/nbytes: the serialized upper-bound shape. The GPU
// buffer is sized from these and never reallocated (Vulkan allocate-at-max).
std::vector<int64_t> dims;
size_t nbytes = 0;
// Live dims/nbytes for dynamic shapes; always <= the max. == max on a static
// graph, so dynamic-resize logic keyed off these is inert there.
std::vector<int64_t> cur_dims;
size_t cur_nbytes = 0;
// Serialized (GPU-side) element type, used to narrow wider host inputs.
size_t elem_size = 0;
bool is_int = false;
Expand Down Expand Up @@ -171,14 +177,43 @@ class WebGPUGraph {
return symint_sources_;
}

// Records that a SymInt is a tensor's live dim size (sym_size.int), read from
// cur_dims at execute; distinct from SymIntSource (a scalar data element).
struct SymIntDimSource {
int symint_id;
int tensor_id;
int dim;
};
void add_symint_dim_source(int symint_id, int tensor_id, int dim) {
symint_dim_sources_.push_back({symint_id, tensor_id, dim});
}

// Execute-time select_as_symint read; mirrors Vulkan select_as_symint_impl.
void update_symints_from_inputs(const std::vector<InputData>& inputs);

// Per-SymInt resize hook; mirrors Vulkan DynamicDispatchNode::trigger_resize.
void add_resize_hook(int symint_id, std::function<void(WebGPUGraph&)> fn) {
resize_hooks_.push_back({symint_id, std::move(fn)});
}
// Run hooks for changed SymInts then clear; call before execute().

// Set a graph input's live dims (<= max) + dirty it; static path stays inert.
void resize_input(int value_id, const std::vector<int64_t>& new_dims);
// Set a tensor's live dims (an op resize hook calls this for its output to
// cascade to consumers); validates the new dims fit the max, never reallocs.
void set_cur_dims(int value_id, const std::vector<int64_t>& new_dims);
const std::vector<int64_t>& cur_dims(int value_id) const {
return tensors_[value_id].cur_dims;
}

// Per-tensor resize hook; mirrors Vulkan ExecuteNode::resize_fn. Runs in
// propagate_resize when trigger_tensor_id is dirty.
void add_tensor_resize_hook(
int trigger_tensor_id,
std::function<void(WebGPUGraph&)> fn) {
tensor_resize_hooks_.push_back({trigger_tensor_id, std::move(fn)});
}

// Run hooks for changed SymInts and tensors, then clear; call before execute.
void propagate_resize();

// Mutable dispatch access for resize hooks (to rewrite workgroup_count_x).
Expand All @@ -196,19 +231,22 @@ class WebGPUGraph {
return queue_;
}

void add_dispatch(WebGPUDispatch dispatch) {
// Returns the new dispatch's index (resize hooks rewrite it via dispatch_at).
size_t add_dispatch(WebGPUDispatch dispatch) {
dispatches_.push_back(dispatch);
return dispatches_.size() - 1;
}

// Record an in-graph-order buffer-to-buffer DMA (e.g. a flat copy).
void add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
// In-graph buffer-to-buffer DMA (e.g. flat copy); returns the dispatch index.
size_t add_buffer_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
WebGPUDispatch d;
d.kind = WebGPUDispatch::Kind::Copy;
d.copy_src = src;
d.copy_dst = dst;
d.copy_nbytes = nbytes;
d.kernel_name = "flat_copy";
dispatches_.push_back(d);
return dispatches_.size() - 1;
}

// Materialize a recorded prepack-routed constant into dst via one CPU->GPU
Expand Down Expand Up @@ -297,6 +335,7 @@ class WebGPUGraph {
};
std::unordered_map<int, SymIntSlot> symints_;
std::vector<SymIntSource> symint_sources_;
std::vector<SymIntDimSource> symint_dim_sources_;

// Resize hooks + the set of SymInts changed since the last propagate_resize.
struct ResizeHook {
Expand All @@ -306,6 +345,15 @@ class WebGPUGraph {
std::vector<ResizeHook> resize_hooks_;
std::unordered_set<int> dirty_symints_;

// Tensor-shape resize hooks + the set of tensors changed since the last
// propagate_resize (mirrors the SymInt pair above, for dynamic shapes).
struct TensorResizeHook {
int trigger_tensor_id;
std::function<void(WebGPUGraph&)> fn;
};
std::vector<TensorResizeHook> tensor_resize_hooks_;
std::unordered_set<int> dirty_tensors_;

std::vector<int> input_ids_;
std::vector<int> output_ids_;

Expand Down
13 changes: 13 additions & 0 deletions backends/webgpu/runtime/WebGPUUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>

namespace executorch::backends::webgpu::utils {

Expand All @@ -24,6 +25,18 @@ inline T div_up(T a, T b) {
return (a + b - 1) / b;
}

// Product of dims (live element count); used by dynamic-resize hooks.
inline uint64_t numel_of(const std::vector<int64_t>& dims) {
uint64_t n = 1;
for (int64_t v : dims) {
if (v < 0) {
throw std::runtime_error("numel_of: negative dimension");
}
n *= static_cast<uint64_t>(v);
}
return n;
}

// Clamp workgroup size to device limit (SwiftShader caps at 128).
inline uint32_t clamp_workgroup_size(WGPUDevice device, uint32_t desired) {
WGPULimits limits = {};
Expand Down
Loading
Loading