diff --git a/CHANGELOG.md b/CHANGELOG.md index 562f1bccc0e..23aeeb15109 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -136,6 +136,7 @@ By @beholdnec in [#8505](https://github.com/gfx-rs/wgpu/pull/8505). #### Metal +- Implement `MULTI_DRAW_INDIRECT_COUNT` via compute shader emulation. By @bromles in [#9659](https://github.com/gfx-rs/wgpu/pull/9659). - Add `metal::Queue::add_wait_event` / `add_signal_event` (with `remove_*` companions) to stage `MTLSharedEvent` waits/signals on the next `Queue::submit`, for GPU-side interop with foreign APIs. Waits run on an internal CB committed before user CBs. By @AdrianEddy in [#9483](https://github.com/gfx-rs/wgpu/pull/9483). - Unconditionally enable `Features::CLIP_DISTANCES`. By @ErichDonGubler in [#9270](https://github.com/gfx-rs/wgpu/pull/9270). - Added full support for mesh shaders, including in WGSL shaders. By @inner-daemons in [#8739](https://github.com/gfx-rs/wgpu/pull/8739). diff --git a/examples/features/src/lib.rs b/examples/features/src/lib.rs index 62a547f82fd..4e230de6f4f 100644 --- a/examples/features/src/lib.rs +++ b/examples/features/src/lib.rs @@ -17,6 +17,7 @@ pub mod hello_workgroups; pub mod mesh_shader; pub mod mipmap; pub mod msaa_line; +pub mod multi_draw_indirect_count; pub mod multiple_render_targets; pub mod multiview; pub mod ray_aabb_compute; @@ -56,6 +57,7 @@ fn all_tests() -> Vec { mipmap::TEST, mipmap::TEST_QUERY, msaa_line::TEST, + multi_draw_indirect_count::TEST, multiple_render_targets::TEST, ray_aabb_compute::TEST, ray_cube_compute::TEST, diff --git a/examples/features/src/main.rs b/examples/features/src/main.rs index 7dd7f4698b6..082ed5aa513 100644 --- a/examples/features/src/main.rs +++ b/examples/features/src/main.rs @@ -80,6 +80,12 @@ const EXAMPLES: &[ExampleDesc] = &[ webgl: true, webgpu: true, }, + ExampleDesc { + name: "multi_draw_indirect_count", + function: wgpu_examples::multi_draw_indirect_count::main, + webgl: false, + webgpu: false, + }, ExampleDesc { name: "multiple_render_targets", function: wgpu_examples::multiple_render_targets::main, diff --git a/examples/features/src/multi_draw_indirect_count/mod.rs b/examples/features/src/multi_draw_indirect_count/mod.rs new file mode 100644 index 00000000000..c0a8d26ac3f --- /dev/null +++ b/examples/features/src/multi_draw_indirect_count/mod.rs @@ -0,0 +1,231 @@ +use bytemuck::{Pod, Zeroable}; +use wgpu::util::DeviceExt; + +const EXAMPLE_NAME: &str = "multi_draw_indirect_count"; + +const NUM_QUADS: u32 = 16; +const QUAD_SIZE: f32 = 0.2; + +#[repr(C)] +#[derive(Clone, Copy, Pod, Zeroable)] +struct Vertex { + position: [f32; 2], + color: [f32; 3], +} + +struct Example { + pipeline: wgpu::RenderPipeline, + vertex_buffer: wgpu::Buffer, + indirect_buffer: wgpu::Buffer, + count_buffer: wgpu::Buffer, +} + +impl crate::framework::Example for Example { + fn required_features() -> wgpu::Features { + wgpu::Features::MULTI_DRAW_INDIRECT_COUNT + } + + fn required_limits() -> wgpu::Limits { + wgpu::Limits::downlevel_defaults() + } + + fn init( + config: &wgpu::SurfaceConfiguration, + _adapter: &wgpu::Adapter, + device: &wgpu::Device, + _queue: &wgpu::Queue, + ) -> Self { + let shader = device.create_shader_module(wgpu::include_wgsl!("shader.wgsl")); + + let pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { + label: Some("Pipeline"), + layout: None, + vertex: wgpu::VertexState { + module: &shader, + entry_point: Some("vs_main"), + compilation_options: Default::default(), + buffers: &[Some(wgpu::VertexBufferLayout { + array_stride: size_of::() as u64, + step_mode: wgpu::VertexStepMode::Vertex, + attributes: &wgpu::vertex_attr_array![ + 0 => Float32x2, + 1 => Float32x3, + ], + })], + }, + primitive: wgpu::PrimitiveState { + topology: wgpu::PrimitiveTopology::TriangleList, + ..Default::default() + }, + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + fragment: Some(wgpu::FragmentState { + module: &shader, + entry_point: Some("fs_main"), + compilation_options: Default::default(), + targets: &[Some(wgpu::ColorTargetState { + format: config.format, + blend: Some(wgpu::BlendState::ALPHA_BLENDING), + write_mask: wgpu::ColorWrites::ALL, + })], + }), + multiview_mask: None, + cache: None, + }); + + let cols = 4; + let mut vertices = Vec::new(); + let mut indirect_args = Vec::new(); + + for i in 0..NUM_QUADS { + let row = i / cols; + let col = i % cols; + let cx = -0.75 + col as f32 * 0.5; + let cy = 0.75 - row as f32 * 0.5; + let s = QUAD_SIZE; + + let hue = i as f32 / NUM_QUADS as f32; + let (r, g, b) = hsv_to_rgb(hue, 0.8, 0.9); + + vertices.extend_from_slice(&[ + Vertex { + position: [cx - s, cy - s], + color: [r, g, b], + }, + Vertex { + position: [cx + s, cy - s], + color: [r, g, b], + }, + Vertex { + position: [cx - s, cy + s], + color: [r, g, b], + }, + Vertex { + position: [cx + s, cy - s], + color: [r, g, b], + }, + Vertex { + position: [cx + s, cy + s], + color: [r, g, b], + }, + Vertex { + position: [cx - s, cy + s], + color: [r, g, b], + }, + ]); + + let first_vertex = i * 6; + indirect_args.push(wgpu::util::DrawIndirectArgs { + vertex_count: 6, + instance_count: 1, + first_vertex, + first_instance: 0, + }); + } + + let vertex_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("Vertex Buffer"), + contents: bytemuck::cast_slice(&vertices), + usage: wgpu::BufferUsages::VERTEX, + }); + + let indirect_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("Indirect Buffer"), + contents: bytemuck::cast_slice(&indirect_args), + usage: wgpu::BufferUsages::INDIRECT, + }); + + let count_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("Count Buffer"), + contents: bytemuck::cast_slice::(&[NUM_QUADS]), + usage: wgpu::BufferUsages::INDIRECT, + }); + + Example { + pipeline, + vertex_buffer, + indirect_buffer, + count_buffer, + } + } + + fn resize( + &mut self, + _config: &wgpu::SurfaceConfiguration, + _device: &wgpu::Device, + _queue: &wgpu::Queue, + ) { + } + + fn update(&mut self, _event: winit::event::WindowEvent) {} + + fn render(&mut self, view: &wgpu::TextureView, device: &wgpu::Device, queue: &wgpu::Queue) { + let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("Encoder"), + }); + + { + let mut rpass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: Some("Render Pass"), + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view, + resolve_target: None, + ops: wgpu::Operations { + load: wgpu::LoadOp::Clear(wgpu::Color::BLACK), + store: wgpu::StoreOp::Store, + }, + depth_slice: None, + })], + depth_stencil_attachment: None, + timestamp_writes: None, + occlusion_query_set: None, + multiview_mask: None, + }); + + rpass.set_pipeline(&self.pipeline); + rpass.set_vertex_buffer(0, self.vertex_buffer.slice(..)); + rpass.multi_draw_indirect_count( + &self.indirect_buffer, + 0, + &self.count_buffer, + 0, + NUM_QUADS, + ); + } + + queue.submit([encoder.finish()]); + } +} + +fn hsv_to_rgb(h: f32, s: f32, v: f32) -> (f32, f32, f32) { + let i = (h * 6.0) as i32 % 6; + let f = h * 6.0 - (h * 6.0).floor(); + let p = v * (1.0 - s); + let q = v * (1.0 - f * s); + let t = v * (1.0 - (1.0 - f) * s); + match i { + 0 => (v, t, p), + 1 => (q, v, p), + 2 => (p, v, t), + 3 => (p, q, v), + 4 => (t, p, v), + _ => (v, p, q), + } +} + +pub fn main() { + crate::framework::run::(EXAMPLE_NAME); +} + +#[cfg(test)] +#[wgpu_test::gpu_test] +pub static TEST: crate::framework::ExampleTestParams = crate::framework::ExampleTestParams { + name: EXAMPLE_NAME, + image_path: "/examples/features/src/multi_draw_indirect_count/screenshot.png", + width: 256, + height: 256, + optional_features: wgpu::Features::default(), + base_test_parameters: wgpu_test::TestParameters::default(), + comparisons: &[wgpu_test::ComparisonType::Mean(0.02)], + _phantom: std::marker::PhantomData::, +}; diff --git a/examples/features/src/multi_draw_indirect_count/screenshot.png b/examples/features/src/multi_draw_indirect_count/screenshot.png new file mode 100644 index 00000000000..791f8fe8330 Binary files /dev/null and b/examples/features/src/multi_draw_indirect_count/screenshot.png differ diff --git a/examples/features/src/multi_draw_indirect_count/shader.wgsl b/examples/features/src/multi_draw_indirect_count/shader.wgsl new file mode 100644 index 00000000000..0ea1566d453 --- /dev/null +++ b/examples/features/src/multi_draw_indirect_count/shader.wgsl @@ -0,0 +1,17 @@ +struct VertexOutput { + @builtin(position) position: vec4f, + @location(0) color: vec3f, +} + +@vertex +fn vs_main(@location(0) position: vec2f, @location(1) color: vec3f) -> VertexOutput { + var out: VertexOutput; + out.position = vec4f(position, 0.0, 1.0); + out.color = color; + return out; +} + +@fragment +fn fs_main(in: VertexOutput) -> @location(0) vec4f { + return vec4f(in.color, 1.0); +} diff --git a/tests/tests/wgpu-gpu/draw_indirect.rs b/tests/tests/wgpu-gpu/draw_indirect.rs index 391253c3d2a..e707fecf9e9 100644 --- a/tests/tests/wgpu-gpu/draw_indirect.rs +++ b/tests/tests/wgpu-gpu/draw_indirect.rs @@ -29,6 +29,10 @@ pub fn all_tests(vec: &mut Vec) { INDIRECT_BUFFER_OFFSETS, MULTI_DRAW_INDEXED_INDIRECT, MULTI_DRAW_INDIRECT, + MULTI_DRAW_INDIRECT_COUNT, + MULTI_DRAW_INDEXED_INDIRECT_COUNT, + MULTI_DRAW_INDIRECT_COUNT_PARTIAL_COUNT, + MULTI_DRAW_INDEXED_INDIRECT_COUNT_PARTIAL_COUNT, ]); } @@ -833,3 +837,242 @@ static MULTI_DRAW_INDIRECT: GpuTestConfiguration = GpuTestConfiguration::new() .limits(wgpu::Limits::downlevel_defaults()), ) .run_async(|ctx| run_test_inner(ctx, get_draw_test_data(0, 6), false, true)); + +async fn run_test_multi_draw_indirect_count( + ctx: TestingContext, + test_data: TestData, + actual_count: u32, + max_count: u32, +) { + let vertex_buffer_layout = wgpu::VertexBufferLayout { + array_stride: 8, + step_mode: wgpu::VertexStepMode::Vertex, + attributes: &vertex_attr_array![0 => Float32x2], + }; + let vertex_buffer = ctx.device.create_buffer_init(&BufferInitDescriptor { + label: None, + contents: bytemuck::cast_slice(test_data.vertex_buffer_content()), + usage: wgpu::BufferUsages::VERTEX, + }); + + let (index_buffer, index_format) = match test_data.kind { + Kind::NonIndexed { .. } => (None, wgpu::IndexFormat::default()), + Kind::Indexed { + index_buffer_content, + .. + } => ( + Some(ctx.device.create_buffer_init(&BufferInitDescriptor { + label: None, + contents: bytemuck::cast_slice(index_buffer_content), + usage: wgpu::BufferUsages::INDEX, + })), + wgpu::IndexFormat::Uint32, + ), + }; + + let shader_src = " + @vertex + fn vs_main(@location(0) position: vec2f) -> @builtin(position) vec4f { + return vec4f(position, 0.0, 1.0); + } + + @fragment + fn fs_main() -> @location(0) vec4f { + return vec4f(1.0); + } + "; + + let shader = ctx + .device + .create_shader_module(wgpu::ShaderModuleDescriptor { + label: None, + source: wgpu::ShaderSource::Wgsl(shader_src.into()), + }); + + let pipeline_desc = wgpu::RenderPipelineDescriptor { + label: None, + layout: None, + vertex: wgpu::VertexState { + buffers: &[Some(vertex_buffer_layout)], + module: &shader, + entry_point: Some("vs_main"), + compilation_options: Default::default(), + }, + primitive: wgpu::PrimitiveState::default(), + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + fragment: Some(wgpu::FragmentState { + module: &shader, + entry_point: Some("fs_main"), + compilation_options: Default::default(), + targets: &[Some(wgpu::ColorTargetState { + format: wgpu::TextureFormat::R8Unorm, + blend: None, + write_mask: wgpu::ColorWrites::ALL, + })], + }), + multiview_mask: None, + cache: None, + }; + let pipeline = ctx.device.create_render_pipeline(&pipeline_desc); + + let out_texture = ctx.device.create_texture(&wgpu::TextureDescriptor { + label: None, + size: wgpu::Extent3d { + width: 256, + height: 256, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: wgpu::TextureDimension::D2, + format: wgpu::TextureFormat::R8Unorm, + usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC, + view_formats: &[], + }); + let out_texture_view = out_texture.create_view(&wgpu::TextureViewDescriptor::default()); + + let readback_buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: 256 * 256, + usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ, + mapped_at_creation: false, + }); + + let mut indirect_bytes = Vec::new(); + for _ in 0..max_count { + test_data.write_indirect_args(&mut indirect_bytes); + } + let indirect_buffer = ctx.device.create_buffer_init(&BufferInitDescriptor { + label: None, + contents: &indirect_bytes, + usage: wgpu::BufferUsages::INDIRECT, + }); + + let count_buffer = ctx.device.create_buffer_init(&BufferInitDescriptor { + label: None, + contents: bytemuck::cast_slice::(&[actual_count]), + usage: wgpu::BufferUsages::INDIRECT, + }); + + let mut encoder = ctx + .device + .create_command_encoder(&wgpu::CommandEncoderDescriptor::default()); + + { + let mut rpass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: None, + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view: &out_texture_view, + resolve_target: None, + ops: wgpu::Operations { + load: wgpu::LoadOp::Clear(wgpu::Color::WHITE), + store: wgpu::StoreOp::Store, + }, + depth_slice: None, + })], + depth_stencil_attachment: None, + timestamp_writes: None, + occlusion_query_set: None, + multiview_mask: None, + }); + + rpass.set_pipeline(&pipeline); + rpass.set_vertex_buffer(0, vertex_buffer.slice(..)); + if let Some(ref index_buffer) = index_buffer { + rpass.set_index_buffer(index_buffer.slice(..), index_format); + } + + if index_buffer.is_some() { + rpass.multi_draw_indexed_indirect_count( + &indirect_buffer, + 0, + &count_buffer, + 0, + max_count, + ); + } else { + rpass.multi_draw_indirect_count(&indirect_buffer, 0, &count_buffer, 0, max_count); + } + } + + encoder.copy_texture_to_buffer( + wgpu::TexelCopyTextureInfo { + texture: &out_texture, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::TexelCopyBufferInfo { + buffer: &readback_buffer, + layout: wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(256), + rows_per_image: None, + }, + }, + wgpu::Extent3d { + width: 256, + height: 256, + depth_or_array_layers: 1, + }, + ); + + ctx.queue.submit([encoder.finish()]); + + let slice = readback_buffer.slice(..); + slice.map_async(wgpu::MapMode::Read, |_| ()); + + ctx.async_poll(wgpu::PollType::wait_indefinitely()) + .await + .unwrap(); + + let data = slice.get_mapped_range().unwrap(); + let succeeded = data.iter().all(|b| *b == u8::MAX); + assert!(succeeded); +} + +#[gpu_test] +static MULTI_DRAW_INDIRECT_COUNT: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .downlevel_flags(wgpu::DownlevelFlags::INDIRECT_EXECUTION) + .features(wgpu::Features::MULTI_DRAW_INDIRECT_COUNT) + .limits(wgpu::Limits::downlevel_defaults()), + ) + .run_async(|ctx| run_test_multi_draw_indirect_count(ctx, get_draw_test_data(0, 6), 2, 2)); + +#[gpu_test] +static MULTI_DRAW_INDEXED_INDIRECT_COUNT: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .downlevel_flags(wgpu::DownlevelFlags::INDIRECT_EXECUTION) + .features(wgpu::Features::MULTI_DRAW_INDIRECT_COUNT) + .limits(wgpu::Limits::downlevel_defaults()), + ) + .run_async(|ctx| { + run_test_multi_draw_indirect_count(ctx, get_indexed_draw_test_data(0, 6), 2, 2) + }); + +#[gpu_test] +static MULTI_DRAW_INDIRECT_COUNT_PARTIAL_COUNT: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .downlevel_flags(wgpu::DownlevelFlags::INDIRECT_EXECUTION) + .features(wgpu::Features::MULTI_DRAW_INDIRECT_COUNT) + .limits(wgpu::Limits::downlevel_defaults()), + ) + .run_async(|ctx| run_test_multi_draw_indirect_count(ctx, get_draw_test_data(0, 6), 1, 4)); + +#[gpu_test] +static MULTI_DRAW_INDEXED_INDIRECT_COUNT_PARTIAL_COUNT: GpuTestConfiguration = + GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .downlevel_flags(wgpu::DownlevelFlags::INDIRECT_EXECUTION) + .features(wgpu::Features::MULTI_DRAW_INDIRECT_COUNT) + .limits(wgpu::Limits::downlevel_defaults()), + ) + .run_async(|ctx| { + run_test_multi_draw_indirect_count(ctx, get_indexed_draw_test_data(0, 6), 1, 4) + }); diff --git a/wgpu-core/src/command/compute.rs b/wgpu-core/src/command/compute.rs index 761aaca5e5e..3b0adc0d773 100644 --- a/wgpu-core/src/command/compute.rs +++ b/wgpu-core/src/command/compute.rs @@ -629,6 +629,7 @@ pub(super) fn encode_compute_pass( as_actions: parent_state.as_actions, temp_resources: parent_state.temp_resources, indirect_draw_validation_resources: parent_state.indirect_draw_validation_resources, + multi_draw_resources: parent_state.multi_draw_resources, snatch_guard: parent_state.snatch_guard, debug_scope_depth: &mut debug_scope_depth, }, diff --git a/wgpu-core/src/command/encoder.rs b/wgpu-core/src/command/encoder.rs index 19222c6e3ad..09f92d7146e 100644 --- a/wgpu-core/src/command/encoder.rs +++ b/wgpu-core/src/command/encoder.rs @@ -42,6 +42,7 @@ pub(crate) struct EncodingState<'snatch_guard, 'cmd_enc, E: ?Sized = dyn hal::Dy pub(crate) temp_resources: &'cmd_enc mut Vec, pub(crate) indirect_draw_validation_resources: &'cmd_enc mut crate::indirect_validation::DrawResources, + pub(crate) multi_draw_resources: &'cmd_enc mut crate::multi_draw_emulation::MultiDrawResources, pub(crate) snatch_guard: &'snatch_guard SnatchGuard<'snatch_guard>, diff --git a/wgpu-core/src/command/mod.rs b/wgpu-core/src/command/mod.rs index fe5e0dd2c65..213d181a56f 100644 --- a/wgpu-core/src/command/mod.rs +++ b/wgpu-core/src/command/mod.rs @@ -804,6 +804,7 @@ pub(crate) struct BakedCommands { pub(crate) trackers: Tracker, pub(crate) temp_resources: Vec, pub(crate) indirect_draw_validation_resources: crate::indirect_validation::DrawResources, + pub(crate) multi_draw_resources: crate::multi_draw_emulation::MultiDrawResources, buffer_memory_init_actions: Vec, texture_memory_actions: CommandBufferTextureMemoryActions, } @@ -832,6 +833,7 @@ pub struct CommandBufferMutable { temp_resources: Vec, indirect_draw_validation_resources: crate::indirect_validation::DrawResources, + multi_draw_resources: crate::multi_draw_emulation::MultiDrawResources, pub(crate) commands: Vec>, @@ -848,6 +850,7 @@ impl CommandBufferMutable { trackers: self.trackers, temp_resources: self.temp_resources, indirect_draw_validation_resources: self.indirect_draw_validation_resources, + multi_draw_resources: self.multi_draw_resources, buffer_memory_init_actions: self.buffer_memory_init_actions, texture_memory_actions: self.texture_memory_actions, } @@ -904,6 +907,9 @@ impl CommandEncoder { temp_resources: Default::default(), indirect_draw_validation_resources: crate::indirect_validation::DrawResources::new(device.clone()), + multi_draw_resources: crate::multi_draw_emulation::MultiDrawResources::new( + device.clone(), + ), commands: Vec::new(), #[cfg(feature = "trace")] trace_commands: if device.trace.lock().is_some() { @@ -1049,6 +1055,7 @@ impl CommandEncoder { temp_resources: &mut cmd_buf_data.temp_resources, indirect_draw_validation_resources: &mut cmd_buf_data .indirect_draw_validation_resources, + multi_draw_resources: &mut cmd_buf_data.multi_draw_resources, snatch_guard: &snatch_guard, debug_scope_depth: &mut debug_scope_depth, }; @@ -1123,6 +1130,7 @@ impl CommandEncoder { temp_resources: &mut cmd_buf_data.temp_resources, indirect_draw_validation_resources: &mut cmd_buf_data .indirect_draw_validation_resources, + multi_draw_resources: &mut cmd_buf_data.multi_draw_resources, snatch_guard: &snatch_guard, debug_scope_depth: &mut debug_scope_depth, }; diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs index fbbf0bd858b..cdc1e1848d8 100644 --- a/wgpu-core/src/command/render.rs +++ b/wgpu-core/src/command/render.rs @@ -2076,6 +2076,8 @@ pub(super) fn encode_render_pass( let mut indirect_draw_validation_batcher = crate::indirect_validation::DrawBatcher::new(); + let mut pending_multi_draws: Vec = Vec::new(); + // We automatically keep extending command buffers over time, and because // we want to insert a command buffer _before_ what we're about to record, // we need to make sure to close the previous one. @@ -2144,6 +2146,7 @@ pub(super) fn encode_render_pass( temp_resources: parent_state.temp_resources, indirect_draw_validation_resources: parent_state .indirect_draw_validation_resources, + multi_draw_resources: parent_state.multi_draw_resources, snatch_guard: parent_state.snatch_guard, debug_scope_depth: &mut debug_scope_depth, }, @@ -2343,6 +2346,7 @@ pub(super) fn encode_render_pass( count_buffer_offset, max_count, family, + &mut pending_multi_draws, ) .map_pass_err(scope)?; } @@ -2523,6 +2527,18 @@ pub(super) fn encode_render_pass( ) .map_pass_err(pass_scope)?; } + + if let Some(ref multi_draw_emulation) = device.multi_draw_emulation { + multi_draw_emulation + .inject_emulation_pass( + device, + parent_state.multi_draw_resources, + transit, + pending_multi_draws, + parent_state.snatch_guard, + ) + .map_pass_err(pass_scope)?; + } } encoder.close_and_swap().map_pass_err(pass_scope)?; @@ -3191,6 +3207,7 @@ fn multi_draw_indirect_count( count_buffer_offset: u64, max_count: u32, family: DrawCommandFamily, + pending_multi_draws: &mut Vec, ) -> Result<(), RenderPassErrorInner> { api_log!( "RenderPass::multi_draw_indirect_count (family:{family:?}) {} {offset} {} {count_buffer_offset:?} {max_count:?}", @@ -3281,34 +3298,90 @@ fn multi_draw_indirect_count( ), ); - match family { - DrawCommandFamily::Draw => unsafe { - state.pass.base.raw_encoder.draw_indirect_count( - indirect_raw, - offset, - count_raw, - count_buffer_offset, - max_count, - ); - }, - DrawCommandFamily::DrawIndexed => unsafe { - state.pass.base.raw_encoder.draw_indexed_indirect_count( - indirect_raw, - offset, - count_raw, - count_buffer_offset, - max_count, - ); - }, - DrawCommandFamily::DrawMeshTasks => unsafe { - state.pass.base.raw_encoder.draw_mesh_tasks_indirect_count( - indirect_raw, - offset, - count_raw, - count_buffer_offset, - max_count, - ); - }, + if device.multi_draw_emulation.is_some() { + if max_count == 0 { + return Ok(()); + } + + let stride_u32 = stride as u32 / 4; + let temp_size = max_count as u64 * stride; + + let temp_buffer_index = state + .pass + .base + .multi_draw_resources + .acquire_temp_entry(temp_size, device.instance_flags) + .map_err(RenderPassErrorInner::Device)?; + + pending_multi_draws.push(crate::multi_draw_emulation::PendingDraw { + temp_buffer_index, + src_buffer: indirect_buffer, + count_buffer, + src_offset: offset, + count_offset: count_buffer_offset, + max_count, + stride_u32, + }); + + let temp_raw = state + .pass + .base + .multi_draw_resources + .get_temp_buffer(temp_buffer_index); + + match family { + DrawCommandFamily::Draw => unsafe { + state + .pass + .base + .raw_encoder + .draw_indirect(temp_raw, 0, max_count); + }, + DrawCommandFamily::DrawIndexed => unsafe { + state + .pass + .base + .raw_encoder + .draw_indexed_indirect(temp_raw, 0, max_count); + }, + DrawCommandFamily::DrawMeshTasks => unsafe { + state + .pass + .base + .raw_encoder + .draw_mesh_tasks_indirect(temp_raw, 0, max_count); + }, + } + } else { + match family { + DrawCommandFamily::Draw => unsafe { + state.pass.base.raw_encoder.draw_indirect_count( + indirect_raw, + offset, + count_raw, + count_buffer_offset, + max_count, + ); + }, + DrawCommandFamily::DrawIndexed => unsafe { + state.pass.base.raw_encoder.draw_indexed_indirect_count( + indirect_raw, + offset, + count_raw, + count_buffer_offset, + max_count, + ); + }, + DrawCommandFamily::DrawMeshTasks => unsafe { + state.pass.base.raw_encoder.draw_mesh_tasks_indirect_count( + indirect_raw, + offset, + count_raw, + count_buffer_offset, + max_count, + ); + }, + } } Ok(()) } diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index 413adab4472..a062d25bd18 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -344,6 +344,7 @@ pub(crate) struct EncoderInFlight { pub(crate) temp_resources: Vec, /// We only need to keep these resources alive. _indirect_draw_validation_resources: crate::indirect_validation::DrawResources, + _multi_draw_resources: crate::multi_draw_emulation::MultiDrawResources, /// These are the buffers that have been tracked by `PendingWrites`. pub(crate) pending_buffers: FastHashMap>, @@ -477,6 +478,9 @@ impl PendingWrites { _indirect_draw_validation_resources: crate::indirect_validation::DrawResources::new( device.clone(), ), + _multi_draw_resources: crate::multi_draw_emulation::MultiDrawResources::new( + device.clone(), + ), pending_buffers, pending_textures, pending_blas_s, @@ -1485,6 +1489,7 @@ impl Queue { temp_resources: baked.temp_resources, _indirect_draw_validation_resources: baked .indirect_draw_validation_resources, + _multi_draw_resources: baked.multi_draw_resources, pending_buffers: FastHashMap::default(), pending_textures: FastHashMap::default(), pending_blas_s: FastHashMap::default(), diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs index 834dd3459e8..1632123201f 100644 --- a/wgpu-core/src/device/resource.rs +++ b/wgpu-core/src/device/resource.rs @@ -273,6 +273,7 @@ pub struct Device { pub(crate) deferred_destroy: Mutex>, pub(crate) usage_scopes: UsageScopePool, pub(crate) indirect_validation: Option, + pub(crate) multi_draw_emulation: Option, // Optional so that we can late-initialize this after the queue is created. pub(crate) timestamp_normalizer: OnceCellOrLock, @@ -320,6 +321,9 @@ impl Drop for Device { if let Some(indirect_validation) = self.indirect_validation.take() { indirect_validation.dispose(self.raw.as_ref()); } + if let Some(multi_draw_emulation) = self.multi_draw_emulation.take() { + multi_draw_emulation.dispose(self.raw.as_ref()); + } if let Some(timestamp_normalizer) = self.timestamp_normalizer.take() { timestamp_normalizer.dispose(self.raw.as_ref()); } @@ -516,6 +520,19 @@ impl Device { None }; + let multi_draw_emulation = if desc + .required_features + .contains(wgt::Features::MULTI_DRAW_INDIRECT_COUNT) + && adapter.backend() == wgt::Backend::Metal + { + Some(crate::multi_draw_emulation::MultiDrawEmulation::new( + raw_device.as_ref(), + instance_flags, + )?) + } else { + None + }; + Ok(Self { raw: raw_device, adapter: adapter.clone(), @@ -559,6 +576,7 @@ impl Device { usage_scopes: Mutex::new(rank::DEVICE_USAGE_SCOPES, Default::default()), timestamp_normalizer: OnceCellOrLock::new(), indirect_validation, + multi_draw_emulation, }) } diff --git a/wgpu-core/src/lib.rs b/wgpu-core/src/lib.rs index 8a981a437d7..d22878ef802 100644 --- a/wgpu-core/src/lib.rs +++ b/wgpu-core/src/lib.rs @@ -81,6 +81,7 @@ mod init_tracker; pub mod instance; pub mod limits; mod lock; +mod multi_draw_emulation; pub mod pipeline; mod pipeline_cache; mod pool; diff --git a/wgpu-core/src/lock/rank.rs b/wgpu-core/src/lock/rank.rs index 401fb9c7b8f..cf747f35826 100644 --- a/wgpu-core/src/lock/rank.rs +++ b/wgpu-core/src/lock/rank.rs @@ -111,6 +111,7 @@ define_lock_ranks! { BUFFER_MAP_STATE, COMMAND_ALLOCATOR_FREE_ENCODERS, BUFFER_POOL, + MULTI_DRAW_EMULATION_TEMP_POOL, DEVICE_TRACE, DEVICE_USAGE_SCOPES, REGISTRY_STORAGE, @@ -126,6 +127,7 @@ define_lock_ranks! { BUFFER_BIND_GROUPS, BUFFER_INITIALIZATION_STATUS, BUFFER_POOL, + MULTI_DRAW_EMULATION_TEMP_POOL, DEVICE_TRACE, DEVICE_USAGE_SCOPES, REGISTRY_STORAGE, @@ -162,6 +164,7 @@ define_lock_ranks! { BUFFER_MAP_STATE, BUFFER_INITIALIZATION_STATUS, BUFFER_POOL, + MULTI_DRAW_EMULATION_TEMP_POOL, COMMAND_ALLOCATOR_FREE_ENCODERS, DEVICE_DEFERRED_DESTROY, DEVICE_TRACE, @@ -183,6 +186,7 @@ define_lock_ranks! { rank DEVICE_DEFERRED_DESTROY "Device::deferred_destroy" followed by { } rank DEVICE_TRACE "Device::trace" followed by { } rank DEVICE_USAGE_SCOPES "Device::usage_scopes" followed by { } + rank MULTI_DRAW_EMULATION_TEMP_POOL "MultiDrawEmulation::temp_pool" followed by { } rank REGISTRY_STORAGE "Registry::storage" followed by { } rank SHARED_TRACKER_INDEX_ALLOCATOR_INNER "SharedTrackerIndexAllocator::inner" followed by { } rank TEXTURE_BIND_GROUPS "Texture::bind_groups" followed by { } diff --git a/wgpu-core/src/multi_draw_emulation/mod.rs b/wgpu-core/src/multi_draw_emulation/mod.rs new file mode 100644 index 00000000000..85fbc9d6072 --- /dev/null +++ b/wgpu-core/src/multi_draw_emulation/mod.rs @@ -0,0 +1,518 @@ +use crate::{ + command::RenderPassErrorInner, + device::{Device, DeviceError}, + hal_label, + lock::{rank, Mutex}, + pipeline::{CreateComputePipelineError, CreateShaderModuleError}, + resource::{Buffer, InvalidResourceError, Labeled, RawResourceAccess}, + snatch::SnatchGuard, +}; +use alloc::{borrow::Cow, boxed::Box, string::ToString, sync::Arc, vec::Vec}; +use core::num::NonZeroU64; +use thiserror::Error; + +#[derive(Clone, Debug, Error)] +#[non_exhaustive] +enum CreateMultiDrawEmulationPipelineError { + #[error(transparent)] + DeviceError(#[from] DeviceError), + #[error(transparent)] + ShaderModule(#[from] CreateShaderModuleError), + #[error(transparent)] + ComputePipeline(#[from] CreateComputePipelineError), +} + +struct MultiDrawEmulationInner { + module: Box, + bgl: Box, + pipeline_layout: Box, + pipeline: Box, + temp_pool: Mutex>, +} + +/// This machinery requires the following limits: +/// +/// - max_bind_groups: 1, +/// - max_storage_buffers_per_shader_stage: 3, +/// - max_immediate_size: 16, +/// +/// These are all indirectly satisfied by `DownlevelFlags::INDIRECT_EXECUTION`, which is also +/// required for this module's functionality to work. +pub(crate) struct MultiDrawEmulation { + inner: Option, +} + +struct TempPoolEntry { + buffer: Box, + size: u64, +} + +pub(crate) struct PendingDraw { + pub(crate) temp_buffer_index: usize, + pub(crate) src_buffer: Arc, + pub(crate) count_buffer: Arc, + pub(crate) src_offset: u64, + pub(crate) count_offset: u64, + pub(crate) max_count: u32, + pub(crate) stride_u32: u32, +} + +pub(crate) struct MultiDrawResources { + device: Arc, + temp_entries: Vec, + bind_groups: Vec>, +} + +impl MultiDrawEmulation { + pub(crate) fn new( + device: &dyn hal::DynDevice, + instance_flags: wgt::InstanceFlags, + ) -> Result { + let inner = match MultiDrawEmulationInner::new(device, instance_flags) { + Ok(inner) => inner, + Err(e) => { + log::error!("multi-draw-emulation error: {e:?}"); + return Err(DeviceError::Lost); + } + }; + Ok(Self { inner: Some(inner) }) + } + + pub(crate) fn dispose(self, device: &dyn hal::DynDevice) { + let inner = self.inner.unwrap(); + for entry in inner.temp_pool.into_inner() { + unsafe { device.destroy_buffer(entry.buffer) }; + } + unsafe { device.destroy_compute_pipeline(inner.pipeline) }; + unsafe { device.destroy_pipeline_layout(inner.pipeline_layout) }; + unsafe { device.destroy_bind_group_layout(inner.bgl) }; + unsafe { device.destroy_shader_module(inner.module) }; + } + + fn acquire_temp_entry( + &self, + device: &dyn hal::DynDevice, + size: u64, + instance_flags: wgt::InstanceFlags, + ) -> Result { + let inner = self.inner.as_ref().unwrap(); + let mut pool = inner.temp_pool.lock(); + + if let Some(idx) = pool.iter().position(|e| e.size >= size) { + return Ok(pool.swap_remove(idx)); + } + + let buffer_desc = hal::BufferDescriptor { + label: hal_label( + Some("(wgpu internal) Multi-draw emulation temp buffer"), + instance_flags, + ), + size, + usage: wgt::BufferUses::STORAGE_READ_WRITE | wgt::BufferUses::INDIRECT, + memory_flags: hal::MemoryFlags::empty(), + }; + let buffer = + unsafe { device.create_buffer(&buffer_desc) }.map_err(DeviceError::from_hal)?; + + Ok(TempPoolEntry { buffer, size }) + } + + fn release_temp_entries(&self, entries: impl Iterator) { + let inner = self.inner.as_ref().unwrap(); + inner.temp_pool.lock().extend(entries); + } + + pub(crate) fn inject_emulation_pass( + &self, + device: &Arc, + resources: &mut MultiDrawResources, + encoder: &mut dyn hal::DynCommandEncoder, + pending: Vec, + snatch_guard: &SnatchGuard, + ) -> Result<(), RenderPassErrorInner> { + if pending.is_empty() { + return Ok(()); + } + + let inner = self.inner.as_ref().unwrap(); + + { + let mut barriers: Vec> = Vec::new(); + + for p in &pending { + let src_buf = p.src_buffer.try_raw(snatch_guard).map_err(|_| { + RenderPassErrorInner::InvalidResource(InvalidResourceError( + p.src_buffer.error_ident(), + )) + })?; + let count_buf = p.count_buffer.try_raw(snatch_guard).map_err(|_| { + RenderPassErrorInner::InvalidResource(InvalidResourceError( + p.count_buffer.error_ident(), + )) + })?; + let temp_buffer = resources.get_temp_buffer(p.temp_buffer_index); + barriers.push(hal::BufferBarrier { + buffer: src_buf, + usage: hal::StateTransition { + from: wgt::BufferUses::INDIRECT, + to: wgt::BufferUses::STORAGE_READ_ONLY, + }, + }); + barriers.push(hal::BufferBarrier { + buffer: count_buf, + usage: hal::StateTransition { + from: wgt::BufferUses::INDIRECT, + to: wgt::BufferUses::STORAGE_READ_ONLY, + }, + }); + barriers.push(hal::BufferBarrier { + buffer: temp_buffer, + usage: hal::StateTransition { + from: wgt::BufferUses::INDIRECT, + to: wgt::BufferUses::STORAGE_READ_WRITE, + }, + }); + } + + unsafe { encoder.transition_buffers(&barriers) }; + } + + let compute_desc = hal::ComputePassDescriptor { + label: hal_label( + Some("(wgpu internal) Multi-draw indirect count emulation pass"), + device.instance_flags, + ), + timestamp_writes: None, + }; + unsafe { encoder.begin_compute_pass(&compute_desc) }; + unsafe { encoder.set_compute_pipeline(inner.pipeline.as_ref()) }; + + let alignment = device.limits.min_storage_buffer_offset_alignment as u64; + + for p in &pending { + let temp_buffer = resources.get_temp_buffer(p.temp_buffer_index); + let src_buf = p.src_buffer.try_raw(snatch_guard).map_err(|_| { + RenderPassErrorInner::InvalidResource(InvalidResourceError( + p.src_buffer.error_ident(), + )) + })?; + let count_buf = p.count_buffer.try_raw(snatch_guard).map_err(|_| { + RenderPassErrorInner::InvalidResource(InvalidResourceError( + p.count_buffer.error_ident(), + )) + })?; + + let aligned_src_offset = p.src_offset - p.src_offset % alignment; + let src_remainder_u32 = ((p.src_offset - aligned_src_offset) / 4) as u32; + + let aligned_count_offset = p.count_offset - p.count_offset % alignment; + let count_remainder_u32 = ((p.count_offset - aligned_count_offset) / 4) as u32; + let count_binding_size = NonZeroU64::new((count_remainder_u32 as u64 + 1) * 4).unwrap(); + + let temp_size = NonZeroU64::new(p.max_count as u64 * p.stride_u32 as u64 * 4).unwrap(); + + let bg_desc = hal::BindGroupDescriptor { + label: hal_label( + Some("(wgpu internal) Multi-draw emulation bind group"), + device.instance_flags, + ), + layout: inner.bgl.as_ref(), + entries: &[ + hal::BindGroupEntry { + binding: 0, + resource_index: 0, + count: 1, + }, + hal::BindGroupEntry { + binding: 1, + resource_index: 1, + count: 1, + }, + hal::BindGroupEntry { + binding: 2, + resource_index: 2, + count: 1, + }, + ], + buffers: &[ + hal::BufferBinding::new_unchecked(src_buf, aligned_src_offset, None), + hal::BufferBinding::new_unchecked(temp_buffer, 0, Some(temp_size)), + hal::BufferBinding::new_unchecked( + count_buf, + aligned_count_offset, + Some(count_binding_size), + ), + ], + samplers: &[], + textures: &[], + acceleration_structures: &[], + external_textures: &[], + }; + + let bg = unsafe { device.raw().create_bind_group(&bg_desc) } + .map_err(DeviceError::from_hal) + .map_err(RenderPassErrorInner::Device)?; + + unsafe { encoder.set_bind_group(inner.pipeline_layout.as_ref(), 0, bg.as_ref(), &[]) }; + unsafe { + encoder.set_immediates( + inner.pipeline_layout.as_ref(), + 0, + &[ + p.max_count, + p.stride_u32, + src_remainder_u32, + count_remainder_u32, + ], + ) + }; + + let wg_count = p.max_count.div_ceil(64); + unsafe { encoder.dispatch_workgroups([wg_count, 1, 1]) }; + + resources.bind_groups.push(bg); + } + + unsafe { encoder.end_compute_pass() }; + + { + let mut barriers: Vec> = Vec::new(); + + for p in &pending { + let src_buf = p.src_buffer.try_raw(snatch_guard).map_err(|_| { + RenderPassErrorInner::InvalidResource(InvalidResourceError( + p.src_buffer.error_ident(), + )) + })?; + let count_buf = p.count_buffer.try_raw(snatch_guard).map_err(|_| { + RenderPassErrorInner::InvalidResource(InvalidResourceError( + p.count_buffer.error_ident(), + )) + })?; + let temp_buffer = resources.get_temp_buffer(p.temp_buffer_index); + barriers.push(hal::BufferBarrier { + buffer: src_buf, + usage: hal::StateTransition { + from: wgt::BufferUses::STORAGE_READ_ONLY, + to: wgt::BufferUses::INDIRECT, + }, + }); + barriers.push(hal::BufferBarrier { + buffer: count_buf, + usage: hal::StateTransition { + from: wgt::BufferUses::STORAGE_READ_ONLY, + to: wgt::BufferUses::INDIRECT, + }, + }); + barriers.push(hal::BufferBarrier { + buffer: temp_buffer, + usage: hal::StateTransition { + from: wgt::BufferUses::STORAGE_READ_WRITE, + to: wgt::BufferUses::INDIRECT, + }, + }); + } + + unsafe { encoder.transition_buffers(&barriers) }; + } + + Ok(()) + } +} + +impl MultiDrawEmulationInner { + fn new( + device: &dyn hal::DynDevice, + instance_flags: wgt::InstanceFlags, + ) -> Result { + let src = include_str!("multi_draw_count_emulation.wgsl"); + + #[cfg(feature = "wgsl")] + let module = naga::front::wgsl::parse_str(src).map_err(|inner| { + CreateShaderModuleError::Parsing(naga::error::ShaderError { + source: src.to_string(), + label: None, + inner: Box::new(inner), + }) + })?; + + #[cfg(not(feature = "wgsl"))] + #[allow(clippy::diverging_sub_expression)] + let module = panic!("Multi-draw emulation requires the wgsl feature flag to be enabled!"); + + let info = crate::device::create_validator( + wgt::Features::IMMEDIATES, + wgt::DownlevelFlags::empty(), + naga::valid::ValidationFlags::all(), + ) + .validate(&module) + .map_err(|inner| { + CreateShaderModuleError::Validation(naga::error::ShaderError { + source: src.to_string(), + label: None, + inner: Box::new(inner), + }) + })?; + + let hal_shader = hal::ShaderInput::Naga(hal::NagaShader { + module: Cow::Owned(module), + info, + debug_source: None, + }); + let hal_desc = hal::ShaderModuleDescriptor { + label: hal_label( + Some("(wgpu internal) Multi-draw emulation shader module"), + instance_flags, + ), + runtime_checks: wgt::ShaderRuntimeChecks::unchecked(), + }; + let shader_module = + unsafe { device.create_shader_module(&hal_desc, hal_shader) }.map_err(|error| { + match error { + hal::ShaderError::Device(error) => { + CreateShaderModuleError::Device(DeviceError::from_hal(error)) + } + hal::ShaderError::Compilation(ref msg) => { + log::error!("Shader error: {msg}"); + CreateShaderModuleError::Generation + } + } + })?; + + let bgl_desc = hal::BindGroupLayoutDescriptor { + label: hal_label( + Some("(wgpu internal) Multi-draw emulation bind group layout"), + instance_flags, + ), + flags: hal::BindGroupLayoutFlags::empty(), + entries: &[ + wgt::BindGroupLayoutEntry { + binding: 0, + visibility: wgt::ShaderStages::COMPUTE, + ty: wgt::BindingType::Buffer { + ty: wgt::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: Some(NonZeroU64::new(4).unwrap()), + }, + count: None, + }, + wgt::BindGroupLayoutEntry { + binding: 1, + visibility: wgt::ShaderStages::COMPUTE, + ty: wgt::BindingType::Buffer { + ty: wgt::BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: Some(NonZeroU64::new(4).unwrap()), + }, + count: None, + }, + wgt::BindGroupLayoutEntry { + binding: 2, + visibility: wgt::ShaderStages::COMPUTE, + ty: wgt::BindingType::Buffer { + ty: wgt::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: Some(NonZeroU64::new(4).unwrap()), + }, + count: None, + }, + ], + }; + let bgl = unsafe { + device + .create_bind_group_layout(&bgl_desc) + .map_err(DeviceError::from_hal)? + }; + + let pipeline_layout_desc = hal::PipelineLayoutDescriptor { + label: hal_label( + Some("(wgpu internal) Multi-draw emulation pipeline layout"), + instance_flags, + ), + flags: hal::PipelineLayoutFlags::empty(), + bind_group_layouts: &[Some(bgl.as_ref())], + immediate_size: 16, + }; + let pipeline_layout = unsafe { + device + .create_pipeline_layout(&pipeline_layout_desc) + .map_err(DeviceError::from_hal)? + }; + + let pipeline_desc = hal::ComputePipelineDescriptor { + label: hal_label( + Some("(wgpu internal) Multi-draw emulation pipeline"), + instance_flags, + ), + layout: pipeline_layout.as_ref(), + stage: hal::ProgrammableStage { + module: shader_module.as_ref(), + entry_point: "main", + constants: &hashbrown::HashMap::new(), + zero_initialize_workgroup_memory: false, + }, + cache: None, + }; + let pipeline = + unsafe { device.create_compute_pipeline(&pipeline_desc) }.map_err(|err| match err { + hal::PipelineError::Device(error) => { + CreateComputePipelineError::Device(DeviceError::from_hal(error)) + } + hal::PipelineError::Linkage(_stages, msg) => { + CreateComputePipelineError::Internal(msg) + } + hal::PipelineError::EntryPoint(_stage) => CreateComputePipelineError::Internal( + crate::device::ENTRYPOINT_FAILURE_ERROR.to_string(), + ), + hal::PipelineError::PipelineConstants(_, error) => { + CreateComputePipelineError::PipelineConstants(error) + } + })?; + + Ok(Self { + module: shader_module, + bgl, + pipeline_layout, + pipeline, + temp_pool: Mutex::new(rank::MULTI_DRAW_EMULATION_TEMP_POOL, Vec::new()), + }) + } +} + +impl MultiDrawResources { + pub(crate) fn new(device: Arc) -> Self { + MultiDrawResources { + device, + temp_entries: Vec::new(), + bind_groups: Vec::new(), + } + } + + pub(crate) fn get_temp_buffer(&self, index: usize) -> &dyn hal::DynBuffer { + self.temp_entries.get(index).unwrap().buffer.as_ref() + } + + pub(crate) fn acquire_temp_entry( + &mut self, + size: u64, + instance_flags: wgt::InstanceFlags, + ) -> Result { + let emulation = self.device.multi_draw_emulation.as_ref().unwrap(); + let entry = emulation.acquire_temp_entry(self.device.raw(), size, instance_flags)?; + let index = self.temp_entries.len(); + self.temp_entries.push(entry); + Ok(index) + } +} + +impl Drop for MultiDrawResources { + fn drop(&mut self) { + let raw = self.device.raw(); + for bg in self.bind_groups.drain(..) { + unsafe { raw.destroy_bind_group(bg) }; + } + if let Some(emulation) = self.device.multi_draw_emulation.as_ref() { + emulation.release_temp_entries(self.temp_entries.drain(..)); + } + } +} diff --git a/wgpu-core/src/multi_draw_emulation/multi_draw_count_emulation.wgsl b/wgpu-core/src/multi_draw_emulation/multi_draw_count_emulation.wgsl new file mode 100644 index 00000000000..938c72132b2 --- /dev/null +++ b/wgpu-core/src/multi_draw_emulation/multi_draw_count_emulation.wgsl @@ -0,0 +1,29 @@ +struct Params { + max_count: u32, + stride: u32, + src_offset: u32, + count_offset: u32, +} + +var params: Params; + +@group(0) @binding(0) var src: array; +@group(0) @binding(1) var dst: array; +@group(0) @binding(2) var count_buf: array; + +@compute @workgroup_size(64) +fn main(@builtin(global_invocation_id) gid: vec3u) { + let count = min(count_buf[params.count_offset], params.max_count); + let src_base = params.src_offset + gid.x * params.stride; + let dst_base = gid.x * params.stride; + + if (gid.x < count) { + for (var i = 0u; i < params.stride; i = i + 1u) { + dst[dst_base + i] = src[src_base + i]; + } + } else if (gid.x < params.max_count) { + for (var i = 0u; i < params.stride; i = i + 1u) { + dst[dst_base + i] = 0u; + } + } +} diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs index 47cef826522..4e2a0419652 100644 --- a/wgpu-hal/src/metal/adapter.rs +++ b/wgpu-hal/src/metal/adapter.rs @@ -1170,6 +1170,7 @@ impl super::CapabilitiesQuery { features.set(F::FLOAT32_FILTERABLE, self.supports_float_filtering); features.set(F::FLOAT32_BLENDABLE, true); features.set(F::INDIRECT_FIRST_INSTANCE, self.indirect_draw_dispatch); + features.set(F::MULTI_DRAW_INDIRECT_COUNT, self.indirect_draw_dispatch); features.set( F::TIMESTAMP_QUERY | F::TIMESTAMP_QUERY_INSIDE_ENCODERS, self.timestamp_query_support diff --git a/wgpu-types/src/features.rs b/wgpu-types/src/features.rs index 63656c72961..a34f3cae5e5 100644 --- a/wgpu-types/src/features.rs +++ b/wgpu-types/src/features.rs @@ -869,13 +869,12 @@ bitflags_array! { const PARTIALLY_BOUND_BINDING_ARRAY = 1 << 13; /// Allows the user to call [`RenderPass::multi_draw_indirect_count`] and [`RenderPass::multi_draw_indexed_indirect_count`]. /// - /// This allows the use of a buffer containing the actual number of draw calls. This feature being present also implies - /// that all calls to [`RenderPass::multi_draw_indirect`] and [`RenderPass::multi_draw_indexed_indirect`] are not being emulated - /// with a series of `draw_indirect` calls. + /// This allows the use of a buffer containing the actual number of draw calls. /// /// Supported platforms: /// - DX12 /// - Vulkan 1.2+ (or VK_KHR_draw_indirect_count) + /// - Metal (emulated via compute shader) /// /// This is a native only feature. ///