-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcompute.go
281 lines (256 loc) · 11.1 KB
/
compute.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
// Copyright (c) 2022, Cogent Core. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package vgpu
import (
"embed"
"math"
vk "github.com/goki/vulkan"
)
// https://www.reddit.com/r/vulkan/comments/us9p72/multiple_command_buffers_vs_multiple_batches_of/
// NewComputePipelineEmbed creates a new Pipeline with the given name
// on this System, attaches the file fname from the embedded filesystem
// efs to it as a ComputeShader (the shader is registered under the same
// name as the pipeline), and returns the pipeline.
func (sy *System) NewComputePipelineEmbed(name string, efs embed.FS, fname string) *Pipeline {
	pipeline := sy.NewPipeline(name)
	pipeline.AddShaderEmbed(name, ComputeShader, efs, fname)
	return pipeline
}
// ComputeCmdBuff returns the System's default compute command buffer,
// which is CmdPool.Buff. It can be used for executing arbitrary
// compute commands.
func (sy *System) ComputeCmdBuff() vk.CommandBuffer {
	buff := sy.CmdPool.Buff
	return buff
}
// ComputeResetBindVars adds a command to the given command buffer
// that binds the Vars descriptors for the descriptor collection
// selected by descIndex (see Vars NDescs for info).
// This is required before running a command whenever variables
// have changed their mappings.
// It is a compute-named alias that forwards to CmdResetBindVars.
func (sy *System) ComputeResetBindVars(cmd vk.CommandBuffer, descIndex int) {
	sy.CmdResetBindVars(cmd, descIndex)
}
// ComputeResetBegin resets the given command buffer and begins
// recording commands into it, by calling CmdResetBegin.
// Use this before recording compute commands when there is no
// need to call ComputeResetBindVars first.
func (sy *System) ComputeResetBegin(cmd vk.CommandBuffer) {
	CmdResetBegin(cmd)
}
// Warps returns the number of warps (thread groups) that is sufficient
// to compute n elements, given the specified number of threads per group
// along this dimension. The result is n / threads rounded up to the next
// whole number: Ceil(n / threads).
//
// If n or threads is not positive, there is no meaningful group count,
// and 0 is returned. Without this guard, a zero threads value would divide
// by zero in floating point and convert Inf / NaN to int, which yields an
// implementation-dependent value in Go, and a negative n would produce a
// negative count that wraps to a huge value when converted to uint32
// for dispatch.
func Warps(n, threads int) int {
	if n <= 0 || threads <= 0 {
		return 0
	}
	return int(math.Ceil(float64(n) / float64(threads)))
}
// ComputeDispatch records, into the given command buffer, the commands
// that bind this pipeline at the compute bind point and dispatch its
// compute shader over nx, ny, nz *warps* (groups of threads) along the
// three dimensions, from which the indexes passed into the shader are
// generated.
//
// In HLSL, the [numthreads(x, y, z)] directive specifies the number
// of threads allocated per warp, so the actual number of elements
// processed is threads * warps along each dimension (see the Warps
// function). The hardware typically has 32 (NVIDIA, M1, M2) or
// 64 (AMD) hardware threads per warp, and so 64 is typically used as
// the default total number of threads per warp across all dimensions.
//
// To use only a subset of the dimensions, pass 1 for the others; see
// ComputeDispatch1D for a convenience method that automatically computes
// the number of warps for a 1D compute shader (everything in x).
//
// The command buffer must already be recording, via either a
// ComputeResetBindVars or a ComputeResetBegin call, and one of the
// ComputeSubmit* methods must be called afterwards to actually
// execute the commands.
// Each count is converted to uint32 as-is, with no range checking.
func (pl *Pipeline) ComputeDispatch(cmd vk.CommandBuffer, nx, ny, nz int) {
	groupsX, groupsY, groupsZ := uint32(nx), uint32(ny), uint32(nz)
	vk.CmdBindPipeline(cmd, vk.PipelineBindPointCompute, pl.VkPipeline)
	vk.CmdDispatch(cmd, groupsX, groupsY, groupsZ)
}
// ComputeDispatch1D records the commands to run the compute shader over
// n computational *elements* along the first (X) dimension, given the
// number of threads per warp (typically 64).
// It is a convenience method for the common 1D case: it computes the
// number of warps with the Warps function and passes that to
// ComputeDispatch, using 1 for the Y and Z dimensions.
// See ComputeDispatch for full info.
func (pl *Pipeline) ComputeDispatch1D(cmd vk.CommandBuffer, n, threads int) {
	groupsX := Warps(n, threads)
	pl.ComputeDispatch(cmd, groupsX, 1, 1)
}
// ComputeSetEvent records a command that sets the named event, with the
// compute-shader pipeline stage as its stage mask, so that it is signalled
// when everything up to this point in the given command buffer has
// completed.
// This is the best way to coordinate processing within a sequence of
// compute shader calls within a single command buffer.
// The event is looked up by name; if that lookup fails, its error is
// returned and nothing is recorded.
func (sy *System) ComputeSetEvent(cmd vk.CommandBuffer, event string) error {
	evt, err := sy.EventByNameTry(event)
	if err != nil {
		return err
	}
	stage := vk.PipelineStageFlags(vk.PipelineStageComputeShaderBit)
	vk.CmdSetEvent(cmd, evt, stage)
	return nil
}
// ComputeWaitEvents records a command that waits until the previously set
// ComputeSetEvent event(s), given by name, have signalled. Both the source
// and destination stage masks are the compute-shader pipeline stage, and
// no memory barriers are attached.
// This is the best way to coordinate processing within a sequence of
// compute shader calls within the given command buffer.
// However, use the ComputeWaitMem* calls (e.g., WriteRead) to ensure memory
// writes have completed, instead of creating an Event.
//
// If no event names are given, nothing is recorded and nil is returned:
// Vulkan requires the event count passed to vkCmdWaitEvents to be greater
// than zero, so recording an empty wait would be invalid API usage.
// Returns an error if any named event was not found, in which case
// nothing is recorded either.
func (sy *System) ComputeWaitEvents(cmd vk.CommandBuffer, event ...string) error {
	if len(event) == 0 {
		return nil
	}
	evts := make([]vk.Event, 0, len(event))
	for _, name := range event {
		ev, err := sy.EventByNameTry(name)
		if err != nil {
			return err
		}
		evts = append(evts, ev)
	}
	stage := vk.PipelineStageFlags(vk.PipelineStageComputeShaderBit)
	vk.CmdWaitEvents(cmd, uint32(len(evts)), evts, stage, stage, 0, nil, 0, nil, 0, nil)
	return nil
}
// ComputeCopyToGPU records, in the given command buffer, a command that
// copies the given regions of the Storage buffer memory from CPU to GPU,
// in one call. It forwards to Mem.CmdTransferStorageRegsToGPU.
// Use SyncRegValueIndexFromCPU to get the regions.
func (sy *System) ComputeCopyToGPU(cmd vk.CommandBuffer, regs ...MemReg) {
	regions := regs
	sy.Mem.CmdTransferStorageRegsToGPU(cmd, regions)
}
// ComputeCopyFromGPU records, in the given command buffer, a command that
// copies the given regions of the Storage buffer memory from GPU to CPU,
// in one call. It forwards to Mem.CmdTransferStorageRegsFromGPU.
// Use SyncRegValueIndexFromCPU to get the regions.
// NOTE(review): the helper named above is the ...FromCPU one, same as for
// ComputeCopyToGPU -- confirm whether a ...FromGPU variant is intended here.
func (sy *System) ComputeCopyFromGPU(cmd vk.CommandBuffer, regs ...MemReg) {
	regions := regs
	sy.Mem.CmdTransferStorageRegsFromGPU(cmd, regions)
}
// ComputeWaitMemWriteRead records a pipeline barrier, from the
// compute-shader stage to the compute-shader stage, with a single global
// memory barrier going from shader-write access to shader-read access.
// This ensures that global memory writes from the shader have completed
// and are ready to read in the next step of a command queue.
// Use this instead of Events to synchronize steps of a computation.
func (sy *System) ComputeWaitMemWriteRead(cmd vk.CommandBuffer) {
	compute := vk.PipelineStageFlags(vk.PipelineStageComputeShaderBit)
	barriers := []vk.MemoryBarrier{{
		SType:         vk.StructureTypeMemoryBarrier,
		SrcAccessMask: vk.AccessFlags(vk.AccessShaderWriteBit),
		DstAccessMask: vk.AccessFlags(vk.AccessShaderReadBit),
	}}
	vk.CmdPipelineBarrier(cmd, compute, compute, vk.DependencyFlags(0),
		1, barriers, 0, nil, 0, nil)
}
// ComputeWaitMemHostToShader records a pipeline barrier, from the host
// stage to the compute-shader stage, with a single global memory barrier
// going from host-write access to shader-read access.
// This ensures that global memory writes from the host to the shader
// have completed.
// Use this if the first commands are to copy memory from the host,
// instead of creating a separate Event.
func (sy *System) ComputeWaitMemHostToShader(cmd vk.CommandBuffer) {
	srcStage := vk.PipelineStageFlags(vk.PipelineStageHostBit)
	dstStage := vk.PipelineStageFlags(vk.PipelineStageComputeShaderBit)
	barriers := []vk.MemoryBarrier{{
		SType:         vk.StructureTypeMemoryBarrier,
		SrcAccessMask: vk.AccessFlags(vk.AccessHostWriteBit),
		DstAccessMask: vk.AccessFlags(vk.AccessShaderReadBit),
	}}
	vk.CmdPipelineBarrier(cmd, srcStage, dstStage, vk.DependencyFlags(0),
		1, barriers, 0, nil, 0, nil)
}
// ComputeWaitMemShaderToHost records a pipeline barrier, from the
// compute-shader stage to the host stage, with a single global memory
// barrier going from shader-write access to host-read access.
// This ensures that global memory writes from the compute shader have
// completed and are ready for the host to read.
// It is not necessary if a standard QueueWaitIdle is done at the end
// of a command (basically not really needed, but included for
// completeness).
func (sy *System) ComputeWaitMemShaderToHost(cmd vk.CommandBuffer) {
	srcStage := vk.PipelineStageFlags(vk.PipelineStageComputeShaderBit)
	dstStage := vk.PipelineStageFlags(vk.PipelineStageHostBit)
	barriers := []vk.MemoryBarrier{{
		SType:         vk.StructureTypeMemoryBarrier,
		SrcAccessMask: vk.AccessFlags(vk.AccessShaderWriteBit),
		DstAccessMask: vk.AccessFlags(vk.AccessHostReadBit),
	}}
	vk.CmdPipelineBarrier(cmd, srcStage, dstStage, vk.DependencyFlags(0),
		1, barriers, 0, nil, 0, nil)
}
// ComputeWaitMemoryBuff records a pipeline barrier, from the
// compute-shader stage to the host stage, with a single buffer memory
// barrier on the given buffer's device buffer (buff.Dev, for buff.Size
// bytes), going from shader-write access to host-read access.
// This ensures that the compute shader's memory writes to that buffer
// have completed and are ready for the host to read.
// Vulkan docs suggest that a global memory barrier is generally better
// to use (ComputeWaitMem*).
func (sy *System) ComputeWaitMemoryBuff(cmd vk.CommandBuffer, buff *MemBuff) {
	srcStage := vk.PipelineStageFlags(vk.PipelineStageComputeShaderBit)
	dstStage := vk.PipelineStageFlags(vk.PipelineStageHostBit)
	barriers := []vk.BufferMemoryBarrier{{
		SType:         vk.StructureTypeBufferMemoryBarrier,
		SrcAccessMask: vk.AccessFlags(vk.AccessShaderWriteBit),
		DstAccessMask: vk.AccessFlags(vk.AccessHostReadBit),
		Buffer:        buff.Dev,
		Size:          vk.DeviceSize(buff.Size),
	}}
	vk.CmdPipelineBarrier(cmd, srcStage, dstStage, vk.DependencyFlags(0),
		0, nil, 1, barriers, 0, nil)
}
// ComputeSubmitWait submits the commands recorded in the given command
// buffer (typically from ComputeDispatch, using the default system
// CmdPool buffer) to the System's device via CmdSubmitWait, and then
// waits for the commands to finish before returning control to the CPU.
// Results will be available immediately thereafter for retrieving back
// from the GPU.
func (sy *System) ComputeSubmitWait(cmd vk.CommandBuffer) {
	dev := &sy.Device
	CmdSubmitWait(cmd, dev)
}
// ComputeCmdEnd ends the recording of the given command buffer,
// by calling CmdEnd on it.
func (sy *System) ComputeCmdEnd(cmd vk.CommandBuffer) {
	CmdEnd(cmd)
}
// ComputeSubmitWaitSignal ends the given command buffer and submits it to
// the system device queue, with the given wait semaphore, the given signal
// semaphore to signal when done, and the given fence (all by name; use an
// empty string for no fence).
// This will cause the GPU to wait until the wait semaphore is
// signaled by a previous command with that semaphore as its signal.
// The optional fence is typically used at the end of a block of
// such commands, whenever the CPU needs to be sure the submitted GPU
// commands have completed. The fence must be named "ComputeWait" if the
// standard ComputeWait function is used to wait on it.
// Returns an error if a named semaphore or fence was not found; note that
// the command buffer has already been ended at that point, but nothing
// has been submitted.
func (sy *System) ComputeSubmitWaitSignal(cmd vk.CommandBuffer, wait, signal, fence string) error {
	CmdEnd(cmd)
	waitSem, err := sy.SemaphoreByNameTry(wait)
	if err != nil {
		return err
	}
	signalSem, err := sy.SemaphoreByNameTry(signal)
	if err != nil {
		return err
	}
	doneFence := vk.NullFence
	if fence != "" {
		if doneFence, err = sy.FenceByNameTry(fence); err != nil {
			return err
		}
	}
	CmdSubmitWaitSignal(cmd, &sy.Device, waitSem, signalSem, doneFence)
	return nil
}
// ComputeSubmitSignal ends the given command buffer and submits it to
// the system device queue, with the given signal semaphore to signal
// when done, and the given fence (both by name; use an empty string
// for no fence).
// The optional fence is typically used at the end of a block of
// such commands, whenever the CPU needs to be sure the submitted GPU
// commands have completed. The fence must be named "ComputeWait" if the
// standard ComputeWait function is used to wait on it.
// Returns an error if the named semaphore or fence was not found; note
// that the command buffer has already been ended at that point, but
// nothing has been submitted.
func (sy *System) ComputeSubmitSignal(cmd vk.CommandBuffer, signal, fence string) error {
	CmdEnd(cmd)
	signalSem, err := sy.SemaphoreByNameTry(signal)
	if err != nil {
		return err
	}
	doneFence := vk.NullFence
	if fence != "" {
		if doneFence, err = sy.FenceByNameTry(fence); err != nil {
			return err
		}
	}
	CmdSubmitSignal(cmd, &sy.Device, signalSem, doneFence)
	return nil
}
// ComputeWaitFence waits (with the maximum timeout) for the given fence,
// looked up by name, to be signalled, and then resets the fence so that
// it can be used again.
// Returns an error if the named fence was not found, or if the Vulkan
// wait or reset call reports a result other than success (e.g., a lost
// device); these results were previously discarded, which made a failed
// wait indistinguishable from a completed one.
// The fence is only reset after a successful wait.
func (sy *System) ComputeWaitFence(fence string) error {
	fc, err := sy.FenceByNameTry(fence)
	if err != nil {
		return err
	}
	fences := []vk.Fence{fc}
	if ret := vk.WaitForFences(sy.Device.Device, 1, fences, vk.True, vk.MaxUint64); ret != vk.Success {
		return vk.Error(ret)
	}
	if ret := vk.ResetFences(sy.Device.Device, 1, fences); ret != vk.Success {
		return vk.Error(ret)
	}
	return nil
}
// ComputeWait waits for the standard fence named "ComputeWait" and then
// resets it, by delegating to ComputeWaitFence.
// Returns an error if no fence with that name exists; previously the
// lookup error was discarded and the wait and reset were issued on an
// invalid fence while reporting success.
func (sy *System) ComputeWait() error {
	return sy.ComputeWaitFence("ComputeWait")
}