-
Notifications
You must be signed in to change notification settings - Fork 13
Unit tests and benchmark for subgroup2 and workgroup2 stuff #192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
8090a2d
3a2ff14
dd021a0
ca21941
0bb41db
24a93bb
17dda8e
3d4e0f2
0192999
8c9d55e
07d6980
be756d5
1963b51
99cf5d8
1d5e433
a3bb526
355c605
6b57674
7da1bec
750b3d2
f11b3df
9f690ee
755f89a
b8415ad
474281d
7d06332
874557c
28ea75f
e8c2831
93b4d0b
2ba2b82
d567e71
54acf2a
030d622
ca71a39
6018e9a
3a9758c
e496e98
20011f5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
#include "common.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup/basic.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" | ||
|
||
// Shader-side definition of the `nbl::hlsl::glsl::gl_WorkGroupSize()` compat function: | ||
// reports a 1D workgroup of `WORKGROUP_SIZE` invocations (y and z extents are 1). | ||
// Presumably it has to be defined by the shader because of the DXC issue linked below. | ||
// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 | ||
uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} | ||
|
||
#ifndef ITEMS_PER_INVOCATION | ||
#error "Define ITEMS_PER_INVOCATION!" | ||
#endif | ||
|
||
typedef vector<uint32_t, ITEMS_PER_INVOCATION> type_t; | ||
|
||
// unfortunately DXC chokes on descriptors as static members | ||
// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 | ||
[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue; | ||
[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; | ||
Comment on lines
+19
to
+22
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you make the test use BDA so it's simpler (no descriptor sets, just BDA sent via push constants)? |
||
|
||
// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way | ||
uint32_t globalIndex(); | ||
// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs | ||
bool canStore(); | ||
|
||
#ifndef OPERATION | ||
#error "Define OPERATION!" | ||
#endif | ||
|
||
#ifndef SUBGROUP_SIZE_LOG2 | ||
#error "Define SUBGROUP_SIZE_LOG2!" | ||
#endif | ||
// Runs one subgroup arithmetic subtest for the operator `binop<T>` and records the result. | ||
// | ||
// Template parameters: | ||
//   binop - operator template; `binop<T>::BindingIndex` selects the `output` buffer and | ||
//           `binop<T>::base_t` is the operator handed to `ArithmeticParams` | ||
//   T     - scalar element type the operator is instantiated with | ||
//   N     - items per invocation forwarded to `ArithmeticParams` | ||
// Parameter: | ||
//   sourceVal - this invocation's input vector, passed to the operation functor | ||
// | ||
// Output buffer layout written here: a `uint32_t` subgroup size at byte offset 0 (written only by | ||
// the invocation whose globalIndex() is 0), followed by one `type_t` per invocation at | ||
// sizeof(uint32_t)+sizeof(type_t)*globalIndex(). | ||
// `operation_t` is not defined in this file; it is expected to come from the including shader. | ||
// NOTE(review): `func(sourceVal)` is evaluated inside the `canStore()` branch, so invocations for | ||
// which canStore() is false never execute the subgroup operation - confirm this is intended. | ||
template<template<class> class binop, typename T, uint32_t N> | ||
static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) | ||
Comment on lines
+36
to
+37
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. grab a binop thats already instantiated (use
|
||
{ | ||
// TODO static assert vector<T, N> == type_t | ||
//using type_t = vector<T, N>; | ||
using config_t = nbl::hlsl::subgroup2::Configuration<SUBGROUP_SIZE_LOG2>; | ||
using params_t = nbl::hlsl::subgroup2::ArithmeticParams<config_t, typename binop<T>::base_t, N, nbl::hlsl::jit::device_capabilities>; | ||
|
||
// a single invocation records the subgroup size in the header slot of this operator's buffer | ||
if (globalIndex()==0u) | ||
output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize()); | ||
|
||
operation_t<params_t> func; | ||
// results are stored after the 4-byte header, one `type_t` slot per global index | ||
if (canStore()) | ||
output[binop<T>::BindingIndex].template Store<type_t>(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); | ||
} | ||
|
||
|
||
// Fetches this invocation's input vector and runs every operator subtest on it, in a fixed order. | ||
// Returns the fetched input; the subtests receive it as a const reference and do not modify it. | ||
type_t test() | ||
{ | ||
const type_t loaded = inputValue[globalIndex()]; | ||
|
||
// bitwise operators | ||
subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
// arithmetic operators | ||
subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
// min / max | ||
subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
return loaded; | ||
} | ||
|
||
#include "nbl/builtin/hlsl/workgroup/basic.hlsl" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#pragma shader_stage(compute) | ||
|
||
#include "workgroupCommon.hlsl" | ||
|
||
// Data accessor passed to the workgroup OPERATION: reads inputs from `inputValue` and writes | ||
// results into the output buffer selected by `Binop::BindingIndex`. | ||
// | ||
// Template parameters: | ||
//   Config - workgroup configuration; only `Config::ItemsPerInvocation_0` is used here | ||
//   Binop  - operator type; only `Binop::BindingIndex` is used here | ||
// | ||
// NOTE(review): neither get() nor set() adds a per-workgroup offset or checks canStore(); | ||
// whether `ix` is already a global index depends on the caller - confirm for multi-workgroup dispatches. | ||
template<class Config, class Binop> | ||
struct DataProxy | ||
{ | ||
// element type exchanged with the algorithm; must be the same type as the file-level `type_t` | ||
using dtype_t = vector<uint32_t, Config::ItemsPerInvocation_0>; | ||
static_assert(nbl::hlsl::is_same_v<dtype_t, type_t>); | ||
|
||
// Loads element `ix` of the input buffer into `value`. | ||
void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) | ||
{ | ||
value = inputValue[ix]; | ||
} | ||
// Stores `value` as element `ix` of the output buffer, after the leading `uint32_t` header slot | ||
// (the slot subtest() fills with the subgroup size). | ||
void set(const uint32_t ix, const dtype_t value) | ||
{ | ||
output[Binop::BindingIndex].template Store<type_t>(sizeof(uint32_t) + sizeof(type_t) * ix, value); | ||
} | ||
|
||
// Barrier hook for the algorithm; implemented as a plain `glsl::barrier()`. | ||
void workgroupExecutionAndMemoryBarrier() | ||
{ | ||
nbl::hlsl::glsl::barrier(); | ||
//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above | ||
} | ||
}; | ||
|
||
static ScratchProxy arithmeticAccessor; | ||
|
||
// Functor that runs the workgroup algorithm selected by the `OPERATION` macro for one operator. | ||
// | ||
// Template parameters: | ||
//   Binop               - operator type; supplies `base_t`, `type_t` and (via DataProxy) `BindingIndex` | ||
//   device_capabilities - capability traits forwarded to the algorithm | ||
template<class Binop, class device_capabilities> | ||
struct operation_t | ||
{ | ||
using binop_base_t = typename Binop::base_t; | ||
// not referenced inside this struct | ||
using otype_t = typename Binop::type_t; | ||
|
||
// Executes the operation: inputs and outputs go through a fresh DataProxy, scratch memory | ||
// goes through the file-level `arithmeticAccessor`. | ||
void operator()() | ||
{ | ||
DataProxy<config_t,Binop> dataAccessor; | ||
nbl::hlsl::OPERATION<config_t,binop_base_t,device_capabilities>::template __call<DataProxy<config_t,Binop>, ScratchProxy>(dataAccessor,arithmeticAccessor); | ||
// barrier after the call, i.e. before the next Binop runs, because every Binop aliases the same scratch accessor | ||
arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); | ||
} | ||
}; | ||
|
||
|
||
// Runs one workgroup arithmetic subtest for the operator `binop<T>`. | ||
// | ||
// The invocation whose globalIndex() is 0 writes the subgroup size as a `uint32_t` at byte | ||
// offset 0 of `output[binop<T>::BindingIndex]`; the operation's results are written by | ||
// DataProxy::set() inside `operation_t`, not here. | ||
// `sourceVal` and `N` are not used in this body; inputs are fetched through the data accessor. | ||
template<template<class> class binop, typename T, uint32_t N> | ||
static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) | ||
{ | ||
if (globalIndex()==0u) | ||
output[binop<T>::BindingIndex].template Store<uint32_t>(0,nbl::hlsl::glsl::gl_SubgroupSize()); | ||
|
||
operation_t<binop<T>,nbl::hlsl::jit::device_capabilities> func; | ||
func(); // store is done with data accessor now | ||
} | ||
|
||
|
||
// Runs every operator subtest once, in a fixed order, and returns this invocation's input vector. | ||
// The value is still passed to each subtest to satisfy its signature. | ||
type_t test() | ||
{ | ||
const type_t loaded = inputValue[globalIndex()]; | ||
|
||
// bitwise operators | ||
subtest<bit_and, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<bit_xor, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<bit_or, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
// arithmetic operators | ||
subtest<plus, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<multiplies, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
// min / max | ||
subtest<minimum, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
subtest<maximum, uint32_t, ITEMS_PER_INVOCATION>(loaded); | ||
return loaded; | ||
} | ||
|
||
|
||
// Index of this invocation's element across the dispatch: the workgroup's base | ||
// (workgroup ID x times ITEMS_PER_WG) plus the subgroup-contiguous index inside the workgroup. | ||
uint32_t globalIndex() | ||
{ | ||
const uint32_t workgroupBase = nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG; | ||
const uint32_t indexInWorkgroup = nbl::hlsl::workgroup::SubgroupContiguousIndex(); | ||
return workgroupBase+indexInWorkgroup; | ||
} | ||
|
||
// True when this invocation's subgroup-contiguous index falls inside the workgroup's | ||
// ITEMS_PER_WG item range. | ||
bool canStore() | ||
{ | ||
const uint32_t indexInWorkgroup = nbl::hlsl::workgroup::SubgroupContiguousIndex(); | ||
return indexInWorkgroup<ITEMS_PER_WG; | ||
} | ||
|
||
// Compute entry point: each invocation runs the full list of subtests through test(). | ||
[numthreads(WORKGROUP_SIZE,1,1)] | ||
void main() | ||
{ | ||
// the value returned by test() is not needed here | ||
test(); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup/basic.hlsl" | ||
#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" | ||
|
||
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" | ||
|
||
#include "common.hlsl" | ||
|
||
static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; | ||
|
||
// Shader-side definition of the `nbl::hlsl::glsl::gl_WorkGroupSize()` compat function: | ||
// reports a 1D workgroup of `WORKGROUP_SIZE` (= 1u << WORKGROUP_SIZE_LOG2) invocations. | ||
// Presumably it has to be defined by the shader because of the DXC issue linked below. | ||
// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 | ||
uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} | ||
|
||
#ifndef ITEMS_PER_INVOCATION | ||
#error "Define ITEMS_PER_INVOCATION!" | ||
#endif | ||
|
||
using config_t = nbl::hlsl::workgroup2::Configuration<WORKGROUP_SIZE_LOG2, SUBGROUP_SIZE_LOG2, ITEMS_PER_INVOCATION>; | ||
|
||
typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t; | ||
|
||
// unfortunately DXC chokes on descriptors as static members | ||
// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 | ||
[[vk::binding(0, 0)]] StructuredBuffer<type_t> inputValue; | ||
[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; | ||
|
||
// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way | ||
uint32_t globalIndex(); | ||
// since we test ITEMS_PER_WG<WorkgroupSize we need this so workgroups don't overwrite each other's outputs | ||
bool canStore(); | ||
|
||
#ifndef OPERATION | ||
#error "Define OPERATION!" | ||
#endif | ||
#ifndef SUBGROUP_SIZE_LOG2 | ||
#error "Define SUBGROUP_SIZE_LOG2!" | ||
#endif | ||
|
||
// final (level 1/2) scan needs to fit in one subgroup exactly | ||
groupshared uint32_t scratch[config_t::SubgroupsPerVirtualWorkgroup*config_t::ItemsPerInvocation_1]; | ||
|
||
// Accessor over the groupshared `scratch` array, handed to the workgroup algorithm as its | ||
// scratch-memory interface. All indices are element indices into `scratch`. | ||
struct ScratchProxy | ||
{ | ||
// Reads scratch[ix] into `value`. | ||
void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) | ||
{ | ||
value = scratch[ix]; | ||
} | ||
// Writes `value` to scratch[ix]. | ||
void set(const uint32_t ix, const uint32_t value) | ||
{ | ||
scratch[ix] = value; | ||
} | ||
|
||
// Atomically ORs `value` into scratch[ix]; returns whatever `glsl::atomicOr` returns. | ||
uint32_t atomicOr(const uint32_t ix, const uint32_t value) | ||
{ | ||
return nbl::hlsl::glsl::atomicOr(scratch[ix],value); | ||
} | ||
|
||
// Barrier hook for the algorithm; implemented as a plain `glsl::barrier()`. | ||
void workgroupExecutionAndMemoryBarrier() | ||
{ | ||
nbl::hlsl::glsl::barrier(); | ||
//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above | ||
} | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't include non-`subgroup2` headers.