Devsh-Graphics-Programming
diff --git a/‎11_FFT/app_resources/shader.comp.hlsl
Lines changed: 6 additions & 6 deletions b/‎11_FFT/app_resources/shader.comp.hlsl
Lines changed: 6 additions & 6 deletions
diff --git a/‎23_ArithmeticUnitTest/CMakeLists.txt renamed to ‎23_Arithmetic2UnitTest/CMakeLists.txt b/‎23_ArithmeticUnitTest/CMakeLists.txt renamed to ‎23_Arithmetic2UnitTest/CMakeLists.txt
diff --git a/‎23_ArithmeticUnitTest/app_resources/common.hlsl renamed to ‎23_Arithmetic2UnitTest/app_resources/common.hlsl
Lines changed: 7 additions & 7 deletions b/‎23_ArithmeticUnitTest/app_resources/common.hlsl renamed to ‎23_Arithmetic2UnitTest/app_resources/common.hlsl
Lines changed: 7 additions & 7 deletions
diff --git a/‎23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
Lines changed: 19 additions & 0 deletions b/‎23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl
Lines changed: 19 additions & 0 deletions
diff --git a/‎23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
Lines changed: 55 additions & 0 deletions b/‎23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
Lines changed: 55 additions & 0 deletions
diff --git a/‎23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
Lines changed: 74 additions & 0 deletions b/‎23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
Lines changed: 74 additions & 0 deletions
diff --git a/‎23_ArithmeticUnitTest/config.json.template renamed to ‎23_Arithmetic2UnitTest/config.json.template b/‎23_ArithmeticUnitTest/config.json.template renamed to ‎23_Arithmetic2UnitTest/config.json.template
@@ -14,13 +14,13 @@ uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParamete
 
 struct SharedMemoryAccessor 
 {
-	template <typename IndexType, typename AccessType>
+	template <typename AccessType, typename IndexType>
 	void set(IndexType idx, AccessType value)
 	{
 		sharedmem[idx] = value;
 	}
 
-	template <typename IndexType, typename AccessType>
+	template <typename AccessType, typename IndexType>
 	void get(IndexType idx, NBL_REF_ARG(AccessType) value)
 	{
 		value = sharedmem[idx];
@@ -44,14 +44,14 @@ struct Accessor
     }
 
 	// TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with
-	template <typename AccessType>
-	void get(const uint32_t index, NBL_REF_ARG(AccessType) value)
+	template <typename AccessType, typename IndexType>
+	void get(const IndexType index, NBL_REF_ARG(AccessType) value)
 	{
 		value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));
 	}
 
-	template <typename AccessType>
-	void set(const uint32_t index, const AccessType value)
+	template <typename AccessType, typename IndexType>
+	void set(const IndexType index, const AccessType value)
 	{
 		vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);
 	}
 
@@ -1,15 +1,14 @@
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/functional.hlsl"
 
-template<uint32_t kScanElementCount=1024*1024>
-struct Output
+struct PushConstantData
 {
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount;
-
-	uint32_t subgroupSize;
-	uint32_t data[ScanElementCount];
+    uint64_t pInputBuf;
+    uint64_t pOutputBuf[8];
 };
 
+namespace arithmetic
+{
 // Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code
 template<typename T>
 struct bit_and : nbl::hlsl::bit_and<T>
@@ -92,5 +91,6 @@ struct ballot : nbl::hlsl::plus<T>
 	static inline constexpr const char* name = "bitcount";
 #endif
 };
+}
 
-#include "nbl/builtin/hlsl/subgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
@@ -0,0 +1,19 @@
+#include "common.hlsl"
+
+using namespace nbl;
+using namespace hlsl;
+
+[[vk::push_constant]] PushConstantData pc;
+
+struct device_capabilities
+{
+#ifdef TEST_NATIVE
+    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
+#else
+    NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false;
+#endif
+};
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
@@ -0,0 +1,55 @@
+#pragma shader_stage(compute)
+
+#define operation_t nbl::hlsl::OPERATION
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
+
+#include "shaderCommon.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
+
+template<class Binop, class device_capabilities>
+using params_t = SUBGROUP_CONFIG_T;
+
+typedef vector<uint32_t, params_t<typename arithmetic::bit_and<uint32_t>::base_t, device_capabilities>::ItemsPerInvocation> type_t;
+
+uint32_t globalIndex()
+{
+    return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
+}
+
+template<class Binop>
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
+
+    assert(glsl::gl_SubgroupSize() == params_t<typename Binop::base_t, device_capabilities>::config_t::Size)
+
+    operation_t<params_t<typename Binop::base_t, device_capabilities> > func;
+    type_t val = func(sourceVal);
+
+    vk::RawBufferStore<type_t>(outputBufAddr + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
+}
+
+type_t test()
+{
+    const uint32_t idx = globalIndex();
+    type_t sourceVal = vk::RawBufferLoad<type_t>(pc.pInputBuf + idx * sizeof(type_t));
+
+    subtest<arithmetic::bit_and<uint32_t> >(sourceVal);
+    subtest<arithmetic::bit_xor<uint32_t> >(sourceVal);
+    subtest<arithmetic::bit_or<uint32_t> >(sourceVal);
+    subtest<arithmetic::plus<uint32_t> >(sourceVal);
+    subtest<arithmetic::multiplies<uint32_t> >(sourceVal);
+    subtest<arithmetic::minimum<uint32_t> >(sourceVal);
+    subtest<arithmetic::maximum<uint32_t> >(sourceVal);
+    return sourceVal;
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+    test();
+}
@@ -0,0 +1,74 @@
+#pragma shader_stage(compute)
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+using config_t = WORKGROUP_CONFIG_T;
+
+#include "shaderCommon.hlsl"
+
+typedef vector<uint32_t, config_t::ItemsPerInvocation_0> type_t;
+
+// final (level 1/2) scan needs to fit in one subgroup exactly
+groupshared uint32_t scratch[mpl::max_v<int16_t,config_t::SharedScratchElementCount,1>];
+
+#include "../../common/include/WorkgroupDataAccessors.hlsl"
+
+static ScratchProxy arithmeticAccessor;
+
+template<class Binop, class device_capabilities>
+struct operation_t
+{
+    using binop_base_t = typename Binop::base_t;
+    using otype_t = typename Binop::type_t;
+
+    // workgroup reduction returns the value of the reduction
+        // workgroup scans do not return anything, but use the data accessor to do the storing directly
+    void operator()()
+    {
+        using data_proxy_t = PreloadedDataProxy<config_t::WorkgroupSizeLog2,config_t::VirtualWorkgroupSize,config_t::ItemsPerInvocation_0>;
+        data_proxy_t dataAccessor = data_proxy_t::create(pc.pInputBuf, pc.pOutputBuf[Binop::BindingIndex]);
+        dataAccessor.preload();
+#if IS_REDUCTION
+        otype_t value =
+#endif
+        OPERATION<config_t,binop_base_t,device_capabilities>::template __call<data_proxy_t, ScratchProxy>(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+#if IS_REDUCTION
+        [unroll]
+        for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++)
+            dataAccessor.preloaded[i] = value;
+#endif
+        dataAccessor.unload();
+    }
+};
+
+
+template<class Binop>
+static void subtest()
+{
+    assert(glsl::gl_SubgroupSize() == config_t::SubgroupSize)
+
+    operation_t<Binop,device_capabilities> func;
+    func();
+}
+
+void test()
+{
+    subtest<arithmetic::bit_and<uint32_t> >();
+    subtest<arithmetic::bit_xor<uint32_t> >();
+    subtest<arithmetic::bit_or<uint32_t> >();
+    subtest<arithmetic::plus<uint32_t> >();
+    subtest<arithmetic::multiplies<uint32_t> >();
+    subtest<arithmetic::minimum<uint32_t> >();
+    subtest<arithmetic::maximum<uint32_t> >();
+}
+
+[numthreads(config_t::WorkgroupSize,1,1)]
+void main()
+{
+    test();
+}
Original file line number	Diff line number	Diff line change
`@@ -14,13 +14,13 @@ uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParamete`
`14`	`14`
`15`	`15`	`struct SharedMemoryAccessor`
`16`	`16`	`{`
`17`		`- template <typename IndexType, typename AccessType>`
	`17`	`+ template <typename AccessType, typename IndexType>`
`18`	`18`	`void set(IndexType idx, AccessType value)`
`19`	`19`	`{`
`20`	`20`	`sharedmem[idx] = value;`
`21`	`21`	`}`
`22`	`22`
`23`		`- template <typename IndexType, typename AccessType>`
	`23`	`+ template <typename AccessType, typename IndexType>`
`24`	`24`	`void get(IndexType idx, NBL_REF_ARG(AccessType) value)`
`25`	`25`	`{`
`26`	`26`	`value = sharedmem[idx];`
`@@ -44,14 +44,14 @@ struct Accessor`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`` // TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with ``
`47`		`- template <typename AccessType>`
`48`		`- void get(const uint32_t index, NBL_REF_ARG(AccessType) value)`
	`47`	`+ template <typename AccessType, typename IndexType>`
	`48`	`+ void get(const IndexType index, NBL_REF_ARG(AccessType) value)`
`49`	`49`	`{`
`50`	`50`	`value = vk::RawBufferLoad<AccessType>(address + index * sizeof(AccessType));`
`51`	`51`	`}`
`52`	`52`
`53`		`- template <typename AccessType>`
`54`		`- void set(const uint32_t index, const AccessType value)`
	`53`	`+ template <typename AccessType, typename IndexType>`
	`54`	`+ void set(const IndexType index, const AccessType value)`
`55`	`55`	`{`
`56`	`56`	`vk::RawBufferStore<AccessType>(address + index * sizeof(AccessType), value);`
`57`	`57`	`}`