diff --git a/11_FFT/app_resources/shader.comp.hlsl b/11_FFT/app_resources/shader.comp.hlsl index ecbf4f092..63a85b0c4 100644 --- a/11_FFT/app_resources/shader.comp.hlsl +++ b/11_FFT/app_resources/shader.comp.hlsl @@ -14,13 +14,13 @@ uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParamete struct SharedMemoryAccessor { - template + template void set(IndexType idx, AccessType value) { sharedmem[idx] = value; } - template + template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = sharedmem[idx]; @@ -44,14 +44,14 @@ struct Accessor } // TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with - template - void get(const uint32_t index, NBL_REF_ARG(AccessType) value) + template + void get(const IndexType index, NBL_REF_ARG(AccessType) value) { value = vk::RawBufferLoad(address + index * sizeof(AccessType)); } - template - void set(const uint32_t index, const AccessType value) + template + void set(const IndexType index, const AccessType value) { vk::RawBufferStore(address + index * sizeof(AccessType), value); } diff --git a/23_ArithmeticUnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt similarity index 100% rename from 23_ArithmeticUnitTest/CMakeLists.txt rename to 23_Arithmetic2UnitTest/CMakeLists.txt diff --git a/23_ArithmeticUnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl similarity index 89% rename from 23_ArithmeticUnitTest/app_resources/common.hlsl rename to 23_Arithmetic2UnitTest/app_resources/common.hlsl index 10892a2b9..6654645cf 100644 --- a/23_ArithmeticUnitTest/app_resources/common.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl @@ -1,15 +1,14 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/functional.hlsl" -template -struct Output +struct PushConstantData { - NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; - - uint32_t subgroupSize; - uint32_t data[ScanElementCount]; + uint64_t pInputBuf; + uint64_t pOutputBuf[8]; }; +namespace arithmetic +{ // Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code template struct bit_and : nbl::hlsl::bit_and @@ -92,5 +91,6 @@ struct ballot : nbl::hlsl::plus static inline constexpr const char* name = "bitcount"; #endif }; +} -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl new file mode 100644 index 000000000..3793b08f8 --- /dev/null +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -0,0 +1,19 @@ +#include "common.hlsl" + +using namespace nbl; +using namespace hlsl; + +[[vk::push_constant]] PushConstantData pc; + +struct device_capabilities +{ +#ifdef TEST_NATIVE + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true; +#else + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false; +#endif +}; + +#ifndef OPERATION +#error "Define OPERATION!" 
+#endif
diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
new file mode 100644
index 000000000..3105aec56
--- /dev/null
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -0,0 +1,55 @@
+#pragma shader_stage(compute)
+
+#define operation_t nbl::hlsl::OPERATION
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
+
+#include "shaderCommon.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
+
+template
+using params_t = SUBGROUP_CONFIG_T;
+
+typedef vector::base_t, device_capabilities>::ItemsPerInvocation> type_t;
+
+uint32_t globalIndex()
+{
+    return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
+}
+
+template
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
+
+    assert(glsl::gl_SubgroupSize() == params_t::config_t::Size);
+
+    operation_t > func;
+    type_t val = func(sourceVal);
+
+    vk::RawBufferStore(outputBufAddr + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
+}
+
+type_t test()
+{
+    const uint32_t idx = globalIndex();
+    type_t sourceVal = vk::RawBufferLoad(pc.pInputBuf + idx * sizeof(type_t));
+
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    return sourceVal;
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+    test();
+}
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
new file mode 100644
index 000000000..2a32ed20e
--- /dev/null
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -0,0 +1,74 @@
+#pragma shader_stage(compute)
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+using config_t = WORKGROUP_CONFIG_T;
+
+#include "shaderCommon.hlsl"
+
+typedef vector type_t;
+
+// final (level 1/2) scan needs to fit in one subgroup exactly
+groupshared uint32_t scratch[mpl::max_v];
+
+#include "../../common/include/WorkgroupDataAccessors.hlsl"
+
+static ScratchProxy arithmeticAccessor;
+
+template
+struct operation_t
+{
+    using binop_base_t = typename Binop::base_t;
+    using otype_t = typename Binop::type_t;
+
+    // workgroup reduction returns the value of the reduction
+    // workgroup scans do not return anything, but use the data accessor to do the storing directly
+    void operator()()
+    {
+        using data_proxy_t = PreloadedDataProxy;
+        data_proxy_t dataAccessor = data_proxy_t::create(pc.pInputBuf, pc.pOutputBuf[Binop::BindingIndex]);
+        dataAccessor.preload();
+#if IS_REDUCTION
+        otype_t value =
+#endif
+        OPERATION::template __call(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+#if IS_REDUCTION
+        [unroll]
+        for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++)
+            dataAccessor.preloaded[i] = value;
+#endif
+        dataAccessor.unload();
+    }
+};
+
+
+template
+static void subtest()
+{
+    assert(glsl::gl_SubgroupSize() == config_t::SubgroupSize);
+
+    operation_t func;
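Note on the accessor changes in this patch (the FFT shared-memory accessor, the BDA `Accessor`, and the preloaded accessors all get the same treatment): `get`/`set` are now templated on an `IndexType` in addition to the `AccessType`, so `workgroup::FFT` and the `subgroup2`/`workgroup2` primitives can index with whatever integer width they instantiate, without narrowing at the call site. A minimal CPU-side analogue of that contract, with `memcpy` standing in for `vk::RawBufferLoad`/`vk::RawBufferStore` (the struct name is hypothetical, not a Nabla type):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Minimal CPU analogue of the accessor contract: get/set are templated on BOTH
// the value type and the index type, so a caller may index with uint16_t,
// uint32_t or uint64_t and the offset math still happens in the index's type.
struct BdaLikeAccessor // hypothetical name
{
    template<typename AccessType, typename IndexType>
    void get(const IndexType index, AccessType& value) const
    {
        std::memcpy(&value, bytes.data() + index * sizeof(AccessType), sizeof(AccessType));
    }
    template<typename AccessType, typename IndexType>
    void set(const IndexType index, const AccessType value)
    {
        std::memcpy(bytes.data() + index * sizeof(AccessType), &value, sizeof(AccessType));
    }
    std::vector<std::byte> bytes;
};

int main()
{
    BdaLikeAccessor a{std::vector<std::byte>(64)};
    a.set(uint16_t(3), 42u); // 16-bit index, 32-bit payload
    uint32_t v = 0;
    a.get(uint16_t(3), v);   // v == 42
}
```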
+ func(); +} + +void test() +{ + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); +} + +[numthreads(config_t::WorkgroupSize,1,1)] +void main() +{ + test(); +} \ No newline at end of file diff --git a/23_ArithmeticUnitTest/config.json.template b/23_Arithmetic2UnitTest/config.json.template similarity index 100% rename from 23_ArithmeticUnitTest/config.json.template rename to 23_Arithmetic2UnitTest/config.json.template diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp new file mode 100644 index 000000000..65ef126ad --- /dev/null +++ b/23_Arithmetic2UnitTest/main.cpp @@ -0,0 +1,505 @@ +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +// method emulations on the CPU, to verify the results of the GPU methods +template +struct emulatedReduction +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); + std::fill(out,out+itemCount,red); + } + + static inline constexpr const char* name = "reduction"; +}; +template +struct emulatedScanInclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::inclusive_scan(in,in+itemCount,out,Binop()); + } + static inline constexpr const char* name = "inclusive_scan"; +}; +template +struct emulatedScanExclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); + } + static inline constexpr const char* name = "exclusive_scan"; +}; + +class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + +public: + Workgroup2ScanTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + transferDownQueue = getTransferDownQueue(); + computeQueue = getComputeQueue(); + + // TODO: get the element count from argv + const uint32_t elementCount = 1024 * 1024; + // populate our random data buffer on the CPU and create a GPU copy + inputData = new uint32_t[elementCount]; + smart_refctd_ptr gpuinputDataBuffer; + { + std::mt19937 randGenerator(0xdeadbeefu); + for (uint32_t i = 0u; i < elementCount; i++) + inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all + + IGPUBuffer::SCreationParams 
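The `emulatedReduction` / `emulatedScanInclusive` / `emulatedScanExclusive` helpers above are deliberately thin wrappers over the STL algorithms, so the GPU output is checked against well-specified reference behaviour. A self-contained illustration of what each reference produces for a small input (`plus` with identity `0` here; the identity and operator vary per `Binop`):

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>

int main()
{
    const uint32_t in[4] = {1, 2, 3, 4};
    uint32_t incl[4], excl[4];
    // inclusive_scan: out[i] = in[0] + ... + in[i]              -> 1 3 6 10
    std::inclusive_scan(in, in + 4, incl, std::plus<uint32_t>());
    // exclusive_scan: out[i] = identity + in[0] + ... + in[i-1] -> 0 1 3 6
    std::exclusive_scan(in, in + 4, excl, 0u, std::plus<uint32_t>());
    // reduce: one value, which emulatedReduction broadcasts to every slot -> 10
    const uint32_t red = std::reduce(in, in + 4, 0u, std::plus<uint32_t>());
    std::printf("%u %u %u %u | %u %u %u %u | %u\n",
        incl[0], incl[1], incl[2], incl[3],
        excl[0], excl[1], excl[2], excl[3], red);
}
```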
inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{.queue=getTransferUpQueue()}, + std::move(inputDataBufferCreationParams), + inputData + ).move_into(gpuinputDataBuffer); + } + + // create 8 buffers for 8 operations + for (auto i=0u; igetSize(); + params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + outputBuffers[i] = m_device->createBuffer(std::move(params)); + auto mreq = outputBuffers[i]->getMemoryReqs(); + mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + assert(mreq.memoryTypeBits); + + auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + assert(bufferMem.isValid()); + } + pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress(); + for (uint32_t i = 0; i < OutputBufferCount; i++) + pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress(); + + // create Pipeline Layout + { + SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) }; + pipelineLayout = m_device->createPipelineLayout({&pcRange, 1}); + } + + const auto spirv_isa_cache_path = localOutputCWD / "spirv_isa_cache.bin"; + // enclose to make sure file goes out of scope and we can reopen it + { + smart_refctd_ptr spirv_isa_cache_input; + // try to load SPIR-V to ISA cache + { + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_READ | IFile::ECF_MAPPABLE | IFile::ECF_COHERENT); + if (auto lock = fileCreate.acquire()) + spirv_isa_cache_input = *lock; + } + // create the cache + { + std::span spirv_isa_cache_data = {}; + if (spirv_isa_cache_input) + spirv_isa_cache_data = { reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize() }; + else + m_logger->log("Failed to load SPIR-V 2 ISA cache!", ILogger::ELL_PERFORMANCE); + // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead + m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); + } + } + { + // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? + m_system->deleteDirectory(spirv_isa_cache_path); + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_WRITE); + // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
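The buffer setup above is what makes the `PushConstantData` change possible: `EUF_SHADER_DEVICE_ADDRESS_BIT` at buffer creation, `EMAF_DEVICE_ADDRESS_BIT` at allocation, then `getDeviceAddress()` written straight into the push constants, replacing the descriptor sets the old test used. A CPU model of the resulting I/O scheme, with host pointers standing in for device addresses (assumes a 64-bit host; illustrative only, not Nabla API):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Raw 64-bit "addresses" travel to the shader in a push-constant struct and
// are dereferenced with RawBufferLoad/Store-style accesses; no descriptors.
struct PushConstantData
{
    uint64_t pInputBuf;
    uint64_t pOutputBuf[8];
};

template<typename T>
T rawBufferLoad(uint64_t addr, uint64_t index) // models vk::RawBufferLoad<T>
{
    T v;
    std::memcpy(&v, reinterpret_cast<const void*>(addr + index * sizeof(T)), sizeof(T));
    return v;
}

int main()
{
    uint32_t input[4] = {10, 20, 30, 40};
    PushConstantData pc = {};
    pc.pInputBuf = reinterpret_cast<uint64_t>(input);        // getDeviceAddress() stand-in
    std::printf("%u\n", rawBufferLoad<uint32_t>(pc.pInputBuf, 2)); // prints 30
}
```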
+ m_spirv_isa_cache_output = *fileCreate.acquire(); + if (!m_spirv_isa_cache_output) + logFail("Failed to Create SPIR-V to ISA cache file."); + } + + // load shader source from file + auto getShaderSource = [&](const char* filePath) -> auto + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = m_logger.get(); + lparams.workingDirectory = ""; + auto bundle = m_assetMgr->getAsset(filePath, lparams); + if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) + { + m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); + exit(-1); + } + auto firstAssetInBundle = bundle.getContents()[0]; + return smart_refctd_ptr_static_cast(firstAssetInBundle); + }; + + auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); + auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); + // now create or retrieve final resources to run our tests + sema = m_device->createSemaphore(timelineValue); + resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + for (uint32_t useNative = 0; useNative <= uint32_t(m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic); useNative++) + { + if (useNative) + m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO); + else + m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO); + + for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + { + const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) + { + // make sure renderdoc captures everything for debugging + m_api->startCapture(); + m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); + + for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++) + { + const uint32_t itemsPerInvocation = ItemsPerInvocations[j]; + uint32_t itemsPerWG = workgroupSize * itemsPerInvocation; + m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); + bool passed = true; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + + hlsl::workgroup2::SArithmeticConfiguration wgConfig; + wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation); + itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; + m_logger->log("Testing Item Count %u", 
ILogger::ELL_INFO, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + } + m_api->endCapture(); + + // save cache every now and then + { + auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata + auto bin = cpu->getEntries().begin()->second.bin; + IFile::success_t success; + m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); + if (!success) + logFail("Could not write Create SPIR-V to ISA cache to disk!"); + } + } + } + } + + return true; + } + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + // + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + totalFailCount++; + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + // create pipeline (specialized every test) [TODO: turn into a future/async] + smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) + { + auto shader = m_device->createShader(overridenUnspecialized); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = pipelineLayout.get(); + params.shader = { + .entryPoint = "main", + .shader = shader.get(), + .entries = nullptr, + .requiredSubgroupSize = static_cast(subgroupSizeLog2), + .requireFullSubgroups = true + }; + core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) + return nullptr; + return pipeline; + } + + template class Arithmetic, bool WorkgroupTest> + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u) + { + std::string arith_name = Arithmetic>::name; + const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#else + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; +#endif + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + + auto* includeFinder = 
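For context, the sweep above covers every power-of-two subgroup size the device reports and every power-of-two workgroup size from there up to the device maximum, and `findMSB` of a power of two is exactly its log2, which is the encoding `requiredSubgroupSize` expects in `createPipeline`. A standalone sketch of the enumeration (hypothetical device limits; C++20 `std::bit_width` in place of `hlsl::findMSB`):

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

int main()
{
    // hypothetical limits; the real values come from the physical device
    const uint32_t MinSubgroupSize = 8, MaxSubgroupSize = 64, MaxWorkgroupSize = 1024;
    for (uint32_t sg = MinSubgroupSize; sg <= MaxSubgroupSize; sg *= 2u)
    {
        const uint32_t sgLog2 = std::bit_width(sg) - 1; // == findMSB for powers of two
        for (uint32_t wg = sg; wg <= MaxWorkgroupSize; wg *= 2u)
            std::printf("subgroup %u (log2 %u), workgroup %u\n", sg, sgLog2, wg);
    }
}
```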
compiler->getDefaultIncludeFinder(); + options.preprocessorOptions.includeFinder = includeFinder; + + smart_refctd_ptr overriddenUnspecialized; + if constexpr (WorkgroupTest) + { + hlsl::workgroup2::SArithmeticConfiguration wgConfig; + wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvoc); + + const std::string definitions[3] = { + "workgroup2::" + arith_name, + wgConfig.getConfigTemplateStructString(), + std::to_string(arith_name=="reduction") + }; + + const IShaderCompiler::SMacroDefinition defines[4] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_CONFIG_T", definitions[1] }, + { "IS_REDUCTION", definitions[2] }, + { "TEST_NATIVE", "1" } + }; + if (useNative) + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 3 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + else + { + hlsl::subgroup2::SArithmeticParams sgParams; + sgParams.init(subgroupSizeLog2, itemsPerInvoc); + + const std::string definitions[3] = { + "subgroup2::" + arith_name, + std::to_string(workgroupSize), + sgParams.getParamTemplateStructString() + }; + + const IShaderCompiler::SMacroDefinition defines[4] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "SUBGROUP_CONFIG_T", definitions[2] }, + { "TEST_NATIVE", "1" } + }; + if (useNative) + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 3 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + + auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); + + // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) + uint32_t workgroupCount = 1;// min(elementCount / itemsPerWG, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]); + + cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); + cmdbuf->bindComputePipeline(pipeline.get()); + cmdbuf->pushConstants(pipelineLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc); + cmdbuf->dispatch(workgroupCount, 1, 1); + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; + for (auto i=0u; igetSize(),outputBuffers[i]} + }; + } + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + cmdbuf->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}}; + const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; + computeQueue->submit(submits); + const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; + m_device->blockForSemaphores(wait); + + const uint32_t subgroupSize = 1u << subgroupSizeLog2; + // check results + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, 
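Every permutation of the test matrix is compiled from the same HLSL source; the variant is selected purely through injected preprocessor defines, with `TEST_NATIVE` appended only when native subgroup arithmetic is being exercised. A runnable sketch of how such a define list takes shape (the values are placeholders; the real config string comes from `getConfigTemplateStructString()` / `getParamTemplateStructString()`):

```cpp
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main()
{
    const bool useNative = true;
    // mirrors the SMacroDefinition arrays above; values here are placeholders
    std::vector<std::pair<std::string, std::string>> defines = {
        {"OPERATION", "subgroup2::inclusive_scan"},
        {"WORKGROUP_SIZE", "256"},
        {"SUBGROUP_CONFIG_T", "/* string from getParamTemplateStructString() */"},
    };
    if (useNative) // the fourth define is only passed when native arithmetic is on
        defines.push_back({"TEST_NATIVE", "1"});
    for (const auto& d : defines)
        std::printf("#define %s %s\n", d.first.c_str(), d.second.c_str());
}
```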
workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + + return passed; + } + + //returns true if result matches + template class Arithmetic, class Binop, bool WorkgroupTest> + bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t subgroupSize, const uint32_t itemsPerInvoc) + { + bool success = true; + + // download data + const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; + m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); + + using type_t = typename Binop::type_t; + const auto testData = reinterpret_cast(resultsBuffer->getPointer()); + + // TODO: parallel for (the temporary values need to be threadlocal or what?) + // now check if the data obtained has valid values + type_t* tmp = new type_t[itemsPerWG]; + for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) + { + if constexpr (WorkgroupTest) + { + const auto workgroupOffset = workgroupID * itemsPerWG; + Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) + { + const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; + const auto cpuVal = tmp[localInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex + ); + success = false; + break; + } + } + } + else + { + const auto workgroupOffset = workgroupID * itemsPerWG; + const auto workgroupSize = itemsPerWG / itemsPerInvoc; + for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < workgroupSize; pseudoSubgroupID += subgroupSize) + Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < workgroupSize; localInvocationIndex++) + { + const auto localOffset = localInvocationIndex * itemsPerInvoc; + const auto globalInvocationIndex = workgroupOffset + localOffset; + + for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++) + { + const auto cpuVal = tmp[localOffset + itemInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? 
"workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex + ); + success = false; + break; + } + } + } + } + } + delete[] tmp; + + return success; + } + + IQueue* transferDownQueue; + IQueue* computeQueue; + smart_refctd_ptr m_spirv_isa_cache; + smart_refctd_ptr m_spirv_isa_cache_output; + + uint32_t* inputData = nullptr; + constexpr static inline uint32_t OutputBufferCount = 8u; + smart_refctd_ptr outputBuffers[OutputBufferCount]; + smart_refctd_ptr pipelineLayout; + PushConstantData pc; + + smart_refctd_ptr sema; + uint64_t timelineValue = 0; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + uint32_t totalFailCount = 0; + + constexpr static inline std::array ItemsPerInvocations = { 1, 2, 3, 4 }; +}; + +NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file diff --git a/23_ArithmeticUnitTest/pipeline.groovy b/23_Arithmetic2UnitTest/pipeline.groovy similarity index 100% rename from 23_ArithmeticUnitTest/pipeline.groovy rename to 23_Arithmetic2UnitTest/pipeline.groovy diff --git a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl b/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl deleted file mode 100644 index 13ee8d21e..000000000 --- a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl +++ /dev/null @@ -1,55 +0,0 @@ -#include "common.hlsl" - -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" - -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; - -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG class binop> -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -{ - if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; - if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); -} - - -type_t test() -{ - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; -} - -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file diff --git a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl deleted file mode 100644 index 479265d73..000000000 --- a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl +++ /dev/null @@ -1,18 +0,0 @@ -#pragma shader_stage(compute) - -#define operation_t nbl::hlsl::OPERATION - -#include "shaderCommon.hlsl" - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() {return true;} - -[numthreads(WORKGROUP_SIZE,1,1)] -void main() -{ - test(); -} \ No newline at end of file diff --git 
a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl deleted file mode 100644 index 9bafae47f..000000000 --- a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl +++ /dev/null @@ -1,107 +0,0 @@ -#pragma shader_stage(compute) - - -#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" - -static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; -static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; -static const uint32_t ScratchSz = ArithmeticSz+BallotSz; - -// TODO: Can we make it a static variable in the ScratchProxy struct? -groupshared uint32_t scratch[ScratchSz]; - - -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" - - -template -struct ScratchProxy -{ - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) - { - value = scratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - scratch[ix+offset] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; - -static ScratchProxy<0> arithmeticAccessor; - - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - - -template -struct operation_t -{ - using type_t = typename Binop::type_t; - - type_t operator()(type_t value) - { - type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - return retval; - } -}; - - -#include "shaderCommon.hlsl" - -static ScratchProxy ballotAccessor; - - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() -{ - return nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - // we can only ballot booleans, so low bit - nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); - // need to barrier between ballot and usages of a ballot by myself - ballotAccessor.workgroupExecutionAndMemoryBarrier(); - - uint32_t destVal = 0xdeadbeefu; -#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value -#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities - if (CONSTEXPR_OP_TYPE_TEST(reduction)) - destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); - else - { - assert(false); - } -#undef BALLOT_TEMPLATE_ARGS -#undef CONSTEXPR_OP_TYPE_TEST - - if (canStore()) - output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); -} \ No newline at end of file diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp deleted file mode 100644 index 147d231e2..000000000 --- a/23_ArithmeticUnitTest/main.cpp +++ /dev/null @@ -1,462 +0,0 @@ -#include "nbl/application_templates/BasicMultiQueueApplication.hpp" 
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "app_resources/common.hlsl" - -using namespace nbl; -using namespace core; -using namespace asset; -using namespace system; -using namespace video; - -// method emulations on the CPU, to verify the results of the GPU methods -template -struct emulatedReduction -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); - std::fill(out,out+itemCount,red); - } - - static inline constexpr const char* name = "reduction"; -}; -template -struct emulatedScanInclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::inclusive_scan(in,in+itemCount,out,Binop()); - } - static inline constexpr const char* name = "inclusive_scan"; -}; -template -struct emulatedScanExclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); - } - static inline constexpr const char* name = "exclusive_scan"; -}; - -class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication -{ - using device_base_t = application_templates::BasicMultiQueueApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - -public: - ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - bool onAppInitialized(smart_refctd_ptr&& system) override - { - if (!device_base_t::onAppInitialized(std::move(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - - transferDownQueue = getTransferDownQueue(); - computeQueue = getComputeQueue(); - - // TODO: get the element count from argv - const uint32_t elementCount = Output<>::ScanElementCount; - // populate our random data buffer on the CPU and create a GPU copy - inputData = new uint32_t[elementCount]; - smart_refctd_ptr gpuinputDataBuffer; - { - std::mt19937 randGenerator(0xdeadbeefu); - for (uint32_t i = 0u; i < elementCount; i++) - inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all - - IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; - inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; - m_utils->createFilledDeviceLocalBufferOnDedMem( - SIntendedSubmitInfo{.queue=getTransferUpQueue()}, - std::move(inputDataBufferCreationParams), - inputData - ).move_into(gpuinputDataBuffer); - } - - // create 8 buffers for 8 operations - for (auto i=0u; igetSize(); - params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; - - outputBuffers[i] = m_device->createBuffer(std::move(params)); - auto mreq = outputBuffers[i]->getMemoryReqs(); - mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - assert(mreq.memoryTypeBits); - - auto bufferMem = m_device->allocate(mreq, 
outputBuffers[i].get()); - assert(bufferMem.isValid()); - } - - // create Descriptor Set and Pipeline Layout - { - // create Descriptor Set Layout - smart_refctd_ptr dsLayout; - { - IGPUDescriptorSetLayout::SBinding binding[2]; - for (uint32_t i = 0u; i < 2; i++) - binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; - binding[1].count = OutputBufferCount; - dsLayout = m_device->createDescriptorSetLayout(binding); - } - - // set and transient pool - auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); - descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; - infos[0].desc = gpuinputDataBuffer; - infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; - for (uint32_t i = 1u; i <= OutputBufferCount; i++) - { - auto buff = outputBuffers[i - 1]; - infos[i].info.buffer = { 0u,buff->getSize() }; - infos[i].desc = std::move(buff); // save an atomic in the refcount - - } - - IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (uint32_t i=0u; i<2; i++) - writes[i] = {descriptorSet.get(),i,0u,1u,infos+i}; - writes[1].count = OutputBufferCount; - - m_device->updateDescriptorSets(2, writes, 0u, nullptr); - } - - pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); - } - - const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; - // enclose to make sure file goes out of scope and we can reopen it - { - smart_refctd_ptr spirv_isa_cache_input; - // try to load SPIR-V to ISA cache - { - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); - if (auto lock=fileCreate.acquire()) - spirv_isa_cache_input = *lock; - } - // create the cache - { - std::span spirv_isa_cache_data = {}; - if (spirv_isa_cache_input) - spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; - else - m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); - // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead - m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); - } - } - { - // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? - m_system->deleteDirectory(spirv_isa_cache_path); - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); - // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
- m_spirv_isa_cache_output=*fileCreate.acquire(); - if (!m_spirv_isa_cache_output) - logFail("Failed to Create SPIR-V to ISA cache file."); - } - - // load shader source from file - auto getShaderSource = [&](const char* filePath) -> auto - { - IAssetLoader::SAssetLoadParams lparams = {}; - lparams.logger = m_logger.get(); - lparams.workingDirectory = ""; - auto bundle = m_assetMgr->getAsset(filePath, lparams); - if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) - { - m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); - exit(-1); - } - auto firstAssetInBundle = bundle.getContents()[0]; - return smart_refctd_ptr_static_cast(firstAssetInBundle); - }; - - auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); - auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); - // now create or retrieve final resources to run our tests - sema = m_device->createSemaphore(timelineValue); - resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) - { - logFail("Failed to create Command Buffers!\n"); - return false; - } - } - - const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; - const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) - { - const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) - { - // make sure renderdoc captures everything for debugging - m_api->startCapture(); - m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); - - bool passed = true; - // TODO async the testing - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) - { - m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - } - m_api->endCapture(); - - // save cache every now and then - { - auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - auto bin = cpu->getEntries().begin()->second.bin; - IFile::success_t success; - 
m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); - if (!success) - logFail("Could not write Create SPIR-V to ISA cache to disk!"); - } - } - } - - return true; - } - - virtual bool onAppTerminated() override - { - m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); - delete[] inputData; - return true; - } - - // the unit test is carried out on init - void workLoopBody() override {} - - // - bool keepRunning() override { return false; } - -private: - void logTestOutcome(bool passed, uint32_t workgroupSize) - { - if (passed) - m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); - else - { - totalFailCount++; - m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); - } - } - - // create pipeline (specialized every test) [TODO: turn into a future/async] - smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) - { - auto shader = m_device->createShader(overridenUnspecialized); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = pipelineLayout.get(); - params.shader = { - .entryPoint = "main", - .shader = shader.get(), - .entries = nullptr, - .requiredSubgroupSize = static_cast(subgroupSizeLog2), - .requireFullSubgroups = true - }; - core::smart_refctd_ptr pipeline; - if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) - return nullptr; - return pipeline; - } - - /*template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - return true; - }*/ - - template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - std::string arith_name = Arithmetic>::name; - - smart_refctd_ptr overridenUnspecialized; - if constexpr (WorkgroupTest) - { - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG - ); - } - else - { - itemsPerWG = workgroupSize; - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", - (("subgroup::") + arith_name).c_str(), workgroupSize - ); - } - auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); - - // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; - cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); - cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); - cmdbuf->dispatch(workgroupCount, 1, 1); - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; - for (auto i=0u; igetSize(),outputBuffers[i]} - }; - } - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->end(); - - const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = 
{{.cmdbuf=cmdbuf.get()}}; - const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; - computeQueue->submit(submits); - const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; - m_device->blockForSemaphores(wait); - - // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - if constexpr (WorkgroupTest) - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - - return passed; - } - - //returns true if result matches - template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) - { - bool success = true; - - // download data - const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; - m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); - - using type_t = typename Binop::type_t; - const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); - const auto subgroupSize = dataFromBuffer[0]; - if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) - { - m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); - return false; - } - - const auto testData = reinterpret_cast(dataFromBuffer + 1); - // TODO: parallel for (the temporary values need to be threadlocal or what?) - // now check if the data obtained has valid values - type_t* tmp = new type_t[itemsPerWG]; - type_t* ballotInput = new type_t[itemsPerWG]; - for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) - { - const auto workgroupOffset = workgroupID * itemsPerWG; - - if constexpr (WorkgroupTest) - { - if constexpr (std::is_same_v, Binop>) - { - for (auto i = 0u; i < itemsPerWG; i++) - ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - Arithmetic::impl(tmp, ballotInput, itemsPerWG); - } - else - Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - } - else - { - for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); - } - - for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) - { - const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; - const auto cpuVal = tmp[localInvocationIndex]; - const auto gpuVal = testData[globalInvocationIndex]; - if (cpuVal != gpuVal) - { - m_logger->log( - "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", - ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? 
"workgroup" : "subgroup", Binop::name, - cpuVal, gpuVal, workgroupID, localInvocationIndex - ); - success = false; - break; - } - } - } - delete[] ballotInput; - delete[] tmp; - - return success; - } - - IQueue* transferDownQueue; - IQueue* computeQueue; - smart_refctd_ptr m_spirv_isa_cache; - smart_refctd_ptr m_spirv_isa_cache_output; - - uint32_t* inputData = nullptr; - constexpr static inline uint32_t OutputBufferCount = 8u; - smart_refctd_ptr outputBuffers[OutputBufferCount]; - smart_refctd_ptr descriptorSet; - smart_refctd_ptr pipelineLayout; - - smart_refctd_ptr sema; - uint64_t timelineValue = 0; - smart_refctd_ptr cmdbuf; - smart_refctd_ptr resultsBuffer; - - uint32_t totalFailCount = 0; -}; - -NBL_MAIN_FUNC(ArithmeticUnitTestApp) \ No newline at end of file diff --git a/28_FFTBloom/app_resources/fft_common.hlsl b/28_FFTBloom/app_resources/fft_common.hlsl index 41f8821cc..9f2be1432 100644 --- a/28_FFTBloom/app_resources/fft_common.hlsl +++ b/28_FFTBloom/app_resources/fft_common.hlsl @@ -5,13 +5,13 @@ groupshared uint32_t sharedmem[FFTParameters::SharedMemoryDWORDs]; struct SharedMemoryAccessor { - template + template void set(IndexType idx, AccessType value) { sharedmem[idx] = value; } - template + template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = sharedmem[idx]; @@ -36,14 +36,14 @@ struct PreloadedAccessorCommonBase struct PreloadedAccessorBase : PreloadedAccessorCommonBase { - template - void set(uint32_t idx, AccessType value) + template + void set(IndexType idx, AccessType value) { preloaded[idx >> WorkgroupSizeLog2] = value; } - template - void get(uint32_t idx, NBL_REF_ARG(AccessType) value) + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = preloaded[idx >> WorkgroupSizeLog2]; } @@ -54,14 +54,14 @@ struct PreloadedAccessorBase : PreloadedAccessorCommonBase // In the case for preloading all channels at once we make it stateful so we track which channel we're running FFT on struct MultiChannelPreloadedAccessorBase : PreloadedAccessorCommonBase { - template - void set(uint32_t idx, AccessType value) + template + void set(IndexType idx, AccessType value) { preloaded[currentChannel][idx >> WorkgroupSizeLog2] = value; } - template - void get(uint32_t idx, NBL_REF_ARG(AccessType) value) + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = preloaded[currentChannel][idx >> WorkgroupSizeLog2]; } diff --git a/29_Arithmetic2Bench/CMakeLists.txt b/29_Arithmetic2Bench/CMakeLists.txt new file mode 100644 index 000000000..0724366c9 --- /dev/null +++ b/29_Arithmetic2Bench/CMakeLists.txt @@ -0,0 +1,25 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl new file mode 100644 index 000000000..f6ad3e678 --- /dev/null +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -0,0 +1,57 @@ +#pragma shader_stage(compute) + +#define operation_t nbl::hlsl::OPERATION + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +#include "shaderCommon.hlsl" +#include "nbl/builtin/hlsl/workgroup2/basic.hlsl" + +template +using params_t = SUBGROUP_CONFIG_T; + +NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation = params_t::base_t, device_capabilities>::ItemsPerInvocation; + +typedef vector type_t; + +uint32_t globalIndex() +{ + return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex(); +} + +template +static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + type_t value = sourceVal; + + const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex]; + + operation_t > func; + // [unroll] + for (uint32_t i = 0; i < NUM_LOOPS; i++) + value = func(value); + + vk::RawBufferStore(outputBufAddr + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t)); +} + +void benchmark() +{ + const uint32_t invocationIndex = globalIndex(); + type_t sourceVal; + Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1)); + [unroll] + for (uint16_t i = 0; i < ItemsPerInvocation; i++) + sourceVal[i] = xoroshiro(); + + subbench >(sourceVal); +} + +[numthreads(WORKGROUP_SIZE,1,1)] +void main() +{ + benchmark(); +} diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl new file mode 100644 index 000000000..a56945467 --- /dev/null +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -0,0 +1,124 @@ +#pragma shader_stage(compute) + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" +#include 
"nbl/builtin/hlsl/random/xoroshiro.hlsl" + +using config_t = WORKGROUP_CONFIG_T; + +#include "shaderCommon.hlsl" + +typedef vector type_t; + +// final (level 1/2) scan needs to fit in one subgroup exactly +groupshared uint32_t scratch[mpl::max_v]; + +#include "../../common/include/WorkgroupDataAccessors.hlsl" + +template +struct RandomizedInputDataProxy +{ + using dtype_t = vector; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize; + + static RandomizedInputDataProxy create(uint64_t inputBuf, uint64_t outputBuf) + { + RandomizedInputDataProxy retval; + retval.data = DataProxy::create(inputBuf, outputBuf); + return retval; + } + + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) + { + value = preloaded[ix>>WorkgroupSizeLog2]; + } + template + void set(const IndexType ix, const AccessType value) + { + preloaded[ix>>WorkgroupSizeLog2] = value; + } + + void preload() + { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1)); + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + [unroll] + for (uint16_t i = 0; i < ItemsPerInvocation; i++) + preloaded[idx][i] = xoroshiro(); + } + void unload() + { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + data.template set(idx * WorkgroupSize + invocationIndex, preloaded[idx]); + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above + } + + DataProxy data; + dtype_t preloaded[PreloadedDataCount]; +}; + +static ScratchProxy arithmeticAccessor; + +using data_proxy_t = RandomizedInputDataProxy; + +template +struct operation_t +{ + using binop_base_t = typename Binop::base_t; + using otype_t = typename Binop::type_t; + + void operator()(data_proxy_t dataAccessor) + { +#if IS_REDUCTION + otype_t value = +#endif + OPERATION::template __call(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); +#if IS_REDUCTION + [unroll] + for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++) + dataAccessor.preloaded[i] = value; +#endif + } +}; + +template +static void subbench() +{ + data_proxy_t dataAccessor = data_proxy_t::create(0, pc.pOutputBuf[Binop::BindingIndex]); + dataAccessor.preload(); + + operation_t func; + for (uint32_t i = 0; i < NUM_LOOPS; i++) + func(dataAccessor); + + dataAccessor.unload(); +} + +void benchmark() +{ + // only benchmark plus op + subbench >(); +} + + +[numthreads(config_t::WorkgroupSize,1,1)] +void main() +{ + benchmark(); +} diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl new file mode 100644 index 000000000..cca5af987 --- /dev/null +++ b/29_Arithmetic2Bench/app_resources/common.hlsl @@ -0,0 +1,34 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" + +struct PushConstantData +{ + uint64_t pOutputBuf[2]; +}; + +namespace arithmetic +{ +template +struct plus : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "plus"; +#endif +}; + +template +struct 
diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl
new file mode 100644
index 000000000..cca5af987
--- /dev/null
+++ b/29_Arithmetic2Bench/app_resources/common.hlsl
@@ -0,0 +1,34 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/functional.hlsl"
+
+struct PushConstantData
+{
+	uint64_t pOutputBuf[2];
+};
+
+namespace arithmetic
+{
+template
+struct plus : nbl::hlsl::plus
+{
+	using base_t = nbl::hlsl::plus;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "plus";
+#endif
+};
+
+template
+struct ballot : nbl::hlsl::plus
+{
+	using base_t = nbl::hlsl::plus;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bitcount";
+#endif
+};
+}
+
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
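// The wrappers above only attach benchmark metadata (an output-buffer slot
// and, on the host, a printable name) to functors inherited verbatim from the
// shared HLSL/C++ STD lib. A minimal pure-C++ analogue of the pattern
// (illustrative only, using std::plus in place of nbl::hlsl::plus):
#include <cstdint>
#include <functional>

template<typename T>
struct plus_with_metadata : std::plus<T>
{
	using base_t = std::plus<T>;

	static constexpr uint16_t BindingIndex = 0; // routes results to pc.pOutputBuf[0]
	static constexpr const char* name = "plus"; // host-side label for the result buffer
};

// the arithmetic behaviour is exactly the base functor's:
static_assert(plus_with_metadata<int>{}(2, 3) == 5);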
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
new file mode 100644
index 000000000..242ededd8
--- /dev/null
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -0,0 +1,26 @@
+#include "common.hlsl"
+
+using namespace nbl;
+using namespace hlsl;
+
+[[vk::push_constant]] PushConstantData pc;
+
+struct device_capabilities
+{
+#ifdef TEST_NATIVE
+	NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
+#else
+	NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false;
+#endif
+};
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
+
+#ifndef NUM_LOOPS
+#error "Define NUM_LOOPS!"
+#endif
+
+// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
+[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy
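// The device_capabilities struct above is the compile-time switch the
// portability headers key off: defining TEST_NATIVE at shader-compile time
// selects the native subgroup-arithmetic path. A minimal C++ analogue of the
// mechanism (illustrative names, not the real Nabla headers):
#include <cstdio>

struct caps_native   { static constexpr bool shaderSubgroupArithmetic = true; };
struct caps_fallback { static constexpr bool shaderSubgroupArithmetic = false; };

template<class DeviceCaps>
constexpr const char* arithmeticPath()
{
	if constexpr (DeviceCaps::shaderSubgroupArithmetic)
		return "native subgroup intrinsics"; // compiled with TEST_NATIVE
	else
		return "emulated portability path";  // compiled without it
}

int main()
{
	std::printf("%s\n", arithmeticPath<caps_native>()); // "native subgroup intrinsics"
}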
diff --git a/29_Arithmetic2Bench/config.json.template b/29_Arithmetic2Bench/config.json.template
new file mode 100644
index 000000000..f961745c1
--- /dev/null
+++ b/29_Arithmetic2Bench/config.json.template
@@ -0,0 +1,28 @@
+{
+	"enableParallelBuild": true,
+	"threadsPerBuildProcess" : 2,
+	"isExecuted": false,
+	"scriptPath": "",
+	"cmake": {
+		"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+		"buildModes": [],
+		"requiredOptions": []
+	},
+	"profiles": [
+		{
+			"backend": "vulkan",
+			"platform": "windows",
+			"buildModes": [],
+			"runConfiguration": "Release",
+			"gpuArchitectures": []
+		}
+	],
+	"dependencies": [],
+	"data": [
+		{
+			"dependencies": [],
+			"command": [""],
+			"outputs": []
+		}
+	]
+}
\ No newline at end of file
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
new file mode 100644
index 000000000..2d5afeb4c
--- /dev/null
+++ b/29_Arithmetic2Bench/main.cpp
@@ -0,0 +1,689 @@
+#include "SimpleWindowedApplication.hpp"
+#include "CEventCallback.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
+
+template requires std::is_base_of_v
+class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface
+{
+public:
+	using this_t = CExplicitSurfaceFormatResizeSurface;
+
+	// Factory method so we can fail, requires a `_surface` created from a window and with a callback that inherits from `ICallback` declared just above
+	template requires std::is_base_of_v, Surface>
+	static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& _surface)
+	{
+		if (!_surface)
+			return nullptr;
+
+		auto _window = _surface->getWindow();
+		ICallback* cb = nullptr;
+		if (_window)
+			cb = dynamic_cast(_window->getEventCallback());
+
+		return core::smart_refctd_ptr(new this_t(std::move(_surface), cb), core::dont_grab);
+	}
+
+	// Factory method so we can fail, requires a `_surface` created from a native surface
+	template requires std::is_base_of_v, Surface>
+	static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& _surface, ICallback* cb)
+	{
+		if (!_surface)
+			return nullptr;
+
+		return core::smart_refctd_ptr(new this_t(std::move(_surface), cb), core::dont_grab);
+	}
+
+	//
+	inline bool init(CThreadSafeQueueAdapter* queue, std::unique_ptr&& scResources, const ISwapchain::SSharedCreationParams& sharedParams = {})
+	{
+		if (!scResources || !base_init(queue))
+			return init_fail();
+
+		m_sharedParams = sharedParams;
+		if (!m_sharedParams.deduce(queue->getOriginDevice()->getPhysicalDevice(), getSurface()))
+			return init_fail();
+
+		m_swapchainResources = std::move(scResources);
+		return true;
+	}
+
+	// Can be public because we don't need to worry about mutexes unlike the Smooth Resize class
+	inline ISwapchainResources* getSwapchainResources() override { return m_swapchainResources.get(); }
+
+	// need to see if the swapchain is invalidated (e.g. because we're starting from a 0-area old Swapchain) and try to recreate the swapchain
+	inline SAcquireResult acquireNextImage()
+	{
+		if (!isWindowOpen())
+		{
+			becomeIrrecoverable();
+			return {};
+		}
+
+		if (!m_swapchainResources || (m_swapchainResources->getStatus() != ISwapchainResources::STATUS::USABLE && !recreateSwapchain(m_surfaceFormat)))
+			return {};
+
+		return ISimpleManagedSurface::acquireNextImage();
+	}
+
+	// it's enough to just forward though
+	inline bool present(const uint8_t imageIndex, const std::span waitSemaphores)
+	{
+		return ISimpleManagedSurface::present(imageIndex, waitSemaphores);
+	}
+
+	//
+	inline bool recreateSwapchain(const ISurface::SFormat& explicitSurfaceFormat)
+	{
+		assert(m_swapchainResources);
+		// don't assign straight to `m_swapchainResources` because of complex refcounting and cycles
+		core::smart_refctd_ptr newSwapchain;
+		// TODO: This block of code could be rolled up into `ISimpleManagedSurface::ISwapchainResources` eventually
+		{
+			auto* surface = getSurface();
+			auto device = const_cast(getAssignedQueue()->getOriginDevice());
+			// 0s are invalid values, so they indicate we want them deduced
+			m_sharedParams.width = 0;
+			m_sharedParams.height = 0;
+			// Question: should we re-query the supported queues, formats, present modes, etc. just-in-time??
+			auto* swapchain = m_swapchainResources->getSwapchain();
+			if (swapchain ? swapchain->deduceRecreationParams(m_sharedParams) : m_sharedParams.deduce(device->getPhysicalDevice(), surface))
+			{
+				// super special case, we can't re-create the swapchain but it's possible to recover later on
+				if (m_sharedParams.width == 0 || m_sharedParams.height == 0)
+				{
+					// we need to keep the old swapchain around, but can drop the rest
+					m_swapchainResources->invalidate();
+					return false;
+				}
+				// now let's try to create a new swapchain
+				if (swapchain)
+					newSwapchain = swapchain->recreate(m_sharedParams);
+				else
+				{
+					ISwapchain::SCreationParams params = {
+						.surface = core::smart_refctd_ptr(surface),
+						.surfaceFormat = explicitSurfaceFormat,
+						.sharedParams = m_sharedParams
+						// we're not going to support concurrent sharing in this simple class
+					};
+					m_surfaceFormat = explicitSurfaceFormat;
+					newSwapchain = CVulkanSwapchain::create(core::smart_refctd_ptr(device), std::move(params));
+				}
+			}
+			else // parameter deduction failed
+				return false;
+		}
+
+		if (newSwapchain)
+		{
+			m_swapchainResources->invalidate();
+			return m_swapchainResources->onCreateSwapchain(getAssignedQueue()->getFamilyIndex(), std::move(newSwapchain));
+		}
+		else
+			becomeIrrecoverable();
+
+		return false;
+	}
+
+protected:
+	using ISimpleManagedSurface::ISimpleManagedSurface;
+
+	//
+	inline void deinit_impl() override final
+	{
+		becomeIrrecoverable();
+	}
+
+	//
+	inline void becomeIrrecoverable() override { m_swapchainResources = nullptr; }
+
+	// gets called when OUT_OF_DATE upon an acquire
+	inline SAcquireResult handleOutOfDate() override final
+	{
+		// recreate swapchain and try to acquire again
+		if (recreateSwapchain(m_surfaceFormat))
+			return ISimpleManagedSurface::acquireNextImage();
+		return {};
+	}
+
+private:
+	// Because the surface can start minimized (extent={0,0}) we might not be able to create the swapchain right away, so store creation parameters until we can create it.
+	ISwapchain::SSharedCreationParams m_sharedParams = {};
+	// The swapchain might not be possible to create or recreate right away, so this might be
+	// either nullptr before the first successful acquire or the old to-be-retired swapchain.
+	std::unique_ptr m_swapchainResources = {};
+
+	ISurface::SFormat m_surfaceFormat = {};
+};
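// Condensed invariant of the class above, as a sketch (hypothetical types,
// not the real Nabla API): the class pins the explicitly requested format in
// m_surfaceFormat, and handleOutOfDate() always recreates with that pinned
// format instead of re-deducing one, which is the whole point of the
// "explicit surface format" variant.
#include <optional>

struct ExplicitFormatPolicySketch
{
	std::optional<int> pinnedFormat; // stands in for ISurface::SFormat

	bool recreate(int explicitFormat)
	{
		pinnedFormat = explicitFormat; // remembered for OUT_OF_DATE handling
		return true;                   // the real method can also fail and invalidate
	}
	bool handleOutOfDate()
	{
		// never silently falls back to a deduced format
		return pinnedFormat.has_value() && recreate(*pinnedFormat);
	}
};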
+
+// NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
+class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = examples::SimpleWindowedApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+
+	constexpr static inline uint32_t WIN_W = 1280;
+	constexpr static inline uint32_t WIN_H = 720;
+	constexpr static inline uint32_t MaxFramesInFlight = 5;
+
+public:
+	ArithmeticBenchApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	inline core::vector getSurfaces() const override
+	{
+		if (!m_surface)
+		{
+			{
+				auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+				IWindow::SCreationParams params = {};
+				params.callback = core::make_smart_refctd_ptr();
+				params.width = WIN_W;
+				params.height = WIN_H;
+				params.x = 32;
+				params.y = 32;
+				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+				params.windowCaption = "ArithmeticBenchApp";
+				params.callback = windowCallback;
+				const_cast&>(m_window) = m_winMgr->createWindow(std::move(params));
+			}
+
+			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window));
+			const_cast&>(m_surface) = CExplicitSurfaceFormatResizeSurface::create(std::move(surface));
+		}
+
+		if (m_surface)
+			return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+		return {};
+	}
+
+	bool onAppInitialized(smart_refctd_ptr&& system) override
+	{
+		m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+		if (!device_base_t::onAppInitialized(std::move(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
+
+		ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() };
+		asset::E_FORMAT preferredFormats[] = { asset::EF_R8G8B8A8_UNORM };
+		if (!swapchainParams.deduceFormat(m_physicalDevice, preferredFormats))
+			return logFail("Could not choose a Surface Format for the Swapchain!");
+
+		swapchainParams.sharedParams.imageUsage = IGPUImage::E_USAGE_FLAGS::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT;
+
+		auto graphicsQueue = getGraphicsQueue();
+		if (!m_surface || !m_surface->init(graphicsQueue, std::make_unique(), swapchainParams.sharedParams))
+			return logFail("Could not create Window & Surface or initialize the Surface!");
+
+		auto pool = m_device->createCommandPool(graphicsQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
+		{
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
+
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain(swapchainParams.surfaceFormat);
+
+		transferDownQueue = getTransferDownQueue();
+		computeQueue = getComputeQueue();
+
+		// create 2 buffers for 2 operations
+		for (auto i=0u; icreateBuffer(std::move(params));
+			auto mreq = outputBuffers[i]->getMemoryReqs();
+			mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+			assert(mreq.memoryTypeBits);
+
+			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+			assert(bufferMem.isValid());
+		}
+		for (auto i = 0u; i < OutputBufferCount; i++)
+			pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress();
+
+		// create image views for swapchain images
+		for (uint32_t i = 0; i < ISwapchain::MaxImages; i++)
+		{
+			IGPUImage* scImg = m_surface->getSwapchainResources()->getImage(i);
+			if (scImg == nullptr)
+				continue;
+			IGPUImageView::SCreationParams viewParams = {
+				.flags = IGPUImageView::ECF_NONE,
+				.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
+				.image = smart_refctd_ptr(scImg),
+				.viewType = IGPUImageView::ET_2D,
+				.format = scImg->getCreationParameters().format
+			};
+			swapchainImageViews[i] = m_device->createImageView(std::move(viewParams));
+		}
+
+		// create Descriptor Sets and Pipeline Layouts
+		smart_refctd_ptr benchPplnLayout;
+		{
+			// set and transient pool
+			smart_refctd_ptr benchLayout;
+			{
+				IGPUDescriptorSetLayout::SBinding binding[1];
+				binding[0] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
+				benchLayout = m_device->createDescriptorSetLayout(binding);
+			}
+
+			const uint32_t setCount = ISwapchain::MaxImages;
+			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }, &setCount);
+			for (auto i = 0u; i < ISwapchain::MaxImages; i++)
+			{
+				benchDs[i] = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout));
+				if (!benchDs[i])
+					return logFail("Could not create Descriptor Set!");
+			}
+
+			SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) };
+			benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout));
+		}
+		if (UseNativeArithmetic && !m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic)
+		{
+			logFail("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!");
+			return false;
+		}
+
+		IGPUDescriptorSet::SWriteDescriptorSet dsWrites[ISwapchain::MaxImages];
+		for (auto i = 0u; i < ISwapchain::MaxImages; i++)
+		{
+			if (swapchainImageViews[i].get() == nullptr)
+				continue;
+
+			video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
+			dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL;
+			dsInfo.desc = swapchainImageViews[i];
+
+			dsWrites[i] =
+			{
+				.dstSet = benchDs[i].get(),
+				.binding = 2u,
+				.arrayElement = 0u,
+				.count = 1u,
+				.info = &dsInfo,
+			};
+			m_device->updateDescriptorSets(1u, &dsWrites[i], 0u, nullptr);
+		}
+
+
+		// load shader source from file
+		auto getShaderSource = [&](const char* filePath) -> auto
+		{
+			IAssetLoader::SAssetLoadParams lparams = {};
+			lparams.logger = m_logger.get();
+			lparams.workingDirectory = "";
+			auto bundle = m_assetMgr->getAsset(filePath, lparams);
+			if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
+			{
+				m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
+				exit(-1);
+			}
+			auto firstAssetInBundle = bundle.getContents()[0];
+			return smart_refctd_ptr_static_cast(firstAssetInBundle);
+		};
+
+		// create pipelines for each workgroup size (adjust items per invocation and the operation manually, otherwise compiling every combination uses up a lot of RAM)
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		smart_refctd_ptr shaderSource;
+		if constexpr (DoWorkgroupBenchmarks)
+			shaderSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
+		else
+			shaderSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl");
+
+		for (uint32_t op = 0; op < arithmeticOperations.size(); op++)
+			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
+				benchSets[op*workgroupSizes.size()+i] = createBenchmarkPipelines(shaderSource, benchPplnLayout.get(), ElementCount, arithmeticOperations[op], hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+
+		m_winMgr->show(m_window.get());
+
+		return true;
+	}
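// onAppInitialized() above derives the subgroup-size log2 as
// hlsl::findMSB(MaxSubgroupSize); for the power-of-two subgroup sizes Vulkan
// reports, the MSB index is exactly the log2. A quick self-contained check of
// that identity, using C++20's std::bit_width as a stand-in for findMSB:
#include <bit>
#include <cstdint>

static_assert(std::bit_width(32u) - 1 == 5);             // findMSB(32) == 5
static_assert(std::bit_width(64u) - 1 == 6);             // findMSB(64) == 6 (wave64)
static_assert((1u << (std::bit_width(32u) - 1)) == 32u); // 1 << log2 round-trips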
+
+	virtual bool onAppTerminated() override
+	{
+		return true;
+	}
+
+	// the benchmark is recorded and submitted once per frame from the work loop below
+	void workLoopBody() override
+	{
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+		const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+
+		if (m_realFrameIx >= framesInFlight)
+		{
+			const ISemaphore::SWaitInfo cbDonePending[] =
+			{
+				{
+					.semaphore = m_semaphore.get(),
+					.value = m_realFrameIx + 1 - framesInFlight
+				}
+			};
+			if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+				return;
+		}
+
+		m_currentImageAcquire = m_surface->acquireNextImage();
+		if (!m_currentImageAcquire)
+			return;
+
+		auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get();
+		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize);
+
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs[m_currentImageAcquire.imageIndex].get());
+		cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc);
+
+		for (uint32_t i = 0; i < benchSets.size(); i++)
+			runBenchmark(cmdbuf, benchSets[i], ElementCount, SubgroupSizeLog2);
+
+		// barrier transition to PRESENT
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+				.dep = {
+					.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+					.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+					.dstStageMask = PIPELINE_STAGE_FLAGS::NONE,
+					.dstAccessMask = ACCESS_FLAGS::NONE
+				}
+			};
+			imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
+			imageBarriers[0].subresourceRange = {
+				.aspectMask = IImage::EAF_COLOR_BIT,
+				.baseMipLevel = 0u,
+				.levelCount = 1u,
+				.baseArrayLayer = 0u,
+				.layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
+			imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		cmdbuf->end();
+
+		// submit
+		{
+			auto* queue = getGraphicsQueue();
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+			{
+				{
+					.semaphore = m_semaphore.get(),
+					.value = ++m_realFrameIx,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+				}
+			};
+			{
+				{
+					const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+					{
+						{.cmdbuf = cmdbuf }
+					};
+
+					const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+					{
+						{
+							.semaphore = m_currentImageAcquire.semaphore,
+							.value = m_currentImageAcquire.acquireCount,
+							.stageMask = PIPELINE_STAGE_FLAGS::NONE
+						}
+					};
+					const IQueue::SSubmitInfo infos[] =
+					{
+						{
+							.waitSemaphores = acquired,
+							.commandBuffers = commandBuffers,
+							.signalSemaphores = rendered
+						}
+					};
+
+					if (queue->submit(infos) == IQueue::RESULT::SUCCESS)
+					{
+						const nbl::video::ISemaphore::SWaitInfo waitInfos[] =
+						{ {
+							.semaphore = m_semaphore.get(),
+							.value = m_realFrameIx
+						} };
+
+						m_device->blockForSemaphores(waitInfos); // this is not a real solution, just a quick workaround to avoid throwing validation errors
+					}
+					else
+						--m_realFrameIx;
+				}
+			}
+
+			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+		}
+
+		numSubmits++;
+	}
+
+	//
+	bool keepRunning() override { return numSubmits < MaxNumSubmits; }
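// The pacing logic in workLoopBody() in one line: frame N (0-based
// m_realFrameIx == N) signals timeline value N+1, so before recording frame N
// with F frames in flight we wait for value N+1-F, i.e. until the frame that
// last used this command buffer has retired. A tiny worked example, assuming
// F == MaxFramesInFlight == 5:
#include <cstdint>

constexpr uint64_t waitValueBeforeRecording(uint64_t realFrameIx, uint64_t framesInFlight)
{
	// only start waiting once enough frames have been submitted
	return realFrameIx >= framesInFlight ? realFrameIx + 1 - framesInFlight : 0;
}
static_assert(waitValueBeforeRecording(4, 5) == 0); // first 5 frames never block
static_assert(waitValueBeforeRecording(5, 5) == 1); // frame 5 waits on frame 0's signal (value 1)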
+
+private:
+	// create pipeline (specialized every test) [TODO: turn into a future/async]
+	smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
+	{
+		auto shader = m_device->createShader(overridenUnspecialized);
+		IGPUComputePipeline::SCreationParams params = {};
+		params.layout = layout;
+		params.shader = {
+			.entryPoint = "main",
+			.shader = shader.get(),
+			.entries = nullptr,
+			.requiredSubgroupSize = static_cast(subgroupSizeLog2),
+			.requireFullSubgroups = true
+		};
+		core::smart_refctd_ptr pipeline;
+		if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
+			return nullptr;
+		return pipeline;
+	}
+
+	struct BenchmarkSet
+	{
+		smart_refctd_ptr pipeline;
+		uint32_t workgroupSize;
+		uint32_t itemsPerInvocation;
+	};
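// createPipeline() above sets requireFullSubgroups, which is only valid when
// the workgroup size is a multiple of the subgroup size so that no partial
// subgroup exists. A quick check for the sizes this bench sweeps (subgroup
// size 32 assumed purely for illustration):
#include <cstdint>

constexpr uint32_t assumedSubgroupSize = 32u;
constexpr uint32_t benchWorkgroupSizes[] = { 32, 64, 128, 256, 512, 1024 };

constexpr bool allWorkgroupsFull()
{
	for (uint32_t wg : benchWorkgroupSizes)
		if (wg % assumedSubgroupSize != 0u)
			return false;
	return true;
}
static_assert(allWorkgroupsFull(), "requireFullSubgroups would be invalid otherwise");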
+
+	template
+	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr& source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const std::string& arith_name, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	{
+		auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system));
+		CHLSLCompiler::SOptions options = {};
+		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
+		options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+		options.spirvOptimizer = nullptr;
+#ifndef _NBL_DEBUG
+		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+		auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1));
+		options.spirvOptimizer = opt.get();
+#else
+		options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
+#endif
+		options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+		options.preprocessorOptions.logger = m_logger.get();
+
+		auto* includeFinder = compiler->getDefaultIncludeFinder();
+		options.preprocessorOptions.includeFinder = includeFinder;
+
+		const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
+		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
+		hlsl::workgroup2::SArithmeticConfiguration wgConfig;
+		wgConfig.init(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc);
+		const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
+		smart_refctd_ptr overriddenUnspecialized;
+		if constexpr (WorkgroupBench)
+		{
+			const std::string definitions[4] = {
+				"workgroup2::" + arith_name,
+				wgConfig.getConfigTemplateStructString(),
+				std::to_string(numLoops),
+				std::to_string(arith_name=="reduction")
+			};
+
+			const IShaderCompiler::SMacroDefinition defines[5] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_CONFIG_T", definitions[1] },
+				{ "NUM_LOOPS", definitions[2] },
+				{ "IS_REDUCTION", definitions[3] },
+				{ "TEST_NATIVE", "1" }
+			};
+			if (UseNativeArithmetic)
+				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
+
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
+		else
+		{
+			hlsl::subgroup2::SArithmeticParams sgParams;
+			sgParams.init(subgroupSizeLog2, itemsPerInvoc);
+
+			const std::string definitions[4] = {
+				"subgroup2::" + arith_name,
+				std::to_string(workgroupSize),
+				sgParams.getParamTemplateStructString(),
+				std::to_string(numLoops)
+			};
+
+			const IShaderCompiler::SMacroDefinition defines[5] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_SIZE", definitions[1] },
+				{ "SUBGROUP_CONFIG_T", definitions[2] },
+				{ "NUM_LOOPS", definitions[3] },
+				{ "TEST_NATIVE", "1" }
+			};
+			if (UseNativeArithmetic)
+				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
+
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
+
+		BenchmarkSet set;
+		set.pipeline = createPipeline(overriddenUnspecialized.get(), layout, subgroupSizeLog2);
+		if constexpr (WorkgroupBench)
+		{
+			set.workgroupSize = itemsPerWG;
+		}
+		else
+		{
+			set.workgroupSize = workgroupSize;
+		}
+		set.itemsPerInvocation = itemsPerInvoc;
+
+		return set;
+	};
+
+	template
+	void runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2)
+	{
+		uint32_t workgroupCount;
+		if constexpr (WorkgroupBench)
+			workgroupCount = elementCount / set.workgroupSize;
+		else
+			workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation);
+
+		cmdbuf->bindComputePipeline(set.pipeline.get());
+		cmdbuf->dispatch(workgroupCount, 1, 1);
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
+			for (auto i = 0u; i < OutputBufferCount; i++)
+			{
+				memoryBarrier[i] = {
+					.barrier = {
+						.dep = {
+							.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+							.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+							// in theory we don't need the HOST BITS because we block on a semaphore, but might as well add them
+							.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT | PIPELINE_STAGE_FLAGS::HOST_BIT,
+							.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS | ACCESS_FLAGS::HOST_READ_BIT
+						}
+					},
+					.range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]}
+				};
+			}
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier };
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info);
+		}
+	}
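// Dispatch sizing in runBenchmark() with concrete numbers: ElementCount is
// 1024*1024, and in the workgroup bench `set.workgroupSize` actually holds
// itemsPerWG (see createBenchmarkPipelines above). The itemsPerWG value below
// is only an example; the real one comes from SArithmeticConfiguration.
#include <cstdint>

constexpr uint32_t kElementCount = 1024u * 1024u;

// workgroup bench: one workgroup consumes itemsPerWG elements
constexpr uint32_t kItemsPerWG = 1024u;              // e.g. 256 invocations * 4 items each
static_assert(kElementCount / kItemsPerWG == 1024u); // -> dispatch(1024, 1, 1)

// subgroup bench: divide by workgroupSize * itemsPerInvocation instead
static_assert(kElementCount / (256u * 4u) == 1024u);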
+
+	IQueue* transferDownQueue;
+	IQueue* computeQueue;
+
+	smart_refctd_ptr m_window;
+	smart_refctd_ptr> m_surface;
+	smart_refctd_ptr m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	std::array, MaxFramesInFlight> m_cmdBufs;
+	ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
+	smart_refctd_ptr m_inputSystem;
+
+	std::array, ISwapchain::MaxImages> swapchainImageViews;
+
+	constexpr static inline uint32_t MaxNumSubmits = 30;
+	uint32_t numSubmits = 0;
+	constexpr static inline uint32_t ElementCount = 1024 * 1024;
+
+	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
+	constexpr static inline bool DoWorkgroupBenchmarks = true;
+	constexpr static inline bool UseNativeArithmetic = true;
+	uint32_t ItemsPerInvocation = 4u;
+	constexpr static inline uint32_t NumLoops = 1000u;
+	constexpr static inline uint32_t NumBenchmarks = 6u;
+	std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
+	std::array arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" };
+
+
+	std::array benchSets;
+	smart_refctd_ptr benchPool;
+	std::array, ISwapchain::MaxImages> benchDs;
+
+	constexpr static inline uint32_t OutputBufferCount = 2u;
+	smart_refctd_ptr outputBuffers[OutputBufferCount];
+	smart_refctd_ptr gpuOutputAddressesBuffer;
+	PushConstantData pc;
+
+	uint64_t timelineValue = 0;
+};
+
+NBL_MAIN_FUNC(ArithmeticBenchApp)
\ No newline at end of file
diff --git a/29_Arithmetic2Bench/pipeline.groovy b/29_Arithmetic2Bench/pipeline.groovy
new file mode 100644
index 000000000..7ea9947e0
--- /dev/null
+++ b/29_Arithmetic2Bench/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CArithmeticUnitTestBuilder extends IBuilder
+{
+	public CArithmeticUnitTestBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+
+		return true
+	}
+
+	@Override
+	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CArithmeticUnitTestBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b3279a48..31ebaddf9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,12 +58,13 @@ if(NBL_BUILD_EXAMPLES)
 		add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL)
 		add_subdirectory(21_LRUCacheUnitTest EXCLUDE_FROM_ALL)
 		add_subdirectory(22_CppCompat EXCLUDE_FROM_ALL)
-		add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL)
+		add_subdirectory(23_Arithmetic2UnitTest EXCLUDE_FROM_ALL)
 		add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL)
 		add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
 		add_subdirectory(26_Blur EXCLUDE_FROM_ALL)
 		add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL)
 		add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL)
+		add_subdirectory(29_Arithmetic2Bench EXCLUDE_FROM_ALL)
 
 	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)
 
 	# Showcase compute pathtracing
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
new file mode 100644
index 000000000..7287a4135
--- /dev/null
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -0,0 +1,124 @@
+#ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_
+#define _WORKGROUP_DATA_ACCESSORS_HLSL_
+
+#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+
+struct ScratchProxy
+{
+	template
+	void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
+	{
+		value = scratch[ix];
+	}
+	template
+	void set(const uint32_t ix, const AccessType value)
+	{
+		scratch[ix] = value;
+	}
+
+	uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+	{
+		return glsl::atomicOr(scratch[ix],value);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+		//glsl::memoryBarrierShared(); implied by the above
+	}
+};
+
+template
+struct DataProxy
+{
+	using dtype_t = vector;
+	// function template AccessType should be the same as dtype_t
+
+	static DataProxy create(const uint64_t inputBuf, const uint64_t outputBuf)
+	{
+		DataProxy retval;
+		const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize * sizeof(dtype_t);
+		retval.accessor = DoubleLegacyBdaAccessor::create(inputBuf + workgroupOffset, outputBuf + workgroupOffset);
+		return retval;
+	}
+
+	template
+	void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
+	{
+		accessor.get(ix, value);
+	}
+	template
+	void set(const IndexType ix, const AccessType value)
+	{
+		accessor.set(ix, value);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+		//glsl::memoryBarrierShared(); implied by the above
+	}
+
+	DoubleLegacyBdaAccessor accessor;
+};
+
+template
+struct PreloadedDataProxy
+{
+	using dtype_t = vector;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2;
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize;
+
+	static PreloadedDataProxy create(const uint64_t inputBuf, const uint64_t outputBuf)
+	{
+		PreloadedDataProxy retval;
+		retval.data = DataProxy::create(inputBuf, outputBuf);
+		return retval;
+	}
+
+	template
+	void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
+	{
+		value = preloaded[ix>>WorkgroupSizeLog2];
+	}
+	template
+	void set(const IndexType ix, const AccessType value)
+	{
+		preloaded[ix>>WorkgroupSizeLog2] = value;
+	}
+
+	void preload()
+	{
+		const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+		[unroll]
+		for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+			data.template get(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
+	}
+	void unload()
+	{
+		const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+		[unroll]
+		for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+			data.template set(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+		//glsl::memoryBarrierShared(); implied by the above
+	}
+
+	DataProxy data;
+	dtype_t preloaded[PreloadedDataCount];
+};
+
+}
+}
+
+#endif
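// Why PreloadedDataProxy above can service get(ix) with preloaded[ix >> WorkgroupSizeLog2]:
// element ix = idx*WorkgroupSize + invocationIndex lands in slot idx of the
// invocation with that invocationIndex, so within one invocation
// ix >> WorkgroupSizeLog2 recovers idx exactly. A host-side check of the
// index algebra (illustrative sizes):
#include <cstdint>

constexpr uint32_t WorkgroupSizeLog2 = 8;                 // 256 invocations
constexpr uint32_t WorkgroupSize = 1u << WorkgroupSizeLog2;

constexpr bool mappingHolds()
{
	for (uint32_t idx = 0; idx < 4; idx++)                // e.g. PreloadedDataCount == 4
		for (uint32_t inv = 0; inv < WorkgroupSize; inv++)
			if (((idx * WorkgroupSize + inv) >> WorkgroupSizeLog2) != idx)
				return false;
	return true;
}
static_assert(mappingHolds(), "strided preload must round-trip");
// The idx*WorkgroupSize + invocationIndex stride also means neighbouring
// invocations touch neighbouring addresses in preload()/unload(), keeping the
// global-memory loads and stores coalesced.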