diff --git a/23_ArithmeticUnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt similarity index 100% rename from 23_ArithmeticUnitTest/CMakeLists.txt rename to 23_Arithmetic2UnitTest/CMakeLists.txt diff --git a/23_ArithmeticUnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl similarity index 100% rename from 23_ArithmeticUnitTest/app_resources/common.hlsl rename to 23_Arithmetic2UnitTest/app_resources/common.hlsl diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl new file mode 100644 index 000000000..376f69579 --- /dev/null +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -0,0 +1,68 @@ +#include "common.hlsl" + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +#ifndef ITEMS_PER_INVOCATION +#error "Define ITEMS_PER_INVOCATION!" 
+#endif + +typedef vector type_t; + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG class binop, typename T, uint32_t N> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + // TODO static assert vector == type_t + //using type_t = vector; + using config_t = nbl::hlsl::subgroup2::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t func; + if (canStore()) + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); +} + + +type_t test() +{ + const uint32_t idx = globalIndex(); + type_t sourceVal = inputValue[idx]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} + +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" diff --git a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl similarity index 62% rename from 23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl rename to 23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl index 479265d73..2cc1ccb60 100644 --- a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl @@ -6,7 +6,7 @@ uint32_t globalIndex() { - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); + return 
nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); } bool canStore() {return true;} @@ -14,5 +14,5 @@ bool canStore() {return true;} [numthreads(WORKGROUP_SIZE,1,1)] void main() { - test(); -} \ No newline at end of file + test(); +} diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl new file mode 100644 index 000000000..7f1b5dcbe --- /dev/null +++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl @@ -0,0 +1,85 @@ +#pragma shader_stage(compute) + +#include "workgroupCommon.hlsl" + +template +struct DataProxy +{ + using dtype_t = vector; + static_assert(nbl::hlsl::is_same_v); + + void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + { + value = inputValue[ix]; + } + void set(const uint32_t ix, const dtype_t value) + { + output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; + +static ScratchProxy arithmeticAccessor; + +template +struct operation_t +{ + using binop_base_t = typename Binop::base_t; + using otype_t = typename Binop::type_t; + + void operator()() + { + DataProxy dataAccessor; + nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + } +}; + + +template class binop, typename T, uint32_t N> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t,nbl::hlsl::jit::device_capabilities> func; + func(); // store is done with data accessor now +} + + +type_t test() +{ + const type_t sourceVal = inputValue[globalIndex()]; + + 
subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} + + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() +{ + return nbl::hlsl::workgroup::SubgroupContiguousIndex(); + +typedef vector type_t; + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG&& system) override @@ -138,39 +138,6 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); } - const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; - // enclose to make sure file goes out of scope and we can reopen it - { - smart_refctd_ptr spirv_isa_cache_input; - // try to load SPIR-V to ISA cache - { - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); - if (auto lock=fileCreate.acquire()) - spirv_isa_cache_input = *lock; - } - // create the cache - { - std::span spirv_isa_cache_data = {}; - if (spirv_isa_cache_input) - spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; - else - m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); - // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead - m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); - } - } - { - // TODO: rename `deleteDirectory` to 
just `delete`? and a `IFile::setSize()` ? - m_system->deleteDirectory(spirv_isa_cache_path); - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); - // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. - m_spirv_isa_cache_output=*fileCreate.acquire(); - if (!m_spirv_isa_cache_output) - logFail("Failed to Create SPIR-V to ISA cache file."); - } - // load shader source from file auto getShaderSource = [&](const char* filePath) -> auto { @@ -207,42 +174,35 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) { const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) + for (uint32_t i = 0; i < WorkgroupSizes.size(); i++) { + const uint32_t workgroupSize = WorkgroupSizes[i]; // make sure renderdoc captures everything for debugging m_api->startCapture(); m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); - bool passed = true; - // TODO async the testing - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) + for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++) { + const uint32_t itemsPerInvocation = ItemsPerInvocations[j]; + m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, 
itemsPerInvocation); + bool passed = true; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, ~0u, itemsPerInvocation) && passed; + logTestOutcome(passed, workgroupSize); + + const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? workgroupSize * itemsPerInvocation : itemsPerInvocation * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG, itemsPerInvocation) && passed; logTestOutcome(passed, itemsPerWG); } m_api->endCapture(); - - // save cache every now and then - { - auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - auto bin = cpu->getEntries().begin()->second.bin; - IFile::success_t success; - 
m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); - if (!success) - logFail("Could not write Create SPIR-V to ISA cache to disk!"); - } } } @@ -289,42 +249,89 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu .requireFullSubgroups = true }; core::smart_refctd_ptr pipeline; - if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) + if (!m_device->createComputePipelines(nullptr,{¶ms,1},&pipeline)) return nullptr; return pipeline; } - /*template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - return true; - }*/ - template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u, uint32_t itemsPerInvoc = 1u) { std::string arith_name = Arithmetic>::name; - - smart_refctd_ptr overridenUnspecialized; + const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#else + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; +#endif + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = 
m_logger.get(); + + auto* includeFinder = compiler->getDefaultIncludeFinder(); + includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); + options.preprocessorOptions.includeFinder = includeFinder; + + smart_refctd_ptr overriddenUnspecialized; if constexpr (WorkgroupTest) { - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG - ); + const std::string definitions[5] = { + "workgroup2::" + arith_name, + std::to_string(workgroupSizeLog2), + std::to_string(itemsPerWG), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2) + }; + + const IShaderCompiler::SMacroDefinition defines[5] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE_LOG2", definitions[1] }, + { "ITEMS_PER_WG", definitions[2] }, + { "ITEMS_PER_INVOCATION", definitions[3] }, + { "SUBGROUP_SIZE_LOG2", definitions[4] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } else { - itemsPerWG = workgroupSize; - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", - (("subgroup::") + arith_name).c_str(), workgroupSize - ); + const std::string definitions[4] = { + "subgroup2::" + arith_name, + std::to_string(workgroupSize), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2) + }; + + const IShaderCompiler::SMacroDefinition defines[4] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "ITEMS_PER_INVOCATION", definitions[2] }, + { "SUBGROUP_SIZE_LOG2", definitions[3] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + + overriddenUnspecialized = 
compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); } - auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); + + auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; + uint32_t workgroupCount; + if constexpr (WorkgroupTest) + workgroupCount = elementCount / itemsPerWG; + else + { + itemsPerWG = workgroupSize; + workgroupCount = elementCount / (itemsPerWG * itemsPerInvoc); + } cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); cmdbuf->bindComputePipeline(pipeline.get()); cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); @@ -359,22 +366,20 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu m_device->blockForSemaphores(wait); // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - if constexpr (WorkgroupTest) - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, 
itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, itemsPerInvoc) && passed; return passed; } //returns true if result matches template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) + bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t itemsPerInvoc) { bool success = true; @@ -394,47 +399,64 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu const auto testData = reinterpret_cast(dataFromBuffer + 1); // TODO: parallel for (the temporary values need to be threadlocal or what?) // now check if the data obtained has valid values - type_t* tmp = new type_t[itemsPerWG]; - type_t* ballotInput = new type_t[itemsPerWG]; + type_t* tmp; + if constexpr (WorkgroupTest) + tmp = new type_t[itemsPerWG]; + else + tmp = new type_t[itemsPerWG * itemsPerInvoc]; for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) { - const auto workgroupOffset = workgroupID * itemsPerWG; - if constexpr (WorkgroupTest) { - if constexpr (std::is_same_v, Binop>) + const auto workgroupOffset = workgroupID * itemsPerWG; + Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) { - for (auto i = 0u; i < itemsPerWG; i++) - ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - Arithmetic::impl(tmp, ballotInput, itemsPerWG); + const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; + const auto cpuVal = tmp[localInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d 
(%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex + ); + success = false; + break; + } } - else - Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); } else { + const auto workgroupOffset = workgroupID * itemsPerWG * itemsPerInvoc; for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); - } + Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); - for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) - { - const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; - const auto cpuVal = tmp[localInvocationIndex]; - const auto gpuVal = testData[globalInvocationIndex]; - if (cpuVal != gpuVal) + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) { - m_logger->log( - "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", - ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? 
"workgroup" : "subgroup", Binop::name, - cpuVal, gpuVal, workgroupID, localInvocationIndex - ); - success = false; - break; + const auto localOffset = localInvocationIndex * itemsPerInvoc; + const auto globalInvocationIndex = workgroupOffset + localOffset; + + for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++) + { + const auto cpuVal = tmp[localOffset + itemInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex + ); + success = false; + break; + } + } } } } - delete[] ballotInput; delete[] tmp; return success; @@ -442,8 +464,6 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu IQueue* transferDownQueue; IQueue* computeQueue; - smart_refctd_ptr m_spirv_isa_cache; - smart_refctd_ptr m_spirv_isa_cache_output; uint32_t* inputData = nullptr; constexpr static inline uint32_t OutputBufferCount = 8u; @@ -457,6 +477,9 @@ class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueu smart_refctd_ptr resultsBuffer; uint32_t totalFailCount = 0; + + constexpr static inline std::array WorkgroupSizes = { 32, 256, 512, 1024 }; + constexpr static inline std::array ItemsPerInvocations = { 1, 2, 4 }; }; -NBL_MAIN_FUNC(ArithmeticUnitTestApp) \ No newline at end of file +NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file diff --git a/23_ArithmeticUnitTest/pipeline.groovy b/23_Arithmetic2UnitTest/pipeline.groovy similarity index 100% rename from 23_ArithmeticUnitTest/pipeline.groovy rename to 23_Arithmetic2UnitTest/pipeline.groovy diff --git a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl 
b/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl deleted file mode 100644 index 13ee8d21e..000000000 --- a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl +++ /dev/null @@ -1,55 +0,0 @@ -#include "common.hlsl" - -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" - -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; - -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG class binop> -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -{ - if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; - if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); -} - - -type_t test() -{ - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; -} - -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file diff --git a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl deleted file mode 100644 index 9bafae47f..000000000 --- a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl +++ 
/dev/null @@ -1,107 +0,0 @@ -#pragma shader_stage(compute) - - -#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" - -static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; -static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; -static const uint32_t ScratchSz = ArithmeticSz+BallotSz; - -// TODO: Can we make it a static variable in the ScratchProxy struct? -groupshared uint32_t scratch[ScratchSz]; - - -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" - - -template -struct ScratchProxy -{ - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) - { - value = scratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - scratch[ix+offset] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; - -static ScratchProxy<0> arithmeticAccessor; - - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - - -template -struct operation_t -{ - using type_t = typename Binop::type_t; - - type_t operator()(type_t value) - { - type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - return retval; - } -}; - - -#include "shaderCommon.hlsl" - -static ScratchProxy ballotAccessor; - - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() -{ - return nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - // we can only ballot booleans, so low bit - nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); - // need to barrier 
between ballot and usages of a ballot by myself - ballotAccessor.workgroupExecutionAndMemoryBarrier(); - - uint32_t destVal = 0xdeadbeefu; -#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value -#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities - if (CONSTEXPR_OP_TYPE_TEST(reduction)) - destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); - else - { - assert(false); - } -#undef BALLOT_TEMPLATE_ARGS -#undef CONSTEXPR_OP_TYPE_TEST - - if (canStore()) - output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); -} \ No newline at end of file diff --git a/74_Arithmetic2Bench/CMakeLists.txt b/74_Arithmetic2Bench/CMakeLists.txt new file mode 100644 index 000000000..0724366c9 --- /dev/null +++ b/74_Arithmetic2Bench/CMakeLists.txt @@ -0,0 +1,25 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl new file mode 100644 index 000000000..2f575d39a --- /dev/null +++ b/74_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -0,0 +1,54 @@ +#pragma shader_stage(compute) + +#define operation_t nbl::hlsl::OPERATION + +#include "shaderCommon.hlsl" + +// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders +[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() {return true;} + +#ifndef NUM_LOOPS +#error "Define NUM_LOOPS!" 
+#endif + +template class binop, typename T, uint32_t N> +static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + using config_t = nbl::hlsl::subgroup2::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + type_t value = sourceVal; + + operation_t func; + // [unroll] + for (uint32_t i = 0; i < NUM_LOOPS; i++) + value = func(value); + + output[binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * globalIndex(), value); +} + +void benchmark() +{ + const uint32_t idx = globalIndex(); + type_t sourceVal = inputValue[idx]; + + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); +} + +[numthreads(WORKGROUP_SIZE,1,1)] +void main() +{ + benchmark(); +} diff --git a/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl new file mode 100644 index 000000000..ac6ea7fd8 --- /dev/null +++ b/74_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -0,0 +1,95 @@ +#pragma shader_stage(compute) + +#include "workgroupCommon.hlsl" + +// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders +[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy + +template +struct DataProxy +{ + using dtype_t = vector; + static_assert(nbl::hlsl::is_same_v); + + // we don't want to write/read storage multiple times in loop; doesn't seem optimized out in generated spirv + void get(const uint32_t ix, NBL_REF_ARG(dtype_t) value) + { + // value = inputValue[ix]; + value = nbl::hlsl::promote(globalIndex()); + } + void set(const uint32_t ix, const dtype_t value) + { + // output[Binop::BindingIndex].template Store(sizeof(uint32_t) + sizeof(type_t) * ix, value); + } + + void workgroupExecutionAndMemoryBarrier() + { + nbl::hlsl::glsl::barrier(); + 
//nbl::hlsl::glsl::memoryBarrierShared(); implied by the above + } +}; + +static ScratchProxy arithmeticAccessor; + +template +struct operation_t +{ + using binop_base_t = typename Binop::base_t; + using otype_t = typename Binop::type_t; + + void operator()() + { + DataProxy dataAccessor; + nbl::hlsl::OPERATION::template __call, ScratchProxy>(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); + } +}; + +#ifndef NUM_LOOPS +#error "Define NUM_LOOPS!" +#endif + +template class binop, typename T, uint32_t N> +static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t,nbl::hlsl::jit::device_capabilities> func; + // TODO separate out store/load from DataProxy? so we don't do too many RW in benchmark + for (uint32_t i = 0; i < NUM_LOOPS; i++) + func(); // store is done with data accessor now +} + + +type_t benchmark() +{ + const type_t sourceVal = inputValue[globalIndex()]; + + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + subbench(sourceVal); + return sourceVal; +} + + +uint32_t globalIndex() +{ + return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); +} + +bool canStore() +{ + return nbl::hlsl::workgroup::SubgroupContiguousIndex() +struct Output +{ + NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; + + uint32_t subgroupSize; + uint32_t data[ScanElementCount]; +}; + +template +struct bit_and : nbl::hlsl::bit_and +{ + using base_t = nbl::hlsl::bit_and; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_and"; +#endif +}; +template +struct bit_or : nbl::hlsl::bit_or +{ + using base_t = 
nbl::hlsl::bit_or; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_xor"; +#endif +}; +template +struct bit_xor : nbl::hlsl::bit_xor +{ + using base_t = nbl::hlsl::bit_xor; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 2; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bit_or"; +#endif +}; +template +struct plus : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 3; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "plus"; +#endif +}; +template +struct multiplies : nbl::hlsl::multiplies +{ + using base_t = nbl::hlsl::multiplies; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 4; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "multiplies"; +#endif +}; +template +struct minimum : nbl::hlsl::minimum +{ + using base_t = nbl::hlsl::minimum; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 5; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "minimum"; +#endif +}; +template +struct maximum : nbl::hlsl::maximum +{ + using base_t = nbl::hlsl::maximum; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 6; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "maximum"; +#endif +}; + +template +struct ballot : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 7; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "bitcount"; +#endif +}; + +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" diff --git a/74_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/74_Arithmetic2Bench/app_resources/shaderCommon.hlsl new file mode 100644 index 000000000..376f69579 --- /dev/null +++ b/74_Arithmetic2Bench/app_resources/shaderCommon.hlsl @@ -0,0 +1,68 @@ +#include "common.hlsl" + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include 
"nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +#ifndef ITEMS_PER_INVOCATION +#error "Define ITEMS_PER_INVOCATION!" +#endif + +typedef vector type_t; + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG class binop, typename T, uint32_t N> +static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + // TODO static assert vector == type_t + //using type_t = vector; + using config_t = nbl::hlsl::subgroup2::Configuration; + using params_t = nbl::hlsl::subgroup2::ArithmeticParams::base_t, N, nbl::hlsl::jit::device_capabilities>; + + if (globalIndex()==0u) + output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); + + operation_t func; + if (canStore()) + output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); +} + + +type_t test() +{ + const uint32_t idx = globalIndex(); + type_t sourceVal = inputValue[idx]; + + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + subtest(sourceVal); + return sourceVal; +} + +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" diff --git a/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl new file mode 100644 index 
000000000..026687cfa --- /dev/null +++ b/74_Arithmetic2Bench/app_resources/workgroupCommon.hlsl @@ -0,0 +1,69 @@ +#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" + +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" + +#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/subgroup/basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" + +#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" + +#include "common.hlsl" + +static const uint32_t WORKGROUP_SIZE = 1u << WORKGROUP_SIZE_LOG2; + +// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 +uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} + +#ifndef ITEMS_PER_INVOCATION +#error "Define ITEMS_PER_INVOCATION!" +#endif + +using config_t = nbl::hlsl::workgroup2::Configuration; + +typedef vector type_t; + +// unfortunately DXC chokes on descriptors as static members +// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 +[[vk::binding(0, 0)]] StructuredBuffer inputValue; +[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; + +// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way +uint32_t globalIndex(); +// since we test ITEMS_PER_WG +struct emulatedReduction +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); + std::fill(out,out+itemCount,red); + } + + static inline constexpr const char* name = "reduction"; +}; +template +struct emulatedScanInclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::inclusive_scan(in,in+itemCount,out,Binop()); + } + static inline constexpr const char* name = "inclusive_scan"; +}; +template +struct 
emulatedScanExclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); + } + static inline constexpr const char* name = "exclusive_scan"; +}; + +// NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders +class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = examples::SimpleWindowedApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + + constexpr static inline uint32_t WIN_W = 1280; + constexpr static inline uint32_t WIN_H = 720; + constexpr static inline uint32_t MaxFramesInFlight = 5; + +public: + ArithmeticBenchApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + inline core::vector getSurfaces() const override + { + if (!m_surface) + { + { + auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger)); + IWindow::SCreationParams params = {}; + params.callback = core::make_smart_refctd_ptr(); + params.width = WIN_W; + params.height = WIN_H; + params.x = 32; + params.y = 32; + params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE; + params.windowCaption = "ArithmeticBenchApp"; + params.callback = windowCallback; + const_cast&>(m_window) = m_winMgr->createWindow(std::move(params)); + } + + auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window)); + const_cast&>(m_surface) = CSimpleResizeSurface::create(std::move(surface)); + } + + if (m_surface) + 
return { {m_surface->getSurface()/*,EQF_NONE*/} }; + + return {}; + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger))); + + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() }; + if (!swapchainParams.deduceFormat(m_physicalDevice)) + return logFail("Could not choose a Surface Format for the Swapchain!"); + + auto graphicsQueue = getGraphicsQueue(); + if (!m_surface || !m_surface->init(graphicsQueue, std::make_unique(), swapchainParams.sharedParams)) + return logFail("Could not create Window & Surface or initialize the Surface!"); + + auto pool = m_device->createCommandPool(graphicsQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + return logFail("Couldn't create Command Buffer!"); + } + + m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H); + m_surface->recreateSwapchain(); + + transferDownQueue = getTransferDownQueue(); + computeQueue = getComputeQueue(); + + // TODO: get the element count from argv + const uint32_t elementCount = Output<>::ScanElementCount; + // populate our random data buffer on the CPU and create a GPU copy + inputData = new uint32_t[elementCount]; + smart_refctd_ptr gpuinputDataBuffer; + { + std::mt19937 randGenerator(0xdeadbeefu); + for (uint32_t i = 0u; i < elementCount; i++) + inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the 
input buffer at all + + IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{.queue=getTransferUpQueue()}, + std::move(inputDataBufferCreationParams), + inputData + ).move_into(gpuinputDataBuffer); + } + + // create 8 buffers for 8 operations + for (auto i=0u; igetSize(); + params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; + + outputBuffers[i] = m_device->createBuffer(std::move(params)); + auto mreq = outputBuffers[i]->getMemoryReqs(); + mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + assert(mreq.memoryTypeBits); + + auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get()); + assert(bufferMem.isValid()); + } + + // create dummy image + dummyImg = m_device->createImage({ + { + .type = IGPUImage::ET_2D, + .samples = asset::ICPUImage::ESCF_1_BIT, + .format = asset::EF_R16G16B16A16_SFLOAT, + .extent = {WIN_W, WIN_H, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .flags = IImage::ECF_NONE, + .usage = core::bitflag(asset::IImage::EUF_STORAGE_BIT) | asset::IImage::EUF_TRANSFER_SRC_BIT + } + }); + if (!dummyImg || !m_device->allocate(dummyImg->getMemoryReqs(), dummyImg.get()).isValid()) + return logFail("Could not create HDR Image"); + + // create Descriptor Sets and Pipeline Layouts + smart_refctd_ptr benchPplnLayout; + { + // create Descriptor Set Layout + smart_refctd_ptr dsLayout; + { + IGPUDescriptorSetLayout::SBinding binding[2]; + for (uint32_t i = 0u; i < 2; i++) + binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + binding[1].count = OutputBufferCount; + dsLayout = 
m_device->createDescriptorSetLayout(binding); + } + + // set and transient pool + smart_refctd_ptr benchLayout; + { + IGPUDescriptorSetLayout::SBinding binding[3]; + for (uint32_t i = 0u; i < 2; i++) + binding[i] = { {},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + binding[1].count = OutputBufferCount; + binding[2] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; + benchLayout = m_device->createDescriptorSetLayout(binding); + } + + benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }); + benchDs = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout)); + { + IGPUDescriptorSet::SDescriptorInfo infos[1 + OutputBufferCount]; + infos[0].desc = gpuinputDataBuffer; + infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; + for (uint32_t i = 1u; i <= OutputBufferCount; i++) + { + auto buff = outputBuffers[i - 1]; + infos[i].info.buffer = { 0u,buff->getSize() }; + infos[i].desc = std::move(buff); // save an atomic in the refcount + } + // write swapchain image descriptor in loop + + IGPUDescriptorSet::SWriteDescriptorSet writes[2]; + for (uint32_t i = 0u; i < 2; i++) + writes[i] = { benchDs.get(),i,0u,1u,infos + i }; + writes[1].count = OutputBufferCount; + + m_device->updateDescriptorSets(2, writes, 0u, nullptr); + } + benchPplnLayout = m_device->createPipelineLayout({}, std::move(benchLayout)); + } + + // load shader source from file + auto getShaderSource = [&](const char* filePath) -> auto + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = m_logger.get(); + lparams.workingDirectory = ""; + auto bundle = m_assetMgr->getAsset(filePath, lparams); + if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) + { + 
m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); + exit(-1); + } + auto firstAssetInBundle = bundle.getContents()[0]; + return smart_refctd_ptr_static_cast(firstAssetInBundle); + }; + + auto subgroupBenchSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl"); + auto workgroupBenchSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl"); + // now create or retrieve final resources to run our tests + sema = m_device->createSemaphore(timelineValue); + resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); + smart_refctd_ptr cmdbuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + // const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + + // for each workgroup size (manually adjust items per invoc, operation else uses up a lot of ram) + if constexpr (DoWorkgroupBenchmarks) + { + for (uint32_t i = 0; i < workgroupSizes.size(); i++) + benchSets[i] = createBenchmarkPipelines(workgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + } + else + { + for (uint32_t i = 0; i < workgroupSizes.size(); i++) + benchSets[i] = createBenchmarkPipelines(subgroupBenchSource, benchPplnLayout.get(), elementCount, hlsl::findMSB(MinSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops); + } + + m_winMgr->show(m_window.get()); + + return true; + } + + virtual bool onAppTerminated() override + { + delete[] inputData; + return true; + } + + // the unit test is carried out on init 
+ void workLoopBody() override + { + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight()); + + if (m_realFrameIx >= framesInFlight) + { + const ISemaphore::SWaitInfo cbDonePending[] = + { + { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + 1 - framesInFlight + } + }; + if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) + return; + } + + m_currentImageAcquire = m_surface->acquireNextImage(); + if (!m_currentImageAcquire) + return; + + auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get(); + cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + // barrier transition to GENERAL + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS + } + }; + imageBarriers[0].image = dummyImg.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[0].newLayout = IImage::LAYOUT::GENERAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + // bind dummy image + IGPUImageView::SCreationParams viewParams = { + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT, + .image = dummyImg, + .viewType = IGPUImageView::ET_2D, + .format = dummyImg->getCreationParameters().format + }; + auto dummyImgView = m_device->createImageView(std::move(viewParams)); + + video::IGPUDescriptorSet::SDescriptorInfo dsInfo; + 
dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL; + dsInfo.desc = dummyImgView; + + IGPUDescriptorSet::SWriteDescriptorSet dsWrites[1u] = + { + { + .dstSet = benchDs.get(), + .binding = 2u, + .arrayElement = 0u, + .count = 1u, + .info = &dsInfo, + } + }; + m_device->updateDescriptorSets(1u, dsWrites, 0u, nullptr); + + const uint32_t elementCount = Output<>::ScanElementCount; + const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + + const auto SubgroupSizeLog2 = hlsl::findMSB(MinSubgroupSize); + + cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs.get()); + + for (uint32_t i = 0; i < benchSets.size(); i++) + runBenchmark(cmdbuf, benchSets[i], elementCount, SubgroupSizeLog2); + + + // blit + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[2]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT + } + }; + imageBarriers[0].image = dummyImg.get(); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[0].newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL; + + imageBarriers[1].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, + .srcAccessMask = ACCESS_FLAGS::NONE, + .dstStageMask = PIPELINE_STAGE_FLAGS::BLIT_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT + } + }; + imageBarriers[1].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); + imageBarriers[1].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + 
.levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[1].oldLayout = IImage::LAYOUT::UNDEFINED; + imageBarriers[1].newLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + { + IGPUCommandBuffer::SImageBlit regions[] = { { + .srcMinCoord = {0,0,0}, + .srcMaxCoord = {WIN_W,WIN_H,1}, + .dstMinCoord = {0,0,0}, + .dstMaxCoord = {WIN_W,WIN_H,1}, + .layerCount = 1, + .srcBaseLayer = 0, + .dstBaseLayer = 0, + .srcMipLevel = 0, + .dstMipLevel = 0, + .aspectMask = IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT + } }; + + auto srcImg = dummyImg.get(); + auto scRes = static_cast(m_surface->getSwapchainResources()); + auto dstImg = scRes->getImage(m_currentImageAcquire.imageIndex); + + cmdbuf->blitImage(srcImg, IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, dstImg, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, regions, ISampler::ETF_NEAREST); + } + + // barrier transition to PRESENT + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1]; + imageBarriers[0].barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::NONE, + .dstAccessMask = ACCESS_FLAGS::NONE + } + }; + imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex); + imageBarriers[0].subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .baseMipLevel = 0u, + .levelCount = 1u, + .baseArrayLayer = 0u, + .layerCount = 1u + }; + imageBarriers[0].oldLayout = IImage::LAYOUT::TRANSFER_DST_OPTIMAL; + imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC; + + cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers }); + } + + cmdbuf->end(); + + // submit + { + auto* queue = getGraphicsQueue(); + const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] = + { + { + .semaphore = m_semaphore.get(), + 
.value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + } + }; + { + { + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cmdbuf } + }; + + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = + { + { + .semaphore = m_currentImageAcquire.semaphore, + .value = m_currentImageAcquire.acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = rendered + } + }; + + if (queue->submit(infos) == IQueue::RESULT::SUCCESS) + { + const nbl::video::ISemaphore::SWaitInfo waitInfos[] = + { { + .semaphore = m_semaphore.get(), + .value = m_realFrameIx + } }; + + m_device->blockForSemaphores(waitInfos); // this is not solution, quick wa to not throw validation errors + } + else + --m_realFrameIx; + } + } + + std::string caption = "[Nabla Engine] Geometry Creator"; + { + caption += ", displaying [all objects]"; + m_window->setCaption(caption); + } + m_surface->present(m_currentImageAcquire.imageIndex, rendered); + } + + numSubmits++; + } + + // + bool keepRunning() override { return numSubmits < MaxNumSubmits; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + totalFailCount++; + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + // create pipeline (specialized every test) [TODO: turn into a future/async] + smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2) + { + auto shader = m_device->createShader(overridenUnspecialized); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout; + params.shader = { + .entryPoint = "main", + .shader = shader.get(), + .entries = nullptr, + .requiredSubgroupSize = static_cast(subgroupSizeLog2), + 
.requireFullSubgroups = true + }; + core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(nullptr,{¶ms,1},&pipeline)) + return nullptr; + return pipeline; + } + + struct BenchmarkSet + { + smart_refctd_ptr pipeline; + uint32_t workgroupSize; + uint32_t itemsPerInvocation; + }; + + template class Arithmetic, bool WorkgroupBench> + BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr&source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u) + { + std::string arith_name = Arithmetic>::name; + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#else + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; +#endif + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + + auto* includeFinder = compiler->getDefaultIncludeFinder(); + includeFinder->addSearchPath("nbl/builtin/hlsl/jit", core::make_smart_refctd_ptr(m_physicalDevice->getLimits(), m_device->getEnabledFeatures())); + options.preprocessorOptions.includeFinder = includeFinder; + + const uint32_t subgroupSize = 0x1u << subgroupSizeLog2; + const uint32_t itemsPerWG = workgroupSize <= subgroupSize ? 
workgroupSize * itemsPerInvoc : itemsPerInvoc * max(workgroupSize >> subgroupSizeLog2, subgroupSize) << subgroupSizeLog2; // TODO use Config somehow + smart_refctd_ptr overriddenUnspecialized; + if constexpr (WorkgroupBench) + { + const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + const std::string definitions[6] = { + "workgroup2::" + arith_name, + std::to_string(workgroupSizeLog2), + std::to_string(itemsPerWG), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2), + std::to_string(numLoops) + }; + + const IShaderCompiler::SMacroDefinition defines[6] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE_LOG2", definitions[1] }, + { "ITEMS_PER_WG", definitions[2] }, + { "ITEMS_PER_INVOCATION", definitions[3] }, + { "SUBGROUP_SIZE_LOG2", definitions[4] }, + { "NUM_LOOPS", definitions[5] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 6 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + else + { + const std::string definitions[5] = { + "subgroup2::" + arith_name, + std::to_string(workgroupSize), + std::to_string(itemsPerInvoc), + std::to_string(subgroupSizeLog2), + std::to_string(numLoops) + }; + + const IShaderCompiler::SMacroDefinition defines[5] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "ITEMS_PER_INVOCATION", definitions[2] }, + { "SUBGROUP_SIZE_LOG2", definitions[3] }, + { "NUM_LOOPS", definitions[4] } + }; + options.preprocessorOptions.extraDefines = { defines, defines + 5 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + + BenchmarkSet set; + set.pipeline = createPipeline(overriddenUnspecialized.get(), layout, subgroupSizeLog2); + if constexpr (WorkgroupBench) + { + set.workgroupSize = itemsPerWG; + } + else + { + set.workgroupSize = workgroupSize; + } + set.itemsPerInvocation = itemsPerInvoc; + + return set; + }; + + 
/* Records one benchmark pass into `cmdbuf`: binds `set.pipeline`, dispatches `workgroupCount` workgroups along X (1 in Y and Z), then records a pipeline barrier carrying one buffer barrier per entry of `outputBuffers`. `WorkgroupBench` selects at compile time how the workgroup count is derived from `elementCount`. `subgroupSizeLog2` is accepted but not read in this body. NOTE(review): `template` appears here without a parameter list although the body tests `if constexpr (WorkgroupBench)`; presumably a parameter list declaring `WorkgroupBench` is missing from this text, confirm against the real source. */ template + void runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2) + { + uint32_t workgroupCount; + /* Workgroup benchmarks divide by `set.workgroupSize` alone; the pipeline-creation code earlier in this file stores `itemsPerWG` in that field for this mode. Otherwise the divisor is `set.workgroupSize * set.itemsPerInvocation`. NOTE(review): assuming integral operands the division truncates, so `elementCount` presumably needs to be a multiple of the divisor; confirm at the call site. */ if constexpr (WorkgroupBench) + workgroupCount = elementCount / set.workgroupSize; + else + workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation); + + cmdbuf->bindComputePipeline(set.pipeline.get()); + cmdbuf->dispatch(workgroupCount, 1, 1); + /* One buffer barrier per output buffer, each initialized with 0, the buffer's getSize() and the buffer itself as its range. Source side: compute-shader stage with shader-write access. Destination side: compute-shader and host stages with shader-write and host-read access. */ { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; + for (auto i = 0u; i < OutputBufferCount; i++) + { + memoryBarrier[i] = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + // in theory we don't need the HOST BITS cause we block on a semaphore but might as well add them + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT | PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS | ACCESS_FLAGS::HOST_READ_BIT + } + }, + .range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]} + }; + } + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier }; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info); + } + } + + IQueue* transferDownQueue; + IQueue* computeQueue; + + /* NOTE(review): several member types below (`smart_refctd_ptr`, `std::array, MaxFramesInFlight>`) appear without their template arguments in this text; confirm the full types against the real source. */ smart_refctd_ptr m_window; + smart_refctd_ptr> m_surface; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {}; + + smart_refctd_ptr m_inputSystem; + + smart_refctd_ptr dummyImg; + + constexpr static inline uint32_t MaxNumSubmits = 30; + uint32_t numSubmits = 0; + + /* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */ + + constexpr static inline bool DoWorkgroupBenchmarks = true; + uint32_t ItemsPerInvocation = 4u; + constexpr static inline uint32_t NumLoops = 1000u; + constexpr static inline uint32_t 
NumBenchmarks = 6u; + /* Six workgroup sizes are listed here, the same count as NumBenchmarks (6u). */ constexpr static inline std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 }; + template + using ArithmeticOp = emulatedReduction; // change this to test other arithmetic ops + + std::array benchSets; + smart_refctd_ptr benchPool; + smart_refctd_ptr benchDs; + + uint32_t* inputData = nullptr; + /* Sizes the `outputBuffers` array declared on the next line. */ constexpr static inline uint32_t OutputBufferCount = 8u; + smart_refctd_ptr outputBuffers[OutputBufferCount]; + + smart_refctd_ptr sema; + uint64_t timelineValue = 0; + smart_refctd_ptr resultsBuffer; + + uint32_t totalFailCount = 0; +}; + +NBL_MAIN_FUNC(ArithmeticBenchApp) \ No newline at end of file diff --git a/74_Arithmetic2Bench/pipeline.groovy b/74_Arithmetic2Bench/pipeline.groovy new file mode 100644 index 000000000..7ea9947e0 --- /dev/null +++ b/74_Arithmetic2Bench/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +/* Builder for this example: build() runs `cmake --build` for the target, while prepare(), test() and install() do nothing and return true. NOTE(review): the class name says "UnitTest" (and spells "Arithemtic") although this file lives under 74_Arithmetic2Bench; presumably it was copied from the unit-test example, confirm whether a rename is wanted. */ class CArithemticUnitTestBuilder extends IBuilder +{ + /* Forwards both arguments unchanged to the IBuilder constructor. */ public CArithemticUnitTestBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + /* Reads the "CONFIGURATION" and "BUILD_TYPE" entries of `axisMapping`, maps them through getNameOfBuildDirectory / getNameOfConfig, and runs `cmake --build` for `info.targetBaseName` with -j12 -v. Always returns true; the result of agent.execute is not inspected here. */ public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +/* Factory: returns a new CArithemticUnitTestBuilder constructed from `_agent` and `_info`. */ def create(Agent _agent, _info) +{ + return new CArithemticUnitTestBuilder(_agent, _info) +} + +return this \ No newline at end of file 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b3279a48..ed3992203 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL) add_subdirectory(21_LRUCacheUnitTest EXCLUDE_FROM_ALL) add_subdirectory(22_CppCompat EXCLUDE_FROM_ALL) - add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL) + add_subdirectory(23_Arithmetic2UnitTest EXCLUDE_FROM_ALL) add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL) add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL) add_subdirectory(26_Blur EXCLUDE_FROM_ALL) @@ -91,5 +91,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids EXCLUDE_FROM_ALL) add_subdirectory(71_RayTracingPipeline EXCLUDE_FROM_ALL) + add_subdirectory(74_Arithmetic2Bench EXCLUDE_FROM_ALL) + NBL_HOOK_COMMON_API("${NBL_COMMON_API_TARGETS}") endif()