diff --git a/11_FFT/app_resources/shader.comp.hlsl b/11_FFT/app_resources/shader.comp.hlsl index ecbf4f092..63a85b0c4 100644 --- a/11_FFT/app_resources/shader.comp.hlsl +++ b/11_FFT/app_resources/shader.comp.hlsl @@ -14,13 +14,13 @@ uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(uint32_t(ConstevalParamete struct SharedMemoryAccessor { - template + template void set(IndexType idx, AccessType value) { sharedmem[idx] = value; } - template + template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = sharedmem[idx]; @@ -44,14 +44,14 @@ struct Accessor } // TODO: can't use our own BDA yet, because it doesn't support the types `workgroup::FFT` will invoke these templates with - template - void get(const uint32_t index, NBL_REF_ARG(AccessType) value) + template + void get(const IndexType index, NBL_REF_ARG(AccessType) value) { value = vk::RawBufferLoad(address + index * sizeof(AccessType)); } - template - void set(const uint32_t index, const AccessType value) + template + void set(const IndexType index, const AccessType value) { vk::RawBufferStore(address + index * sizeof(AccessType), value); } diff --git a/23_ArithmeticUnitTest/CMakeLists.txt b/23_Arithmetic2UnitTest/CMakeLists.txt similarity index 100% rename from 23_ArithmeticUnitTest/CMakeLists.txt rename to 23_Arithmetic2UnitTest/CMakeLists.txt diff --git a/23_ArithmeticUnitTest/app_resources/common.hlsl b/23_Arithmetic2UnitTest/app_resources/common.hlsl similarity index 89% rename from 23_ArithmeticUnitTest/app_resources/common.hlsl rename to 23_Arithmetic2UnitTest/app_resources/common.hlsl index 10892a2b9..6654645cf 100644 --- a/23_ArithmeticUnitTest/app_resources/common.hlsl +++ b/23_Arithmetic2UnitTest/app_resources/common.hlsl @@ -1,15 +1,14 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/functional.hlsl" -template -struct Output +struct PushConstantData { - NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanElementCount = kScanElementCount; - - uint32_t subgroupSize; - uint32_t data[ScanElementCount]; + uint64_t pInputBuf; + uint64_t pOutputBuf[8]; }; +namespace arithmetic +{ // Thanks to our unified HLSL/C++ STD lib we're able to remove a whole load of code template struct bit_and : nbl::hlsl::bit_and @@ -92,5 +91,6 @@ struct ballot : nbl::hlsl::plus static inline constexpr const char* name = "bitcount"; #endif }; +} -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" \ No newline at end of file +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" diff --git a/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl new file mode 100644 index 000000000..3793b08f8 --- /dev/null +++ b/23_Arithmetic2UnitTest/app_resources/shaderCommon.hlsl @@ -0,0 +1,19 @@ +#include "common.hlsl" + +using namespace nbl; +using namespace hlsl; + +[[vk::push_constant]] PushConstantData pc; + +struct device_capabilities +{ +#ifdef TEST_NATIVE + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true; +#else + NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false; +#endif +}; + +#ifndef OPERATION +#error "Define OPERATION!" 
+#endif
diff --git a/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
new file mode 100644
index 000000000..3105aec56
--- /dev/null
+++ b/23_Arithmetic2UnitTest/app_resources/testSubgroup.comp.hlsl
@@ -0,0 +1,55 @@
+#pragma shader_stage(compute)
+
+#define operation_t nbl::hlsl::OPERATION
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
+
+#include "shaderCommon.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
+
+template
+using params_t = SUBGROUP_CONFIG_T;
+
+typedef vector::base_t, device_capabilities>::ItemsPerInvocation> type_t;
+
+uint32_t globalIndex()
+{
+    return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex();
+}
+
+template
+static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal)
+{
+    const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex];
+
+    assert(glsl::gl_SubgroupSize() == params_t::config_t::Size);
+
+    operation_t > func;
+    type_t val = func(sourceVal);
+
+    vk::RawBufferStore(outputBufAddr + sizeof(type_t) * globalIndex(), val, sizeof(uint32_t));
+}
+
+type_t test()
+{
+    const uint32_t idx = globalIndex();
+    type_t sourceVal = vk::RawBufferLoad(pc.pInputBuf + idx * sizeof(type_t));
+
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    subtest >(sourceVal);
+    return sourceVal;
+}
+
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
+{
+    test();
+}
diff --git a/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
new file mode 100644
index 000000000..2a32ed20e
--- /dev/null
+++ b/23_Arithmetic2UnitTest/app_resources/testWorkgroup.comp.hlsl
@@ -0,0 +1,74 @@
+#pragma shader_stage(compute)
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
+
+using config_t = WORKGROUP_CONFIG_T;
+
+#include "shaderCommon.hlsl"
+
+typedef vector type_t;
+
+// final (level 1/2) scan needs to fit in one subgroup exactly
+groupshared uint32_t scratch[mpl::max_v];
+
+#include "../../common/include/WorkgroupDataAccessors.hlsl"
+
+static ScratchProxy arithmeticAccessor;
+
+template
+struct operation_t
+{
+    using binop_base_t = typename Binop::base_t;
+    using otype_t = typename Binop::type_t;
+
+    // workgroup reduction returns the value of the reduction
+    // workgroup scans do not return anything, but use the data accessor to do the storing directly
+    void operator()()
+    {
+        using data_proxy_t = PreloadedDataProxy;
+        data_proxy_t dataAccessor = data_proxy_t::create(pc.pInputBuf, pc.pOutputBuf[Binop::BindingIndex]);
+        dataAccessor.preload();
+#if IS_REDUCTION
+        otype_t value =
+#endif
+        OPERATION::template __call(dataAccessor,arithmeticAccessor);
+        // we barrier before because we alias the accessors for Binop
+        arithmeticAccessor.workgroupExecutionAndMemoryBarrier();
+#if IS_REDUCTION
+        [unroll]
+        for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++)
+            dataAccessor.preloaded[i] = value;
+#endif
+        dataAccessor.unload();
+    }
+};
+
+
+template
+static void subtest()
+{
+    assert(glsl::gl_SubgroupSize() == config_t::SubgroupSize);
+
+    operation_t func;
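Note on the accessor changes in this patch (the FFT shared-memory accessor, the BDA `Accessor`, and the preloaded accessors all get the same treatment): `get`/`set` are now templated on an `IndexType` in addition to the `AccessType`, so `workgroup::FFT` and the `subgroup2`/`workgroup2` primitives can index with whatever integer width they instantiate, without narrowing at the call site. A minimal CPU-side analogue of that contract, with `memcpy` standing in for `vk::RawBufferLoad`/`vk::RawBufferStore` (the struct name is hypothetical, not a Nabla type):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Minimal CPU analogue of the accessor contract: get/set are templated on BOTH
// the value type and the index type, so a caller may index with uint16_t,
// uint32_t or uint64_t and the offset math still happens in the index's type.
struct BdaLikeAccessor // hypothetical name
{
    template<typename AccessType, typename IndexType>
    void get(const IndexType index, AccessType& value) const
    {
        std::memcpy(&value, bytes.data() + index * sizeof(AccessType), sizeof(AccessType));
    }
    template<typename AccessType, typename IndexType>
    void set(const IndexType index, const AccessType value)
    {
        std::memcpy(bytes.data() + index * sizeof(AccessType), &value, sizeof(AccessType));
    }
    std::vector<std::byte> bytes;
};

int main()
{
    BdaLikeAccessor a{std::vector<std::byte>(64)};
    a.set(uint16_t(3), 42u); // 16-bit index, 32-bit payload
    uint32_t v = 0;
    a.get(uint16_t(3), v);   // v == 42
}
```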
+ func(); +} + +void test() +{ + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); + subtest >(); +} + +[numthreads(config_t::WorkgroupSize,1,1)] +void main() +{ + test(); +} \ No newline at end of file diff --git a/23_ArithmeticUnitTest/config.json.template b/23_Arithmetic2UnitTest/config.json.template similarity index 100% rename from 23_ArithmeticUnitTest/config.json.template rename to 23_Arithmetic2UnitTest/config.json.template diff --git a/23_Arithmetic2UnitTest/main.cpp b/23_Arithmetic2UnitTest/main.cpp new file mode 100644 index 000000000..65ef126ad --- /dev/null +++ b/23_Arithmetic2UnitTest/main.cpp @@ -0,0 +1,505 @@ +#include "nbl/application_templates/BasicMultiQueueApplication.hpp" +#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" + +using namespace nbl; +using namespace core; +using namespace asset; +using namespace system; +using namespace video; + +// method emulations on the CPU, to verify the results of the GPU methods +template +struct emulatedReduction +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); + std::fill(out,out+itemCount,red); + } + + static inline constexpr const char* name = "reduction"; +}; +template +struct emulatedScanInclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::inclusive_scan(in,in+itemCount,out,Binop()); + } + static inline constexpr const char* name = "inclusive_scan"; +}; +template +struct emulatedScanExclusive +{ + using type_t = typename Binop::type_t; + + static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) + { + std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); + } + static inline constexpr const char* name = "exclusive_scan"; +}; + +class Workgroup2ScanTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication +{ + using device_base_t = application_templates::BasicMultiQueueApplication; + using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; + +public: + Workgroup2ScanTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + transferDownQueue = getTransferDownQueue(); + computeQueue = getComputeQueue(); + + // TODO: get the element count from argv + const uint32_t elementCount = 1024 * 1024; + // populate our random data buffer on the CPU and create a GPU copy + inputData = new uint32_t[elementCount]; + smart_refctd_ptr gpuinputDataBuffer; + { + std::mt19937 randGenerator(0xdeadbeefu); + for (uint32_t i = 0u; i < elementCount; i++) + inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all + + IGPUBuffer::SCreationParams 
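The `emulatedReduction` / `emulatedScanInclusive` / `emulatedScanExclusive` helpers above are deliberately thin wrappers over the STL algorithms, so the GPU output is checked against well-specified reference behaviour. A self-contained illustration of what each reference produces for a small input (`plus` with identity `0` here; the identity and operator vary per `Binop`):

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>

int main()
{
    const uint32_t in[4] = {1, 2, 3, 4};
    uint32_t incl[4], excl[4];
    // inclusive_scan: out[i] = in[0] + ... + in[i]              -> 1 3 6 10
    std::inclusive_scan(in, in + 4, incl, std::plus<uint32_t>());
    // exclusive_scan: out[i] = identity + in[0] + ... + in[i-1] -> 0 1 3 6
    std::exclusive_scan(in, in + 4, excl, 0u, std::plus<uint32_t>());
    // reduce: one value, which emulatedReduction broadcasts to every slot -> 10
    const uint32_t red = std::reduce(in, in + 4, 0u, std::plus<uint32_t>());
    std::printf("%u %u %u %u | %u %u %u %u | %u\n",
        incl[0], incl[1], incl[2], incl[3],
        excl[0], excl[1], excl[2], excl[3], red);
}
```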
inputDataBufferCreationParams = {}; + inputDataBufferCreationParams.size = sizeof(uint32_t) * elementCount; + inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo{.queue=getTransferUpQueue()}, + std::move(inputDataBufferCreationParams), + inputData + ).move_into(gpuinputDataBuffer); + } + + // create 8 buffers for 8 operations + for (auto i=0u; igetSize(); + params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + + outputBuffers[i] = m_device->createBuffer(std::move(params)); + auto mreq = outputBuffers[i]->getMemoryReqs(); + mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + assert(mreq.memoryTypeBits); + + auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + assert(bufferMem.isValid()); + } + pc.pInputBuf = gpuinputDataBuffer->getDeviceAddress(); + for (uint32_t i = 0; i < OutputBufferCount; i++) + pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress(); + + // create Pipeline Layout + { + SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) }; + pipelineLayout = m_device->createPipelineLayout({&pcRange, 1}); + } + + const auto spirv_isa_cache_path = localOutputCWD / "spirv_isa_cache.bin"; + // enclose to make sure file goes out of scope and we can reopen it + { + smart_refctd_ptr spirv_isa_cache_input; + // try to load SPIR-V to ISA cache + { + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_READ | IFile::ECF_MAPPABLE | IFile::ECF_COHERENT); + if (auto lock = fileCreate.acquire()) + spirv_isa_cache_input = *lock; + } + // create the cache + { + std::span spirv_isa_cache_data = {}; + if (spirv_isa_cache_input) + spirv_isa_cache_data = { reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize() }; + else + m_logger->log("Failed to load SPIR-V 2 ISA cache!", ILogger::ELL_PERFORMANCE); + // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead + m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); + } + } + { + // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? + m_system->deleteDirectory(spirv_isa_cache_path); + ISystem::future_t> fileCreate; + m_system->createFile(fileCreate, spirv_isa_cache_path, IFile::ECF_WRITE); + // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
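The buffer setup above is what makes the `PushConstantData` change possible: `EUF_SHADER_DEVICE_ADDRESS_BIT` at buffer creation, `EMAF_DEVICE_ADDRESS_BIT` at allocation, then `getDeviceAddress()` written straight into the push constants, replacing the descriptor sets the old test used. A CPU model of the resulting I/O scheme, with host pointers standing in for device addresses (assumes a 64-bit host; illustrative only, not Nabla API):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Raw 64-bit "addresses" travel to the shader in a push-constant struct and
// are dereferenced with RawBufferLoad/Store-style accesses; no descriptors.
struct PushConstantData
{
    uint64_t pInputBuf;
    uint64_t pOutputBuf[8];
};

template<typename T>
T rawBufferLoad(uint64_t addr, uint64_t index) // models vk::RawBufferLoad<T>
{
    T v;
    std::memcpy(&v, reinterpret_cast<const void*>(addr + index * sizeof(T)), sizeof(T));
    return v;
}

int main()
{
    uint32_t input[4] = {10, 20, 30, 40};
    PushConstantData pc = {};
    pc.pInputBuf = reinterpret_cast<uint64_t>(input);        // getDeviceAddress() stand-in
    std::printf("%u\n", rawBufferLoad<uint32_t>(pc.pInputBuf, 2)); // prints 30
}
```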
+ m_spirv_isa_cache_output = *fileCreate.acquire(); + if (!m_spirv_isa_cache_output) + logFail("Failed to Create SPIR-V to ISA cache file."); + } + + // load shader source from file + auto getShaderSource = [&](const char* filePath) -> auto + { + IAssetLoader::SAssetLoadParams lparams = {}; + lparams.logger = m_logger.get(); + lparams.workingDirectory = ""; + auto bundle = m_assetMgr->getAsset(filePath, lparams); + if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) + { + m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); + exit(-1); + } + auto firstAssetInBundle = bundle.getContents()[0]; + return smart_refctd_ptr_static_cast(firstAssetInBundle); + }; + + auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); + auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); + // now create or retrieve final resources to run our tests + sema = m_device->createSemaphore(timelineValue); + resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) + { + logFail("Failed to create Command Buffers!\n"); + return false; + } + } + + const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; + const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; + const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; + for (uint32_t useNative = 0; useNative <= uint32_t(m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic); useNative++) + { + if (useNative) + m_logger->log("Testing with native subgroup arithmetic", ILogger::ELL_INFO); + else + m_logger->log("Testing with emulated subgroup arithmetic", ILogger::ELL_INFO); + + for (auto subgroupSize = MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) + { + const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); + for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize *= 2u) + { + // make sure renderdoc captures everything for debugging + m_api->startCapture(); + m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); + + for (uint32_t j = 0; j < ItemsPerInvocations.size(); j++) + { + const uint32_t itemsPerInvocation = ItemsPerInvocations[j]; + uint32_t itemsPerWG = workgroupSize * itemsPerInvocation; + m_logger->log("Testing Items per Invocation %u", ILogger::ELL_INFO, itemsPerInvocation); + bool passed = true; + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + + hlsl::workgroup2::SArithmeticConfiguration wgConfig; + wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvocation); + itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0; + m_logger->log("Testing Item Count %u", 
ILogger::ELL_INFO, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, bool(useNative), itemsPerWG, itemsPerInvocation) && passed; + logTestOutcome(passed, itemsPerWG); + } + m_api->endCapture(); + + // save cache every now and then + { + auto cpu = m_spirv_isa_cache->convertToCPUCache(); + // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata + auto bin = cpu->getEntries().begin()->second.bin; + IFile::success_t success; + m_spirv_isa_cache_output->write(success, bin->data(), 0ull, bin->size()); + if (!success) + logFail("Could not write Create SPIR-V to ISA cache to disk!"); + } + } + } + } + + return true; + } + + virtual bool onAppTerminated() override + { + m_logger->log("==========Result==========", ILogger::ELL_INFO); + m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); + delete[] inputData; + return true; + } + + // the unit test is carried out on init + void workLoopBody() override {} + + // + bool keepRunning() override { return false; } + +private: + void logTestOutcome(bool passed, uint32_t workgroupSize) + { + if (passed) + m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); + else + { + totalFailCount++; + m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); + } + } + + // create pipeline (specialized every test) [TODO: turn into a future/async] + smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) + { + auto shader = m_device->createShader(overridenUnspecialized); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = pipelineLayout.get(); + params.shader = { + .entryPoint = "main", + .shader = shader.get(), + .entries = nullptr, + .requiredSubgroupSize = static_cast(subgroupSizeLog2), + .requireFullSubgroups = true + }; + core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) + return nullptr; + return pipeline; + } + + template class Arithmetic, bool WorkgroupTest> + bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, bool useNative, uint32_t itemsPerWG, uint32_t itemsPerInvoc = 1u) + { + std::string arith_name = Arithmetic>::name; + const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize); + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE; + options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#else + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; +#endif + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + + auto* includeFinder = 
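For context, the sweep above covers every power-of-two subgroup size the device reports and every power-of-two workgroup size from there up to the device maximum, and `findMSB` of a power of two is exactly its log2, which is the encoding `requiredSubgroupSize` expects in `createPipeline`. A standalone sketch of the enumeration (hypothetical device limits; C++20 `std::bit_width` in place of `hlsl::findMSB`):

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

int main()
{
    // hypothetical limits; the real values come from the physical device
    const uint32_t MinSubgroupSize = 8, MaxSubgroupSize = 64, MaxWorkgroupSize = 1024;
    for (uint32_t sg = MinSubgroupSize; sg <= MaxSubgroupSize; sg *= 2u)
    {
        const uint32_t sgLog2 = std::bit_width(sg) - 1; // == findMSB for powers of two
        for (uint32_t wg = sg; wg <= MaxWorkgroupSize; wg *= 2u)
            std::printf("subgroup %u (log2 %u), workgroup %u\n", sg, sgLog2, wg);
    }
}
```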
compiler->getDefaultIncludeFinder(); + options.preprocessorOptions.includeFinder = includeFinder; + + smart_refctd_ptr overriddenUnspecialized; + if constexpr (WorkgroupTest) + { + hlsl::workgroup2::SArithmeticConfiguration wgConfig; + wgConfig.init(hlsl::findMSB(workgroupSize), subgroupSizeLog2, itemsPerInvoc); + + const std::string definitions[3] = { + "workgroup2::" + arith_name, + wgConfig.getConfigTemplateStructString(), + std::to_string(arith_name=="reduction") + }; + + const IShaderCompiler::SMacroDefinition defines[4] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_CONFIG_T", definitions[1] }, + { "IS_REDUCTION", definitions[2] }, + { "TEST_NATIVE", "1" } + }; + if (useNative) + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 3 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + else + { + hlsl::subgroup2::SArithmeticParams sgParams; + sgParams.init(subgroupSizeLog2, itemsPerInvoc); + + const std::string definitions[3] = { + "subgroup2::" + arith_name, + std::to_string(workgroupSize), + sgParams.getParamTemplateStructString() + }; + + const IShaderCompiler::SMacroDefinition defines[4] = { + { "OPERATION", definitions[0] }, + { "WORKGROUP_SIZE", definitions[1] }, + { "SUBGROUP_CONFIG_T", definitions[2] }, + { "TEST_NATIVE", "1" } + }; + if (useNative) + options.preprocessorOptions.extraDefines = { defines, defines + 4 }; + else + options.preprocessorOptions.extraDefines = { defines, defines + 3 }; + + overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + } + + auto pipeline = createPipeline(overriddenUnspecialized.get(),subgroupSizeLog2); + + // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) + uint32_t workgroupCount = 1;// min(elementCount / itemsPerWG, m_physicalDevice->getLimits().maxComputeWorkGroupCount[0]); + + cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); + cmdbuf->bindComputePipeline(pipeline.get()); + cmdbuf->pushConstants(pipelineLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc); + cmdbuf->dispatch(workgroupCount, 1, 1); + { + IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; + for (auto i=0u; igetSize(),outputBuffers[i]} + }; + } + IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; + cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); + } + cmdbuf->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = {{.cmdbuf=cmdbuf.get()}}; + const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; + computeQueue->submit(submits); + const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; + m_device->blockForSemaphores(wait); + + const uint32_t subgroupSize = 1u << subgroupSizeLog2; + // check results + bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc); + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, 
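Every permutation of the test matrix is compiled from the same HLSL source; the variant is selected purely through injected preprocessor defines, with `TEST_NATIVE` appended only when native subgroup arithmetic is being exercised. A runnable sketch of how such a define list takes shape (the values are placeholders; the real config string comes from `getConfigTemplateStructString()` / `getParamTemplateStructString()`):

```cpp
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main()
{
    const bool useNative = true;
    // mirrors the SMacroDefinition arrays above; values here are placeholders
    std::vector<std::pair<std::string, std::string>> defines = {
        {"OPERATION", "subgroup2::inclusive_scan"},
        {"WORKGROUP_SIZE", "256"},
        {"SUBGROUP_CONFIG_T", "/* string from getParamTemplateStructString() */"},
    };
    if (useNative) // the fourth define is only passed when native arithmetic is on
        defines.push_back({"TEST_NATIVE", "1"});
    for (const auto& d : defines)
        std::printf("#define %s %s\n", d.first.c_str(), d.second.c_str());
}
```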
workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount, subgroupSize, itemsPerInvoc) && passed; + + return passed; + } + + //returns true if result matches + template class Arithmetic, class Binop, bool WorkgroupTest> + bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount, const uint32_t subgroupSize, const uint32_t itemsPerInvoc) + { + bool success = true; + + // download data + const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; + m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); + + using type_t = typename Binop::type_t; + const auto testData = reinterpret_cast(resultsBuffer->getPointer()); + + // TODO: parallel for (the temporary values need to be threadlocal or what?) + // now check if the data obtained has valid values + type_t* tmp = new type_t[itemsPerWG]; + for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) + { + if constexpr (WorkgroupTest) + { + const auto workgroupOffset = workgroupID * itemsPerWG; + Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) + { + const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; + const auto cpuVal = tmp[localInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? "workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex + ); + success = false; + break; + } + } + } + else + { + const auto workgroupOffset = workgroupID * itemsPerWG; + const auto workgroupSize = itemsPerWG / itemsPerInvoc; + for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < workgroupSize; pseudoSubgroupID += subgroupSize) + Arithmetic::impl(tmp + pseudoSubgroupID * itemsPerInvoc, inputData + workgroupOffset + pseudoSubgroupID * itemsPerInvoc, subgroupSize * itemsPerInvoc); + + for (uint32_t localInvocationIndex = 0u; localInvocationIndex < workgroupSize; localInvocationIndex++) + { + const auto localOffset = localInvocationIndex * itemsPerInvoc; + const auto globalInvocationIndex = workgroupOffset + localOffset; + + for (uint32_t itemInvocationIndex = 0u; itemInvocationIndex < itemsPerInvoc; itemInvocationIndex++) + { + const auto cpuVal = tmp[localOffset + itemInvocationIndex]; + const auto gpuVal = testData[globalInvocationIndex + itemInvocationIndex]; + if (cpuVal != gpuVal) + { + m_logger->log( + "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d and iteminvoc %d", + ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? 
"workgroup" : "subgroup", Binop::name, + cpuVal, gpuVal, workgroupID, localInvocationIndex, itemInvocationIndex + ); + success = false; + break; + } + } + } + } + } + delete[] tmp; + + return success; + } + + IQueue* transferDownQueue; + IQueue* computeQueue; + smart_refctd_ptr m_spirv_isa_cache; + smart_refctd_ptr m_spirv_isa_cache_output; + + uint32_t* inputData = nullptr; + constexpr static inline uint32_t OutputBufferCount = 8u; + smart_refctd_ptr outputBuffers[OutputBufferCount]; + smart_refctd_ptr pipelineLayout; + PushConstantData pc; + + smart_refctd_ptr sema; + uint64_t timelineValue = 0; + smart_refctd_ptr cmdbuf; + smart_refctd_ptr resultsBuffer; + + uint32_t totalFailCount = 0; + + constexpr static inline std::array ItemsPerInvocations = { 1, 2, 3, 4 }; +}; + +NBL_MAIN_FUNC(Workgroup2ScanTestApp) \ No newline at end of file diff --git a/23_ArithmeticUnitTest/pipeline.groovy b/23_Arithmetic2UnitTest/pipeline.groovy similarity index 100% rename from 23_ArithmeticUnitTest/pipeline.groovy rename to 23_Arithmetic2UnitTest/pipeline.groovy diff --git a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl b/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl deleted file mode 100644 index 13ee8d21e..000000000 --- a/23_ArithmeticUnitTest/app_resources/shaderCommon.hlsl +++ /dev/null @@ -1,55 +0,0 @@ -#include "common.hlsl" - -#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" -#include "nbl/builtin/hlsl/subgroup/basic.hlsl" -#include "nbl/builtin/hlsl/subgroup/arithmetic_portability.hlsl" - -#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" - -// https://github.com/microsoft/DirectXShaderCompiler/issues/6144 -uint32_t3 nbl::hlsl::glsl::gl_WorkGroupSize() {return uint32_t3(WORKGROUP_SIZE,1,1);} - -// unfortunately DXC chokes on descriptors as static members -// https://github.com/microsoft/DirectXShaderCompiler/issues/5940 -[[vk::binding(0, 0)]] StructuredBuffer inputValue; -[[vk::binding(1, 0)]] RWByteAddressBuffer output[8]; - -// because subgroups don't match `gl_LocalInvocationIndex` snake curve addressing, we also can't load inputs that way -uint32_t globalIndex(); -// since we test ITEMS_PER_WG class binop> -static void subtest(NBL_CONST_REF_ARG(type_t) sourceVal) -{ - if (globalIndex()==0u) - output[binop::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - operation_t::base_t,nbl::hlsl::jit::device_capabilities> func; - if (canStore()) - output[binop::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),func(sourceVal)); -} - - -type_t test() -{ - const type_t sourceVal = inputValue[globalIndex()]; - - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - subtest(sourceVal); - return sourceVal; -} - -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" \ No newline at end of file diff --git a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl deleted file mode 100644 index 479265d73..000000000 --- a/23_ArithmeticUnitTest/app_resources/testSubgroup.comp.hlsl +++ /dev/null @@ -1,18 +0,0 @@ -#pragma shader_stage(compute) - -#define operation_t nbl::hlsl::OPERATION - -#include "shaderCommon.hlsl" - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() {return true;} - -[numthreads(WORKGROUP_SIZE,1,1)] -void main() -{ - test(); -} \ No newline at end of file diff --git 
a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl b/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl deleted file mode 100644 index 9bafae47f..000000000 --- a/23_ArithmeticUnitTest/app_resources/testWorkgroup.comp.hlsl +++ /dev/null @@ -1,107 +0,0 @@ -#pragma shader_stage(compute) - - -#include "nbl/builtin/hlsl/workgroup/scratch_size.hlsl" - -static const uint32_t ArithmeticSz = nbl::hlsl::workgroup::scratch_size_arithmetic::value; -static const uint32_t BallotSz = nbl::hlsl::workgroup::scratch_size_ballot::value; -static const uint32_t ScratchSz = ArithmeticSz+BallotSz; - -// TODO: Can we make it a static variable in the ScratchProxy struct? -groupshared uint32_t scratch[ScratchSz]; - - -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" - - -template -struct ScratchProxy -{ - void get(const uint32_t ix, NBL_REF_ARG(uint32_t) value) - { - value = scratch[ix+offset]; - } - void set(const uint32_t ix, const uint32_t value) - { - scratch[ix+offset] = value; - } - - uint32_t atomicOr(const uint32_t ix, const uint32_t value) - { - return nbl::hlsl::glsl::atomicOr(scratch[ix],value); - } - - void workgroupExecutionAndMemoryBarrier() - { - nbl::hlsl::glsl::barrier(); - //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above - } -}; - -static ScratchProxy<0> arithmeticAccessor; - - -#include "nbl/builtin/hlsl/workgroup/broadcast.hlsl" - - -template -struct operation_t -{ - using type_t = typename Binop::type_t; - - type_t operator()(type_t value) - { - type_t retval = nbl::hlsl::OPERATION::template __call >(value,arithmeticAccessor); - // we barrier before because we alias the accessors for Binop - arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); - return retval; - } -}; - - -#include "shaderCommon.hlsl" - -static ScratchProxy ballotAccessor; - - -uint32_t globalIndex() -{ - return nbl::hlsl::glsl::gl_WorkGroupID().x*ITEMS_PER_WG+nbl::hlsl::workgroup::SubgroupContiguousIndex(); -} - -bool canStore() -{ - return nbl::hlsl::workgroup::SubgroupContiguousIndex()::BindingIndex].template Store(0,nbl::hlsl::glsl::gl_SubgroupSize()); - - // we can only ballot booleans, so low bit - nbl::hlsl::workgroup::ballot >(bool(sourceVal & 0x1u), ballotAccessor); - // need to barrier between ballot and usages of a ballot by myself - ballotAccessor.workgroupExecutionAndMemoryBarrier(); - - uint32_t destVal = 0xdeadbeefu; -#define CONSTEXPR_OP_TYPE_TEST(IS_OP) nbl::hlsl::is_same,0x45>,nbl::hlsl::workgroup::IS_OP,0x45> >::value -#define BALLOT_TEMPLATE_ARGS ITEMS_PER_WG,decltype(ballotAccessor),decltype(arithmeticAccessor),nbl::hlsl::jit::device_capabilities - if (CONSTEXPR_OP_TYPE_TEST(reduction)) - destVal = nbl::hlsl::workgroup::ballotBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(inclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotInclusiveBitCount(ballotAccessor,arithmeticAccessor); - else if (CONSTEXPR_OP_TYPE_TEST(exclusive_scan)) - destVal = nbl::hlsl::workgroup::ballotExclusiveBitCount(ballotAccessor,arithmeticAccessor); - else - { - assert(false); - } -#undef BALLOT_TEMPLATE_ARGS -#undef CONSTEXPR_OP_TYPE_TEST - - if (canStore()) - output[ballot::BindingIndex].template Store(sizeof(uint32_t)+sizeof(type_t)*globalIndex(),destVal); -} \ No newline at end of file diff --git a/23_ArithmeticUnitTest/main.cpp b/23_ArithmeticUnitTest/main.cpp deleted file mode 100644 index 147d231e2..000000000 --- a/23_ArithmeticUnitTest/main.cpp +++ /dev/null @@ -1,462 +0,0 @@ -#include "nbl/application_templates/BasicMultiQueueApplication.hpp" 
-#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp" -#include "app_resources/common.hlsl" - -using namespace nbl; -using namespace core; -using namespace asset; -using namespace system; -using namespace video; - -// method emulations on the CPU, to verify the results of the GPU methods -template -struct emulatedReduction -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - const type_t red = std::reduce(in,in+itemCount,Binop::identity,Binop()); - std::fill(out,out+itemCount,red); - } - - static inline constexpr const char* name = "reduction"; -}; -template -struct emulatedScanInclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::inclusive_scan(in,in+itemCount,out,Binop()); - } - static inline constexpr const char* name = "inclusive_scan"; -}; -template -struct emulatedScanExclusive -{ - using type_t = typename Binop::type_t; - - static inline void impl(type_t* out, const type_t* in, const uint32_t itemCount) - { - std::exclusive_scan(in,in+itemCount,out,Binop::identity,Binop()); - } - static inline constexpr const char* name = "exclusive_scan"; -}; - -class ArithmeticUnitTestApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication -{ - using device_base_t = application_templates::BasicMultiQueueApplication; - using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication; - -public: - ArithmeticUnitTestApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - bool onAppInitialized(smart_refctd_ptr&& system) override - { - if (!device_base_t::onAppInitialized(std::move(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - - transferDownQueue = getTransferDownQueue(); - computeQueue = getComputeQueue(); - - // TODO: get the element count from argv - const uint32_t elementCount = Output<>::ScanElementCount; - // populate our random data buffer on the CPU and create a GPU copy - inputData = new uint32_t[elementCount]; - smart_refctd_ptr gpuinputDataBuffer; - { - std::mt19937 randGenerator(0xdeadbeefu); - for (uint32_t i = 0u; i < elementCount; i++) - inputData[i] = randGenerator(); // TODO: change to using xoroshiro, then we can skip having the input buffer at all - - IGPUBuffer::SCreationParams inputDataBufferCreationParams = {}; - inputDataBufferCreationParams.size = sizeof(Output<>::data[0]) * elementCount; - inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; - m_utils->createFilledDeviceLocalBufferOnDedMem( - SIntendedSubmitInfo{.queue=getTransferUpQueue()}, - std::move(inputDataBufferCreationParams), - inputData - ).move_into(gpuinputDataBuffer); - } - - // create 8 buffers for 8 operations - for (auto i=0u; igetSize(); - params.usage = bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_SRC_BIT; - - outputBuffers[i] = m_device->createBuffer(std::move(params)); - auto mreq = outputBuffers[i]->getMemoryReqs(); - mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); - assert(mreq.memoryTypeBits); - - auto bufferMem = m_device->allocate(mreq, 
outputBuffers[i].get()); - assert(bufferMem.isValid()); - } - - // create Descriptor Set and Pipeline Layout - { - // create Descriptor Set Layout - smart_refctd_ptr dsLayout; - { - IGPUDescriptorSetLayout::SBinding binding[2]; - for (uint32_t i = 0u; i < 2; i++) - binding[i] = {{},i,IDescriptor::E_TYPE::ET_STORAGE_BUFFER,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr }; - binding[1].count = OutputBufferCount; - dsLayout = m_device->createDescriptorSetLayout(binding); - } - - // set and transient pool - auto descPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,{&dsLayout.get(),1}); - descriptorSet = descPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo infos[1+OutputBufferCount]; - infos[0].desc = gpuinputDataBuffer; - infos[0].info.buffer = { 0u,gpuinputDataBuffer->getSize() }; - for (uint32_t i = 1u; i <= OutputBufferCount; i++) - { - auto buff = outputBuffers[i - 1]; - infos[i].info.buffer = { 0u,buff->getSize() }; - infos[i].desc = std::move(buff); // save an atomic in the refcount - - } - - IGPUDescriptorSet::SWriteDescriptorSet writes[2]; - for (uint32_t i=0u; i<2; i++) - writes[i] = {descriptorSet.get(),i,0u,1u,infos+i}; - writes[1].count = OutputBufferCount; - - m_device->updateDescriptorSets(2, writes, 0u, nullptr); - } - - pipelineLayout = m_device->createPipelineLayout({},std::move(dsLayout)); - } - - const auto spirv_isa_cache_path = localOutputCWD/"spirv_isa_cache.bin"; - // enclose to make sure file goes out of scope and we can reopen it - { - smart_refctd_ptr spirv_isa_cache_input; - // try to load SPIR-V to ISA cache - { - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_READ|IFile::ECF_MAPPABLE|IFile::ECF_COHERENT); - if (auto lock=fileCreate.acquire()) - spirv_isa_cache_input = *lock; - } - // create the cache - { - std::span spirv_isa_cache_data = {}; - if (spirv_isa_cache_input) - spirv_isa_cache_data = {reinterpret_cast(spirv_isa_cache_input->getMappedPointer()),spirv_isa_cache_input->getSize()}; - else - m_logger->log("Failed to load SPIR-V 2 ISA cache!",ILogger::ELL_PERFORMANCE); - // Normally we'd deserialize a `ICPUPipelineCache` properly and pass that instead - m_spirv_isa_cache = m_device->createPipelineCache(spirv_isa_cache_data); - } - } - { - // TODO: rename `deleteDirectory` to just `delete`? and a `IFile::setSize()` ? - m_system->deleteDirectory(spirv_isa_cache_path); - ISystem::future_t> fileCreate; - m_system->createFile(fileCreate,spirv_isa_cache_path,IFile::ECF_WRITE); - // I can be relatively sure I'll succeed to acquire the future, the pointer to created file might be null though. 
- m_spirv_isa_cache_output=*fileCreate.acquire(); - if (!m_spirv_isa_cache_output) - logFail("Failed to Create SPIR-V to ISA cache file."); - } - - // load shader source from file - auto getShaderSource = [&](const char* filePath) -> auto - { - IAssetLoader::SAssetLoadParams lparams = {}; - lparams.logger = m_logger.get(); - lparams.workingDirectory = ""; - auto bundle = m_assetMgr->getAsset(filePath, lparams); - if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER) - { - m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath); - exit(-1); - } - auto firstAssetInBundle = bundle.getContents()[0]; - return smart_refctd_ptr_static_cast(firstAssetInBundle); - }; - - auto subgroupTestSource = getShaderSource("app_resources/testSubgroup.comp.hlsl"); - auto workgroupTestSource = getShaderSource("app_resources/testWorkgroup.comp.hlsl"); - // now create or retrieve final resources to run our tests - sema = m_device->createSemaphore(timelineValue); - resultsBuffer = ICPUBuffer::create({ outputBuffers[0]->getSize() }); - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1})) - { - logFail("Failed to create Command Buffers!\n"); - return false; - } - } - - const auto MaxWorkgroupSize = m_physicalDevice->getLimits().maxComputeWorkGroupInvocations; - const auto MinSubgroupSize = m_physicalDevice->getLimits().minSubgroupSize; - const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize; - for (auto subgroupSize=MinSubgroupSize; subgroupSize <= MaxSubgroupSize; subgroupSize *= 2u) - { - const uint8_t subgroupSizeLog2 = hlsl::findMSB(subgroupSize); - for (uint32_t workgroupSize = subgroupSize; workgroupSize <= MaxWorkgroupSize; workgroupSize += subgroupSize) - { - // make sure renderdoc captures everything for debugging - m_api->startCapture(); - m_logger->log("Testing Workgroup Size %u with Subgroup Size %u", ILogger::ELL_INFO, workgroupSize, subgroupSize); - - bool passed = true; - // TODO async the testing - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - passed = runTest(subgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize) && passed; - logTestOutcome(passed, workgroupSize); - for (uint32_t itemsPerWG = workgroupSize; itemsPerWG > workgroupSize - subgroupSize; itemsPerWG--) - { - m_logger->log("Testing Item Count %u", ILogger::ELL_INFO, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - passed = runTest(workgroupTestSource, elementCount, subgroupSizeLog2, workgroupSize, itemsPerWG) && passed; - logTestOutcome(passed, itemsPerWG); - } - m_api->endCapture(); - - // save cache every now and then - { - auto cpu = m_spirv_isa_cache->convertToCPUCache(); - // Normally we'd beautifully JSON serialize the thing, allow multiple devices & drivers + metadata - auto bin = cpu->getEntries().begin()->second.bin; - IFile::success_t success; - 
m_spirv_isa_cache_output->write(success,bin->data(),0ull,bin->size()); - if (!success) - logFail("Could not write Create SPIR-V to ISA cache to disk!"); - } - } - } - - return true; - } - - virtual bool onAppTerminated() override - { - m_logger->log("==========Result==========", ILogger::ELL_INFO); - m_logger->log("Fail Count: %u", ILogger::ELL_INFO, totalFailCount); - delete[] inputData; - return true; - } - - // the unit test is carried out on init - void workLoopBody() override {} - - // - bool keepRunning() override { return false; } - -private: - void logTestOutcome(bool passed, uint32_t workgroupSize) - { - if (passed) - m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize); - else - { - totalFailCount++; - m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize); - } - } - - // create pipeline (specialized every test) [TODO: turn into a future/async] - smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const uint8_t subgroupSizeLog2) - { - auto shader = m_device->createShader(overridenUnspecialized); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = pipelineLayout.get(); - params.shader = { - .entryPoint = "main", - .shader = shader.get(), - .entries = nullptr, - .requiredSubgroupSize = static_cast(subgroupSizeLog2), - .requireFullSubgroups = true - }; - core::smart_refctd_ptr pipeline; - if (!m_device->createComputePipelines(m_spirv_isa_cache.get(),{¶ms,1},&pipeline)) - return nullptr; - return pipeline; - } - - /*template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - return true; - }*/ - - template class Arithmetic, bool WorkgroupTest> - bool runTest(const smart_refctd_ptr& source, const uint32_t elementCount, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerWG = ~0u) - { - std::string arith_name = Arithmetic>::name; - - smart_refctd_ptr overridenUnspecialized; - if constexpr (WorkgroupTest) - { - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n#define ITEMS_PER_WG %d\n", - (("workgroup::") + arith_name).c_str(), workgroupSize, itemsPerWG - ); - } - else - { - itemsPerWG = workgroupSize; - overridenUnspecialized = CHLSLCompiler::createOverridenCopy( - source.get(), "#define OPERATION %s\n#define WORKGROUP_SIZE %d\n", - (("subgroup::") + arith_name).c_str(), workgroupSize - ); - } - auto pipeline = createPipeline(overridenUnspecialized.get(),subgroupSizeLog2); - - // TODO: overlap dispatches with memory readbacks (requires multiple copies of `buffers`) - const uint32_t workgroupCount = elementCount / itemsPerWG; - cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); - cmdbuf->bindComputePipeline(pipeline.get()); - cmdbuf->bindDescriptorSets(EPBP_COMPUTE, pipeline->getLayout(), 0u, 1u, &descriptorSet.get()); - cmdbuf->dispatch(workgroupCount, 1, 1); - { - IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount]; - for (auto i=0u; igetSize(),outputBuffers[i]} - }; - } - IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = {.memBarriers={},.bufBarriers=memoryBarrier}; - cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE,info); - } - cmdbuf->end(); - - const IQueue::SSubmitInfo::SSemaphoreInfo signal[1] = {{.semaphore=sema.get(),.value=++timelineValue}}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[1] = 
{{.cmdbuf=cmdbuf.get()}}; - const IQueue::SSubmitInfo submits[1] = {{.commandBuffers=cmdbufs,.signalSemaphores=signal}}; - computeQueue->submit(submits); - const ISemaphore::SWaitInfo wait[1] = {{.semaphore=sema.get(),.value=timelineValue}}; - m_device->blockForSemaphores(wait); - - // check results - bool passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount); - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - if constexpr (WorkgroupTest) - passed = validateResults, WorkgroupTest>(itemsPerWG, workgroupCount) && passed; - - return passed; - } - - //returns true if result matches - template class Arithmetic, class Binop, bool WorkgroupTest> - bool validateResults(const uint32_t itemsPerWG, const uint32_t workgroupCount) - { - bool success = true; - - // download data - const SBufferRange bufferRange = {0u, resultsBuffer->getSize(), outputBuffers[Binop::BindingIndex]}; - m_utils->downloadBufferRangeViaStagingBufferAutoSubmit(SIntendedSubmitInfo{.queue=transferDownQueue},bufferRange,resultsBuffer->getPointer()); - - using type_t = typename Binop::type_t; - const auto dataFromBuffer = reinterpret_cast(resultsBuffer->getPointer()); - const auto subgroupSize = dataFromBuffer[0]; - if (subgroupSizenbl::hlsl::subgroup::MaxSubgroupSize) - { - m_logger->log("Unexpected Subgroup Size %u", ILogger::ELL_ERROR, subgroupSize); - return false; - } - - const auto testData = reinterpret_cast(dataFromBuffer + 1); - // TODO: parallel for (the temporary values need to be threadlocal or what?) - // now check if the data obtained has valid values - type_t* tmp = new type_t[itemsPerWG]; - type_t* ballotInput = new type_t[itemsPerWG]; - for (uint32_t workgroupID = 0u; success && workgroupID < workgroupCount; workgroupID++) - { - const auto workgroupOffset = workgroupID * itemsPerWG; - - if constexpr (WorkgroupTest) - { - if constexpr (std::is_same_v, Binop>) - { - for (auto i = 0u; i < itemsPerWG; i++) - ballotInput[i] = inputData[i + workgroupOffset] & 0x1u; - Arithmetic::impl(tmp, ballotInput, itemsPerWG); - } - else - Arithmetic::impl(tmp, inputData + workgroupOffset, itemsPerWG); - } - else - { - for (uint32_t pseudoSubgroupID = 0u; pseudoSubgroupID < itemsPerWG; pseudoSubgroupID += subgroupSize) - Arithmetic::impl(tmp + pseudoSubgroupID, inputData + workgroupOffset + pseudoSubgroupID, subgroupSize); - } - - for (uint32_t localInvocationIndex = 0u; localInvocationIndex < itemsPerWG; localInvocationIndex++) - { - const auto globalInvocationIndex = workgroupOffset + localInvocationIndex; - const auto cpuVal = tmp[localInvocationIndex]; - const auto gpuVal = testData[globalInvocationIndex]; - if (cpuVal != gpuVal) - { - m_logger->log( - "Failed test #%d (%s) (%s) Expected %u got %u for workgroup %d and localinvoc %d", - ILogger::ELL_ERROR, itemsPerWG, WorkgroupTest ? 
"workgroup" : "subgroup", Binop::name, - cpuVal, gpuVal, workgroupID, localInvocationIndex - ); - success = false; - break; - } - } - } - delete[] ballotInput; - delete[] tmp; - - return success; - } - - IQueue* transferDownQueue; - IQueue* computeQueue; - smart_refctd_ptr m_spirv_isa_cache; - smart_refctd_ptr m_spirv_isa_cache_output; - - uint32_t* inputData = nullptr; - constexpr static inline uint32_t OutputBufferCount = 8u; - smart_refctd_ptr outputBuffers[OutputBufferCount]; - smart_refctd_ptr descriptorSet; - smart_refctd_ptr pipelineLayout; - - smart_refctd_ptr sema; - uint64_t timelineValue = 0; - smart_refctd_ptr cmdbuf; - smart_refctd_ptr resultsBuffer; - - uint32_t totalFailCount = 0; -}; - -NBL_MAIN_FUNC(ArithmeticUnitTestApp) \ No newline at end of file diff --git a/28_FFTBloom/app_resources/fft_common.hlsl b/28_FFTBloom/app_resources/fft_common.hlsl index 41f8821cc..9f2be1432 100644 --- a/28_FFTBloom/app_resources/fft_common.hlsl +++ b/28_FFTBloom/app_resources/fft_common.hlsl @@ -5,13 +5,13 @@ groupshared uint32_t sharedmem[FFTParameters::SharedMemoryDWORDs]; struct SharedMemoryAccessor { - template + template void set(IndexType idx, AccessType value) { sharedmem[idx] = value; } - template + template void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = sharedmem[idx]; @@ -36,14 +36,14 @@ struct PreloadedAccessorCommonBase struct PreloadedAccessorBase : PreloadedAccessorCommonBase { - template - void set(uint32_t idx, AccessType value) + template + void set(IndexType idx, AccessType value) { preloaded[idx >> WorkgroupSizeLog2] = value; } - template - void get(uint32_t idx, NBL_REF_ARG(AccessType) value) + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = preloaded[idx >> WorkgroupSizeLog2]; } @@ -54,14 +54,14 @@ struct PreloadedAccessorBase : PreloadedAccessorCommonBase // In the case for preloading all channels at once we make it stateful so we track which channel we're running FFT on struct MultiChannelPreloadedAccessorBase : PreloadedAccessorCommonBase { - template - void set(uint32_t idx, AccessType value) + template + void set(IndexType idx, AccessType value) { preloaded[currentChannel][idx >> WorkgroupSizeLog2] = value; } - template - void get(uint32_t idx, NBL_REF_ARG(AccessType) value) + template + void get(IndexType idx, NBL_REF_ARG(AccessType) value) { value = preloaded[currentChannel][idx >> WorkgroupSizeLog2]; } diff --git a/29_Arithmetic2Bench/CMakeLists.txt b/29_Arithmetic2Bench/CMakeLists.txt new file mode 100644 index 000000000..0724366c9 --- /dev/null +++ b/29_Arithmetic2Bench/CMakeLists.txt @@ -0,0 +1,25 @@ + +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. 
Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git a/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl new file mode 100644 index 000000000..f6ad3e678 --- /dev/null +++ b/29_Arithmetic2Bench/app_resources/benchmarkSubgroup.comp.hlsl @@ -0,0 +1,57 @@ +#pragma shader_stage(compute) + +#define operation_t nbl::hlsl::OPERATION + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +#include "shaderCommon.hlsl" +#include "nbl/builtin/hlsl/workgroup2/basic.hlsl" + +template +using params_t = SUBGROUP_CONFIG_T; + +NBL_CONSTEXPR_STATIC_INLINE uint32_t ItemsPerInvocation = params_t::base_t, device_capabilities>::ItemsPerInvocation; + +typedef vector type_t; + +uint32_t globalIndex() +{ + return glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+workgroup::SubgroupContiguousIndex(); +} + +template +static void subbench(NBL_CONST_REF_ARG(type_t) sourceVal) +{ + type_t value = sourceVal; + + const uint64_t outputBufAddr = pc.pOutputBuf[Binop::BindingIndex]; + + operation_t > func; + // [unroll] + for (uint32_t i = 0; i < NUM_LOOPS; i++) + value = func(value); + + vk::RawBufferStore(outputBufAddr + sizeof(type_t) * globalIndex(), value, sizeof(uint32_t)); +} + +void benchmark() +{ + const uint32_t invocationIndex = globalIndex(); + type_t sourceVal; + Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1)); + [unroll] + for (uint16_t i = 0; i < ItemsPerInvocation; i++) + sourceVal[i] = xoroshiro(); + + subbench >(sourceVal); +} + +[numthreads(WORKGROUP_SIZE,1,1)] +void main() +{ + benchmark(); +} diff --git a/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl new file mode 100644 index 000000000..a56945467 --- /dev/null +++ b/29_Arithmetic2Bench/app_resources/benchmarkWorkgroup.comp.hlsl @@ -0,0 +1,124 @@ +#pragma shader_stage(compute) + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/subgroup2/arithmetic_portability.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" +#include 
"nbl/builtin/hlsl/random/xoroshiro.hlsl" + +using config_t = WORKGROUP_CONFIG_T; + +#include "shaderCommon.hlsl" + +typedef vector type_t; + +// final (level 1/2) scan needs to fit in one subgroup exactly +groupshared uint32_t scratch[mpl::max_v]; + +#include "../../common/include/WorkgroupDataAccessors.hlsl" + +template +struct RandomizedInputDataProxy +{ + using dtype_t = vector; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2; + NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize; + + static RandomizedInputDataProxy create(uint64_t inputBuf, uint64_t outputBuf) + { + RandomizedInputDataProxy retval; + retval.data = DataProxy::create(inputBuf, outputBuf); + return retval; + } + + template + void get(const IndexType ix, NBL_REF_ARG(AccessType) value) + { + value = preloaded[ix>>WorkgroupSizeLog2]; + } + template + void set(const IndexType ix, const AccessType value) + { + preloaded[ix>>WorkgroupSizeLog2] = value; + } + + void preload() + { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + Xoroshiro64Star xoroshiro = Xoroshiro64Star::construct(uint32_t2(invocationIndex,invocationIndex+1)); + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + [unroll] + for (uint16_t i = 0; i < ItemsPerInvocation; i++) + preloaded[idx][i] = xoroshiro(); + } + void unload() + { + const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex(); + [unroll] + for (uint16_t idx = 0; idx < PreloadedDataCount; idx++) + data.template set(idx * WorkgroupSize + invocationIndex, preloaded[idx]); + } + + void workgroupExecutionAndMemoryBarrier() + { + glsl::barrier(); + //glsl::memoryBarrierShared(); implied by the above + } + + DataProxy data; + dtype_t preloaded[PreloadedDataCount]; +}; + +static ScratchProxy arithmeticAccessor; + +using data_proxy_t = RandomizedInputDataProxy; + +template +struct operation_t +{ + using binop_base_t = typename Binop::base_t; + using otype_t = typename Binop::type_t; + + void operator()(data_proxy_t dataAccessor) + { +#if IS_REDUCTION + otype_t value = +#endif + OPERATION::template __call(dataAccessor,arithmeticAccessor); + // we barrier before because we alias the accessors for Binop + arithmeticAccessor.workgroupExecutionAndMemoryBarrier(); +#if IS_REDUCTION + [unroll] + for (uint32_t i = 0; i < data_proxy_t::PreloadedDataCount; i++) + dataAccessor.preloaded[i] = value; +#endif + } +}; + +template +static void subbench() +{ + data_proxy_t dataAccessor = data_proxy_t::create(0, pc.pOutputBuf[Binop::BindingIndex]); + dataAccessor.preload(); + + operation_t func; + for (uint32_t i = 0; i < NUM_LOOPS; i++) + func(dataAccessor); + + dataAccessor.unload(); +} + +void benchmark() +{ + // only benchmark plus op + subbench >(); +} + + +[numthreads(config_t::WorkgroupSize,1,1)] +void main() +{ + benchmark(); +} diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl new file mode 100644 index 000000000..cca5af987 --- /dev/null +++ b/29_Arithmetic2Bench/app_resources/common.hlsl @@ -0,0 +1,34 @@ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/functional.hlsl" + +struct PushConstantData +{ + uint64_t pOutputBuf[2]; +}; + +namespace arithmetic +{ +template +struct plus : nbl::hlsl::plus +{ + using base_t = nbl::hlsl::plus; + + NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0; +#ifndef __HLSL_VERSION + static inline constexpr const char* name = "plus"; +#endif +}; + +template +struct 
diff --git a/29_Arithmetic2Bench/app_resources/common.hlsl b/29_Arithmetic2Bench/app_resources/common.hlsl
new file mode 100644
index 000000000..cca5af987
--- /dev/null
+++ b/29_Arithmetic2Bench/app_resources/common.hlsl
@@ -0,0 +1,34 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/functional.hlsl"
+
+struct PushConstantData
+{
+	uint64_t pOutputBuf[2];
+};
+
+namespace arithmetic
+{
+template
+struct plus : nbl::hlsl::plus
+{
+	using base_t = nbl::hlsl::plus;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 0;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "plus";
+#endif
+};
+
+template
+struct ballot : nbl::hlsl::plus
+{
+	using base_t = nbl::hlsl::plus;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t BindingIndex = 1;
+#ifndef __HLSL_VERSION
+	static inline constexpr const char* name = "bitcount";
+#endif
+};
+}
+
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
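// The wrappers above only attach benchmark metadata (an output-buffer slot
// and, on the host, a printable name) to functors inherited verbatim from the
// shared HLSL/C++ STD lib. A minimal pure-C++ analogue of the pattern
// (illustrative only, using std::plus in place of nbl::hlsl::plus):
#include <cstdint>
#include <functional>

template<typename T>
struct plus_with_metadata : std::plus<T>
{
	using base_t = std::plus<T>;

	static constexpr uint16_t BindingIndex = 0; // routes results to pc.pOutputBuf[0]
	static constexpr const char* name = "plus"; // host-side label for the result buffer
};

// the arithmetic behaviour is exactly the base functor's:
static_assert(plus_with_metadata<int>{}(2, 3) == 5);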
diff --git a/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
new file mode 100644
index 000000000..242ededd8
--- /dev/null
+++ b/29_Arithmetic2Bench/app_resources/shaderCommon.hlsl
@@ -0,0 +1,26 @@
+#include "common.hlsl"
+
+using namespace nbl;
+using namespace hlsl;
+
+[[vk::push_constant]] PushConstantData pc;
+
+struct device_capabilities
+{
+#ifdef TEST_NATIVE
+	NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = true;
+#else
+	NBL_CONSTEXPR_STATIC_INLINE bool shaderSubgroupArithmetic = false;
+#endif
+};
+
+#ifndef OPERATION
+#error "Define OPERATION!"
+#endif
+
+#ifndef NUM_LOOPS
+#error "Define NUM_LOOPS!"
+#endif
+
+// NOTE added dummy output image to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
+[[vk::binding(2, 0)]] RWTexture2D outImage; // dummy
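// The device_capabilities struct above is the compile-time switch the
// portability headers key off: defining TEST_NATIVE at shader-compile time
// selects the native subgroup-arithmetic path. A minimal C++ analogue of the
// mechanism (illustrative names, not the real Nabla headers):
#include <cstdio>

struct caps_native   { static constexpr bool shaderSubgroupArithmetic = true; };
struct caps_fallback { static constexpr bool shaderSubgroupArithmetic = false; };

template<class DeviceCaps>
constexpr const char* arithmeticPath()
{
	if constexpr (DeviceCaps::shaderSubgroupArithmetic)
		return "native subgroup intrinsics"; // compiled with TEST_NATIVE
	else
		return "emulated portability path";  // compiled without it
}

int main()
{
	std::printf("%s\n", arithmeticPath<caps_native>()); // "native subgroup intrinsics"
}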
diff --git a/29_Arithmetic2Bench/config.json.template b/29_Arithmetic2Bench/config.json.template
new file mode 100644
index 000000000..f961745c1
--- /dev/null
+++ b/29_Arithmetic2Bench/config.json.template
@@ -0,0 +1,28 @@
+{
+	"enableParallelBuild": true,
+	"threadsPerBuildProcess" : 2,
+	"isExecuted": false,
+	"scriptPath": "",
+	"cmake": {
+		"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+		"buildModes": [],
+		"requiredOptions": []
+	},
+	"profiles": [
+		{
+			"backend": "vulkan",
+			"platform": "windows",
+			"buildModes": [],
+			"runConfiguration": "Release",
+			"gpuArchitectures": []
+		}
+	],
+	"dependencies": [],
+	"data": [
+		{
+			"dependencies": [],
+			"command": [""],
+			"outputs": []
+		}
+	]
+}
\ No newline at end of file
diff --git a/29_Arithmetic2Bench/main.cpp b/29_Arithmetic2Bench/main.cpp
new file mode 100644
index 000000000..2d5afeb4c
--- /dev/null
+++ b/29_Arithmetic2Bench/main.cpp
@@ -0,0 +1,689 @@
+#include "SimpleWindowedApplication.hpp"
+#include "CEventCallback.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic_config.hlsl"
+#include "nbl/builtin/hlsl/subgroup2/arithmetic_params.hlsl"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace ui;
+using namespace video;
+
+template requires std::is_base_of_v
+class CExplicitSurfaceFormatResizeSurface final : public ISimpleManagedSurface
+{
+public:
+	using this_t = CExplicitSurfaceFormatResizeSurface;
+
+	// Factory method so we can fail, requires a `_surface` created from a window and with a callback that inherits from `ICallback` declared just above
+	template requires std::is_base_of_v, Surface>
+	static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& _surface)
+	{
+		if (!_surface)
+			return nullptr;
+
+		auto _window = _surface->getWindow();
+		ICallback* cb = nullptr;
+		if (_window)
+			cb = dynamic_cast(_window->getEventCallback());
+
+		return core::smart_refctd_ptr(new this_t(std::move(_surface), cb), core::dont_grab);
+	}
+
+	// Factory method so we can fail, requires a `_surface` created from a native surface
+	template requires std::is_base_of_v, Surface>
+	static inline core::smart_refctd_ptr create(core::smart_refctd_ptr&& _surface, ICallback* cb)
+	{
+		if (!_surface)
+			return nullptr;
+
+		return core::smart_refctd_ptr(new this_t(std::move(_surface), cb), core::dont_grab);
+	}
+
+	//
+	inline bool init(CThreadSafeQueueAdapter* queue, std::unique_ptr&& scResources, const ISwapchain::SSharedCreationParams& sharedParams = {})
+	{
+		if (!scResources || !base_init(queue))
+			return init_fail();
+
+		m_sharedParams = sharedParams;
+		if (!m_sharedParams.deduce(queue->getOriginDevice()->getPhysicalDevice(), getSurface()))
+			return init_fail();
+
+		m_swapchainResources = std::move(scResources);
+		return true;
+	}
+
+	// Can be public because we don't need to worry about mutexes unlike the Smooth Resize class
+	inline ISwapchainResources* getSwapchainResources() override { return m_swapchainResources.get(); }
+
+	// need to see if the swapchain is invalidated (e.g. because we're starting from a 0-area old Swapchain) and try to recreate the swapchain
+	inline SAcquireResult acquireNextImage()
+	{
+		if (!isWindowOpen())
+		{
+			becomeIrrecoverable();
+			return {};
+		}
+
+		if (!m_swapchainResources || (m_swapchainResources->getStatus() != ISwapchainResources::STATUS::USABLE && !recreateSwapchain(m_surfaceFormat)))
+			return {};
+
+		return ISimpleManagedSurface::acquireNextImage();
+	}
+
+	// it's enough to just forward though
+	inline bool present(const uint8_t imageIndex, const std::span waitSemaphores)
+	{
+		return ISimpleManagedSurface::present(imageIndex, waitSemaphores);
+	}
+
+	//
+	inline bool recreateSwapchain(const ISurface::SFormat& explicitSurfaceFormat)
+	{
+		assert(m_swapchainResources);
+		// don't assign straight to `m_swapchainResources` because of complex refcounting and cycles
+		core::smart_refctd_ptr newSwapchain;
+		// TODO: This block of code could be rolled up into `ISimpleManagedSurface::ISwapchainResources` eventually
+		{
+			auto* surface = getSurface();
+			auto device = const_cast(getAssignedQueue()->getOriginDevice());
+			// 0s are invalid values, so they indicate we want them deduced
+			m_sharedParams.width = 0;
+			m_sharedParams.height = 0;
+			// Question: should we re-query the supported queues, formats, present modes, etc. just-in-time??
+			auto* swapchain = m_swapchainResources->getSwapchain();
+			if (swapchain ? swapchain->deduceRecreationParams(m_sharedParams) : m_sharedParams.deduce(device->getPhysicalDevice(), surface))
+			{
+				// super special case, we can't re-create the swapchain but it's possible to recover later on
+				if (m_sharedParams.width == 0 || m_sharedParams.height == 0)
+				{
+					// we need to keep the old swapchain around, but can drop the rest
+					m_swapchainResources->invalidate();
+					return false;
+				}
+				// now let's try to create a new swapchain
+				if (swapchain)
+					newSwapchain = swapchain->recreate(m_sharedParams);
+				else
+				{
+					ISwapchain::SCreationParams params = {
+						.surface = core::smart_refctd_ptr(surface),
+						.surfaceFormat = explicitSurfaceFormat,
+						.sharedParams = m_sharedParams
+						// we're not going to support concurrent sharing in this simple class
+					};
+					m_surfaceFormat = explicitSurfaceFormat;
+					newSwapchain = CVulkanSwapchain::create(core::smart_refctd_ptr(device), std::move(params));
+				}
+			}
+			else // parameter deduction failed
+				return false;
+		}
+
+		if (newSwapchain)
+		{
+			m_swapchainResources->invalidate();
+			return m_swapchainResources->onCreateSwapchain(getAssignedQueue()->getFamilyIndex(), std::move(newSwapchain));
+		}
+		else
+			becomeIrrecoverable();
+
+		return false;
+	}
+
+protected:
+	using ISimpleManagedSurface::ISimpleManagedSurface;
+
+	//
+	inline void deinit_impl() override final
+	{
+		becomeIrrecoverable();
+	}
+
+	//
+	inline void becomeIrrecoverable() override { m_swapchainResources = nullptr; }
+
+	// gets called when OUT_OF_DATE upon an acquire
+	inline SAcquireResult handleOutOfDate() override final
+	{
+		// recreate swapchain and try to acquire again
+		if (recreateSwapchain(m_surfaceFormat))
+			return ISimpleManagedSurface::acquireNextImage();
+		return {};
+	}
+
+private:
+	// Because the surface can start minimized (extent={0,0}) we might not be able to create the swapchain right away, so store creation parameters until we can create it.
+	ISwapchain::SSharedCreationParams m_sharedParams = {};
+	// The swapchain might not be possible to create or recreate right away, so this might be
+	// either nullptr before the first successful acquire or the old to-be-retired swapchain.
+	std::unique_ptr m_swapchainResources = {};
+
+	ISurface::SFormat m_surfaceFormat = {};
+};
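// Condensed invariant of the class above, as a sketch (hypothetical types,
// not the real Nabla API): the class pins the explicitly requested format in
// m_surfaceFormat, and handleOutOfDate() always recreates with that pinned
// format instead of re-deducing one, which is the whole point of the
// "explicit surface format" variant.
#include <optional>

struct ExplicitFormatPolicySketch
{
	std::optional<int> pinnedFormat; // stands in for ISurface::SFormat

	bool recreate(int explicitFormat)
	{
		pinnedFormat = explicitFormat; // remembered for OUT_OF_DATE handling
		return true;                   // the real method can also fail and invalidate
	}
	bool handleOutOfDate()
	{
		// never silently falls back to a deduced format
		return pinnedFormat.has_value() && recreate(*pinnedFormat);
	}
};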
+
+// NOTE added swapchain + drawing frames to be able to profile with Nsight, which still doesn't support profiling headless compute shaders
+class ArithmeticBenchApp final : public examples::SimpleWindowedApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = examples::SimpleWindowedApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+
+	constexpr static inline uint32_t WIN_W = 1280;
+	constexpr static inline uint32_t WIN_H = 720;
+	constexpr static inline uint32_t MaxFramesInFlight = 5;
+
+public:
+	ArithmeticBenchApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	inline core::vector getSurfaces() const override
+	{
+		if (!m_surface)
+		{
+			{
+				auto windowCallback = core::make_smart_refctd_ptr(smart_refctd_ptr(m_inputSystem), smart_refctd_ptr(m_logger));
+				IWindow::SCreationParams params = {};
+				params.callback = core::make_smart_refctd_ptr();
+				params.width = WIN_W;
+				params.height = WIN_H;
+				params.x = 32;
+				params.y = 32;
+				params.flags = ui::IWindow::ECF_HIDDEN | IWindow::ECF_BORDERLESS | IWindow::ECF_RESIZABLE;
+				params.windowCaption = "ArithmeticBenchApp";
+				params.callback = windowCallback;
+				const_cast&>(m_window) = m_winMgr->createWindow(std::move(params));
+			}
+
+			auto surface = CSurfaceVulkanWin32::create(smart_refctd_ptr(m_api), smart_refctd_ptr_static_cast(m_window));
+			const_cast&>(m_surface) = CExplicitSurfaceFormatResizeSurface::create(std::move(surface));
+		}
+
+		if (m_surface)
+			return { {m_surface->getSurface()/*,EQF_NONE*/} };
+
+		return {};
+	}
+
+	bool onAppInitialized(smart_refctd_ptr&& system) override
+	{
+		m_inputSystem = make_smart_refctd_ptr(logger_opt_smart_ptr(smart_refctd_ptr(m_logger)));
+
+		if (!device_base_t::onAppInitialized(std::move(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		m_semaphore = m_device->createSemaphore(m_realFrameIx);
+		if (!m_semaphore)
+			return logFail("Failed to Create a Semaphore!");
+
+		ISwapchain::SCreationParams swapchainParams = { .surface = m_surface->getSurface() };
+		asset::E_FORMAT preferredFormats[] = { asset::EF_R8G8B8A8_UNORM };
+		if (!swapchainParams.deduceFormat(m_physicalDevice, preferredFormats))
+			return logFail("Could not choose a Surface Format for the Swapchain!");
+
+		swapchainParams.sharedParams.imageUsage = IGPUImage::E_USAGE_FLAGS::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT;
+
+		auto graphicsQueue = getGraphicsQueue();
+		if (!m_surface || !m_surface->init(graphicsQueue, std::make_unique(), swapchainParams.sharedParams))
+			return logFail("Could not create Window & Surface or initialize the Surface!");
+
+		auto pool = m_device->createCommandPool(graphicsQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+
+		for (auto i = 0u; i < MaxFramesInFlight; i++)
+		{
+			if (!pool)
+				return logFail("Couldn't create Command Pool!");
+			if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 }))
+				return logFail("Couldn't create Command Buffer!");
+		}
+
+		m_winMgr->setWindowSize(m_window.get(), WIN_W, WIN_H);
+		m_surface->recreateSwapchain(swapchainParams.surfaceFormat);
+
+		transferDownQueue = getTransferDownQueue();
+		computeQueue = getComputeQueue();
+
+		// create 2 buffers for 2 operations
+		for (auto i=0u; icreateBuffer(std::move(params));
+			auto mreq = outputBuffers[i]->getMemoryReqs();
+			mreq.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
+			assert(mreq.memoryTypeBits);
+
+			auto bufferMem = m_device->allocate(mreq, outputBuffers[i].get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+			assert(bufferMem.isValid());
+		}
+		for (auto i = 0u; i < OutputBufferCount; i++)
+			pc.pOutputBuf[i] = outputBuffers[i]->getDeviceAddress();
+
+		// create image views for swapchain images
+		for (uint32_t i = 0; i < ISwapchain::MaxImages; i++)
+		{
+			IGPUImage* scImg = m_surface->getSwapchainResources()->getImage(i);
+			if (scImg == nullptr)
+				continue;
+			IGPUImageView::SCreationParams viewParams = {
+				.flags = IGPUImageView::ECF_NONE,
+				.subUsages = IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT,
+				.image = smart_refctd_ptr(scImg),
+				.viewType = IGPUImageView::ET_2D,
+				.format = scImg->getCreationParameters().format
+			};
+			swapchainImageViews[i] = m_device->createImageView(std::move(viewParams));
+		}
+
+		// create Descriptor Sets and Pipeline Layouts
+		smart_refctd_ptr benchPplnLayout;
+		{
+			// set and transient pool
+			smart_refctd_ptr benchLayout;
+			{
+				IGPUDescriptorSetLayout::SBinding binding[1];
+				binding[0] = { {},2,IDescriptor::E_TYPE::ET_STORAGE_IMAGE,IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT,IShader::E_SHADER_STAGE::ESS_COMPUTE,1u,nullptr };
+				benchLayout = m_device->createDescriptorSetLayout(binding);
+			}
+
+			const uint32_t setCount = ISwapchain::MaxImages;
+			benchPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_UPDATE_AFTER_BIND_BIT, { &benchLayout.get(),1 }, &setCount);
+			for (auto i = 0u; i < ISwapchain::MaxImages; i++)
+			{
+				benchDs[i] = benchPool->createDescriptorSet(smart_refctd_ptr(benchLayout));
+				if (!benchDs[i])
+					return logFail("Could not create Descriptor Set!");
+			}
+
+			SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0,.size = sizeof(PushConstantData) };
+			benchPplnLayout = m_device->createPipelineLayout({ &pcRange, 1 }, std::move(benchLayout));
+		}
+		if (UseNativeArithmetic && !m_physicalDevice->getProperties().limits.shaderSubgroupArithmetic)
+		{
+			logFail("UseNativeArithmetic is true but device does not support shaderSubgroupArithmetic!");
+			return false;
+		}
+
+		IGPUDescriptorSet::SWriteDescriptorSet dsWrites[ISwapchain::MaxImages];
+		for (auto i = 0u; i < ISwapchain::MaxImages; i++)
+		{
+			if (swapchainImageViews[i].get() == nullptr)
+				continue;
+
+			video::IGPUDescriptorSet::SDescriptorInfo dsInfo;
+			dsInfo.info.image.imageLayout = IImage::LAYOUT::GENERAL;
+			dsInfo.desc = swapchainImageViews[i];
+
+			dsWrites[i] =
+			{
+				.dstSet = benchDs[i].get(),
+				.binding = 2u,
+				.arrayElement = 0u,
+				.count = 1u,
+				.info = &dsInfo,
+			};
+			m_device->updateDescriptorSets(1u, &dsWrites[i], 0u, nullptr);
+		}
+
+
+		// load shader source from file
+		auto getShaderSource = [&](const char* filePath) -> auto
+		{
+			IAssetLoader::SAssetLoadParams lparams = {};
+			lparams.logger = m_logger.get();
+			lparams.workingDirectory = "";
+			auto bundle = m_assetMgr->getAsset(filePath, lparams);
+			if (bundle.getContents().empty() || bundle.getAssetType()!=IAsset::ET_SHADER)
+			{
+				m_logger->log("Shader %s not found!", ILogger::ELL_ERROR, filePath);
+				exit(-1);
+			}
+			auto firstAssetInBundle = bundle.getContents()[0];
+			return smart_refctd_ptr_static_cast(firstAssetInBundle);
+		};
+
+		// create pipelines for each workgroup size (adjust items per invocation and the operation manually, otherwise compiling every combination uses up a lot of RAM)
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		smart_refctd_ptr shaderSource;
+		if constexpr (DoWorkgroupBenchmarks)
+			shaderSource = getShaderSource("app_resources/benchmarkWorkgroup.comp.hlsl");
+		else
+			shaderSource = getShaderSource("app_resources/benchmarkSubgroup.comp.hlsl");
+
+		for (uint32_t op = 0; op < arithmeticOperations.size(); op++)
+			for (uint32_t i = 0; i < workgroupSizes.size(); i++)
+				benchSets[op*workgroupSizes.size()+i] = createBenchmarkPipelines(shaderSource, benchPplnLayout.get(), ElementCount, arithmeticOperations[op], hlsl::findMSB(MaxSubgroupSize), workgroupSizes[i], ItemsPerInvocation, NumLoops);
+
+		m_winMgr->show(m_window.get());
+
+		return true;
+	}
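// onAppInitialized() above derives the subgroup-size log2 as
// hlsl::findMSB(MaxSubgroupSize); for the power-of-two subgroup sizes Vulkan
// reports, the MSB index is exactly the log2. A quick self-contained check of
// that identity, using C++20's std::bit_width as a stand-in for findMSB:
#include <bit>
#include <cstdint>

static_assert(std::bit_width(32u) - 1 == 5);             // findMSB(32) == 5
static_assert(std::bit_width(64u) - 1 == 6);             // findMSB(64) == 6 (wave64)
static_assert((1u << (std::bit_width(32u) - 1)) == 32u); // 1 << log2 round-trips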
+
+	virtual bool onAppTerminated() override
+	{
+		return true;
+	}
+
+	// the benchmark is recorded and submitted once per frame from the work loop below
+	void workLoopBody() override
+	{
+		const auto resourceIx = m_realFrameIx % MaxFramesInFlight;
+
+		const uint32_t framesInFlight = core::min(MaxFramesInFlight, m_surface->getMaxAcquiresInFlight());
+
+		if (m_realFrameIx >= framesInFlight)
+		{
+			const ISemaphore::SWaitInfo cbDonePending[] =
+			{
+				{
+					.semaphore = m_semaphore.get(),
+					.value = m_realFrameIx + 1 - framesInFlight
+				}
+			};
+			if (m_device->blockForSemaphores(cbDonePending) != ISemaphore::WAIT_RESULT::SUCCESS)
+				return;
+		}
+
+		m_currentImageAcquire = m_surface->acquireNextImage();
+		if (!m_currentImageAcquire)
+			return;
+
+		auto* const cmdbuf = m_cmdBufs.data()[resourceIx].get();
+		cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT);
+		cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+		const auto MaxSubgroupSize = m_physicalDevice->getLimits().maxSubgroupSize;
+		const auto SubgroupSizeLog2 = hlsl::findMSB(MaxSubgroupSize);
+
+		cmdbuf->bindDescriptorSets(EPBP_COMPUTE, benchSets[0].pipeline->getLayout(), 0u, 1u, &benchDs[m_currentImageAcquire.imageIndex].get());
+		cmdbuf->pushConstants(benchSets[0].pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstantData), &pc);
+
+		for (uint32_t i = 0; i < benchSets.size(); i++)
+			runBenchmark(cmdbuf, benchSets[i], ElementCount, SubgroupSizeLog2);
+
+		// barrier transition to PRESENT
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::image_barrier_t imageBarriers[1];
+			imageBarriers[0].barrier = {
+				.dep = {
+					.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+					.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+					.dstStageMask = PIPELINE_STAGE_FLAGS::NONE,
+					.dstAccessMask = ACCESS_FLAGS::NONE
+				}
+			};
+			imageBarriers[0].image = m_surface->getSwapchainResources()->getImage(m_currentImageAcquire.imageIndex);
+			imageBarriers[0].subresourceRange = {
+				.aspectMask = IImage::EAF_COLOR_BIT,
+				.baseMipLevel = 0u,
+				.levelCount = 1u,
+				.baseArrayLayer = 0u,
+				.layerCount = 1u
+			};
+			imageBarriers[0].oldLayout = IImage::LAYOUT::UNDEFINED;
+			imageBarriers[0].newLayout = IImage::LAYOUT::PRESENT_SRC;
+
+			cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = imageBarriers });
+		}
+
+		cmdbuf->end();
+
+		// submit
+		{
+			auto* queue = getGraphicsQueue();
+			const IQueue::SSubmitInfo::SSemaphoreInfo rendered[] =
+			{
+				{
+					.semaphore = m_semaphore.get(),
+					.value = ++m_realFrameIx,
+					.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+				}
+			};
+			{
+				{
+					const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] =
+					{
+						{.cmdbuf = cmdbuf }
+					};
+
+					const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] =
+					{
+						{
+							.semaphore = m_currentImageAcquire.semaphore,
+							.value = m_currentImageAcquire.acquireCount,
+							.stageMask = PIPELINE_STAGE_FLAGS::NONE
+						}
+					};
+					const IQueue::SSubmitInfo infos[] =
+					{
+						{
+							.waitSemaphores = acquired,
+							.commandBuffers = commandBuffers,
+							.signalSemaphores = rendered
+						}
+					};
+
+					if (queue->submit(infos) == IQueue::RESULT::SUCCESS)
+					{
+						const nbl::video::ISemaphore::SWaitInfo waitInfos[] =
+						{ {
+							.semaphore = m_semaphore.get(),
+							.value = m_realFrameIx
+						} };
+
+						m_device->blockForSemaphores(waitInfos); // this is not a real solution, just a quick workaround to avoid throwing validation errors
+					}
+					else
+						--m_realFrameIx;
+				}
+			}
+
+			m_surface->present(m_currentImageAcquire.imageIndex, rendered);
+		}
+
+		numSubmits++;
+	}
+
+	//
+	bool keepRunning() override { return numSubmits < MaxNumSubmits; }
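// The pacing logic in workLoopBody() in one line: frame N (0-based
// m_realFrameIx == N) signals timeline value N+1, so before recording frame N
// with F frames in flight we wait for value N+1-F, i.e. until the frame that
// last used this command buffer has retired. A tiny worked example, assuming
// F == MaxFramesInFlight == 5:
#include <cstdint>

constexpr uint64_t waitValueBeforeRecording(uint64_t realFrameIx, uint64_t framesInFlight)
{
	// only start waiting once enough frames have been submitted
	return realFrameIx >= framesInFlight ? realFrameIx + 1 - framesInFlight : 0;
}
static_assert(waitValueBeforeRecording(4, 5) == 0); // first 5 frames never block
static_assert(waitValueBeforeRecording(5, 5) == 1); // frame 5 waits on frame 0's signal (value 1)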
+
+private:
+	// create pipeline (specialized every test) [TODO: turn into a future/async]
+	smart_refctd_ptr createPipeline(const ICPUShader* overridenUnspecialized, const IGPUPipelineLayout* layout, const uint8_t subgroupSizeLog2)
+	{
+		auto shader = m_device->createShader(overridenUnspecialized);
+		IGPUComputePipeline::SCreationParams params = {};
+		params.layout = layout;
+		params.shader = {
+			.entryPoint = "main",
+			.shader = shader.get(),
+			.entries = nullptr,
+			.requiredSubgroupSize = static_cast(subgroupSizeLog2),
+			.requireFullSubgroups = true
+		};
+		core::smart_refctd_ptr pipeline;
+		if (!m_device->createComputePipelines(nullptr,{&params,1},&pipeline))
+			return nullptr;
+		return pipeline;
+	}
+
+	struct BenchmarkSet
+	{
+		smart_refctd_ptr pipeline;
+		uint32_t workgroupSize;
+		uint32_t itemsPerInvocation;
+	};
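// createPipeline() above sets requireFullSubgroups, which is only valid when
// the workgroup size is a multiple of the subgroup size so that no partial
// subgroup exists. A quick check for the sizes this bench sweeps (subgroup
// size 32 assumed purely for illustration):
#include <cstdint>

constexpr uint32_t assumedSubgroupSize = 32u;
constexpr uint32_t benchWorkgroupSizes[] = { 32, 64, 128, 256, 512, 1024 };

constexpr bool allWorkgroupsFull()
{
	for (uint32_t wg : benchWorkgroupSizes)
		if (wg % assumedSubgroupSize != 0u)
			return false;
	return true;
}
static_assert(allWorkgroupsFull(), "requireFullSubgroups would be invalid otherwise");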
+
+	template
+	BenchmarkSet createBenchmarkPipelines(const smart_refctd_ptr& source, const IGPUPipelineLayout* layout, const uint32_t elementCount, const std::string& arith_name, const uint8_t subgroupSizeLog2, const uint32_t workgroupSize, uint32_t itemsPerInvoc = 1u, uint32_t numLoops = 8u)
+	{
+		auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system));
+		CHLSLCompiler::SOptions options = {};
+		options.stage = IShader::E_SHADER_STAGE::ESS_COMPUTE;
+		options.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion;
+		options.spirvOptimizer = nullptr;
+#ifndef _NBL_DEBUG
+		ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO;
+		auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1));
+		options.spirvOptimizer = opt.get();
+#else
+		options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
+#endif
+		options.preprocessorOptions.sourceIdentifier = source->getFilepathHint();
+		options.preprocessorOptions.logger = m_logger.get();
+
+		auto* includeFinder = compiler->getDefaultIncludeFinder();
+		options.preprocessorOptions.includeFinder = includeFinder;
+
+		const uint32_t subgroupSize = 0x1u << subgroupSizeLog2;
+		const uint32_t workgroupSizeLog2 = hlsl::findMSB(workgroupSize);
+		hlsl::workgroup2::SArithmeticConfiguration wgConfig;
+		wgConfig.init(workgroupSizeLog2, subgroupSizeLog2, itemsPerInvoc);
+		const uint32_t itemsPerWG = wgConfig.VirtualWorkgroupSize * wgConfig.ItemsPerInvocation_0;
+		smart_refctd_ptr overriddenUnspecialized;
+		if constexpr (WorkgroupBench)
+		{
+			const std::string definitions[4] = {
+				"workgroup2::" + arith_name,
+				wgConfig.getConfigTemplateStructString(),
+				std::to_string(numLoops),
+				std::to_string(arith_name=="reduction")
+			};
+
+			const IShaderCompiler::SMacroDefinition defines[5] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_CONFIG_T", definitions[1] },
+				{ "NUM_LOOPS", definitions[2] },
+				{ "IS_REDUCTION", definitions[3] },
+				{ "TEST_NATIVE", "1" }
+			};
+			if (UseNativeArithmetic)
+				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
+
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
+		else
+		{
+			hlsl::subgroup2::SArithmeticParams sgParams;
+			sgParams.init(subgroupSizeLog2, itemsPerInvoc);
+
+			const std::string definitions[4] = {
+				"subgroup2::" + arith_name,
+				std::to_string(workgroupSize),
+				sgParams.getParamTemplateStructString(),
+				std::to_string(numLoops)
+			};
+
+			const IShaderCompiler::SMacroDefinition defines[5] = {
+				{ "OPERATION", definitions[0] },
+				{ "WORKGROUP_SIZE", definitions[1] },
+				{ "SUBGROUP_CONFIG_T", definitions[2] },
+				{ "NUM_LOOPS", definitions[3] },
+				{ "TEST_NATIVE", "1" }
+			};
+			if (UseNativeArithmetic)
+				options.preprocessorOptions.extraDefines = { defines, defines + 5 };
+			else
+				options.preprocessorOptions.extraDefines = { defines, defines + 4 };
+
+			overriddenUnspecialized = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options);
+		}
+
+		BenchmarkSet set;
+		set.pipeline = createPipeline(overriddenUnspecialized.get(), layout, subgroupSizeLog2);
+		if constexpr (WorkgroupBench)
+		{
+			set.workgroupSize = itemsPerWG;
+		}
+		else
+		{
+			set.workgroupSize = workgroupSize;
+		}
+		set.itemsPerInvocation = itemsPerInvoc;
+
+		return set;
+	};
+
+	template
+	void runBenchmark(IGPUCommandBuffer* cmdbuf, const BenchmarkSet& set, const uint32_t elementCount, const uint8_t subgroupSizeLog2)
+	{
+		uint32_t workgroupCount;
+		if constexpr (WorkgroupBench)
+			workgroupCount = elementCount / set.workgroupSize;
+		else
+			workgroupCount = elementCount / (set.workgroupSize * set.itemsPerInvocation);
+
+		cmdbuf->bindComputePipeline(set.pipeline.get());
+		cmdbuf->dispatch(workgroupCount, 1, 1);
+		{
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo::buffer_barrier_t memoryBarrier[OutputBufferCount];
+			for (auto i = 0u; i < OutputBufferCount; i++)
+			{
+				memoryBarrier[i] = {
+					.barrier = {
+						.dep = {
+							.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+							.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+							// in theory we don't need the HOST BITS because we block on a semaphore, but might as well add them
+							.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT | PIPELINE_STAGE_FLAGS::HOST_BIT,
+							.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS | ACCESS_FLAGS::HOST_READ_BIT
+						}
+					},
+					.range = {0ull,outputBuffers[i]->getSize(),outputBuffers[i]}
+				};
+			}
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo info = { .memBarriers = {},.bufBarriers = memoryBarrier };
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, info);
+		}
+	}
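// Dispatch sizing in runBenchmark() with concrete numbers: ElementCount is
// 1024*1024, and in the workgroup bench `set.workgroupSize` actually holds
// itemsPerWG (see createBenchmarkPipelines above). The itemsPerWG value below
// is only an example; the real one comes from SArithmeticConfiguration.
#include <cstdint>

constexpr uint32_t kElementCount = 1024u * 1024u;

// workgroup bench: one workgroup consumes itemsPerWG elements
constexpr uint32_t kItemsPerWG = 1024u;              // e.g. 256 invocations * 4 items each
static_assert(kElementCount / kItemsPerWG == 1024u); // -> dispatch(1024, 1, 1)

// subgroup bench: divide by workgroupSize * itemsPerInvocation instead
static_assert(kElementCount / (256u * 4u) == 1024u);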
+
+	IQueue* transferDownQueue;
+	IQueue* computeQueue;
+
+	smart_refctd_ptr m_window;
+	smart_refctd_ptr> m_surface;
+	smart_refctd_ptr m_semaphore;
+	uint64_t m_realFrameIx = 0;
+	std::array, MaxFramesInFlight> m_cmdBufs;
+	ISimpleManagedSurface::SAcquireResult m_currentImageAcquire = {};
+
+	smart_refctd_ptr m_inputSystem;
+
+	std::array, ISwapchain::MaxImages> swapchainImageViews;
+
+	constexpr static inline uint32_t MaxNumSubmits = 30;
+	uint32_t numSubmits = 0;
+	constexpr static inline uint32_t ElementCount = 1024 * 1024;
+
+	/* PARAMETERS TO CHANGE FOR DIFFERENT BENCHMARKS */
+	constexpr static inline bool DoWorkgroupBenchmarks = true;
+	constexpr static inline bool UseNativeArithmetic = true;
+	uint32_t ItemsPerInvocation = 4u;
+	constexpr static inline uint32_t NumLoops = 1000u;
+	constexpr static inline uint32_t NumBenchmarks = 6u;
+	std::array workgroupSizes = { 32, 64, 128, 256, 512, 1024 };
+	std::array arithmeticOperations = { "reduction", "inclusive_scan", "exclusive_scan" };
+
+
+	std::array benchSets;
+	smart_refctd_ptr benchPool;
+	std::array, ISwapchain::MaxImages> benchDs;
+
+	constexpr static inline uint32_t OutputBufferCount = 2u;
+	smart_refctd_ptr outputBuffers[OutputBufferCount];
+	smart_refctd_ptr gpuOutputAddressesBuffer;
+	PushConstantData pc;
+
+	uint64_t timelineValue = 0;
+};
+
+NBL_MAIN_FUNC(ArithmeticBenchApp)
\ No newline at end of file
diff --git a/29_Arithmetic2Bench/pipeline.groovy b/29_Arithmetic2Bench/pipeline.groovy
new file mode 100644
index 000000000..7ea9947e0
--- /dev/null
+++ b/29_Arithmetic2Bench/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CArithmeticUnitTestBuilder extends IBuilder
+{
+	public CArithmeticUnitTestBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+
+		return true
+	}
+
+	@Override
+	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CArithmeticUnitTestBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b3279a48..31ebaddf9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,12 +58,13 @@ if(NBL_BUILD_EXAMPLES)
 		add_subdirectory(20_AllocatorTest EXCLUDE_FROM_ALL)
 		add_subdirectory(21_LRUCacheUnitTest EXCLUDE_FROM_ALL)
 		add_subdirectory(22_CppCompat EXCLUDE_FROM_ALL)
-		add_subdirectory(23_ArithmeticUnitTest EXCLUDE_FROM_ALL)
+		add_subdirectory(23_Arithmetic2UnitTest EXCLUDE_FROM_ALL)
 		add_subdirectory(24_ColorSpaceTest EXCLUDE_FROM_ALL)
 		add_subdirectory(25_FilterTest EXCLUDE_FROM_ALL)
 		add_subdirectory(26_Blur EXCLUDE_FROM_ALL)
 		add_subdirectory(27_MPMCScheduler EXCLUDE_FROM_ALL)
 		add_subdirectory(28_FFTBloom EXCLUDE_FROM_ALL)
+		add_subdirectory(29_Arithmetic2Bench EXCLUDE_FROM_ALL)
 
 	# add_subdirectory(36_CUDAInterop EXCLUDE_FROM_ALL)
 
 	# Showcase compute pathtracing
diff --git a/common/include/WorkgroupDataAccessors.hlsl b/common/include/WorkgroupDataAccessors.hlsl
new file mode 100644
index 000000000..7287a4135
--- /dev/null
+++ b/common/include/WorkgroupDataAccessors.hlsl
@@ -0,0 +1,124 @@
+#ifndef _WORKGROUP_DATA_ACCESSORS_HLSL_
+#define _WORKGROUP_DATA_ACCESSORS_HLSL_
+
+#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+
+struct ScratchProxy
+{
+	template
+	void get(const uint32_t ix, NBL_REF_ARG(AccessType) value)
+	{
+		value = scratch[ix];
+	}
+	template
+	void set(const uint32_t ix, const AccessType value)
+	{
+		scratch[ix] = value;
+	}
+
+	uint32_t atomicOr(const uint32_t ix, const uint32_t value)
+	{
+		return glsl::atomicOr(scratch[ix],value);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+		//glsl::memoryBarrierShared(); implied by the above
+	}
+};
+
+template
+struct DataProxy
+{
+	using dtype_t = vector;
+	// function template AccessType should be the same as dtype_t
+
+	static DataProxy create(const uint64_t inputBuf, const uint64_t outputBuf)
+	{
+		DataProxy retval;
+		const uint32_t workgroupOffset = glsl::gl_WorkGroupID().x * VirtualWorkgroupSize * sizeof(dtype_t);
+		retval.accessor = DoubleLegacyBdaAccessor::create(inputBuf + workgroupOffset, outputBuf + workgroupOffset);
+		return retval;
+	}
+
+	template
+	void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
+	{
+		accessor.get(ix, value);
+	}
+	template
+	void set(const IndexType ix, const AccessType value)
+	{
+		accessor.set(ix, value);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+		//glsl::memoryBarrierShared(); implied by the above
+	}
+
+	DoubleLegacyBdaAccessor accessor;
+};
+
+template
+struct PreloadedDataProxy
+{
+	using dtype_t = vector;
+
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t WorkgroupSize = uint16_t(1u) << WorkgroupSizeLog2;
+	NBL_CONSTEXPR_STATIC_INLINE uint16_t PreloadedDataCount = VirtualWorkgroupSize / WorkgroupSize;
+
+	static PreloadedDataProxy create(const uint64_t inputBuf, const uint64_t outputBuf)
+	{
+		PreloadedDataProxy retval;
+		retval.data = DataProxy::create(inputBuf, outputBuf);
+		return retval;
+	}
+
+	template
+	void get(const IndexType ix, NBL_REF_ARG(AccessType) value)
+	{
+		value = preloaded[ix>>WorkgroupSizeLog2];
+	}
+	template
+	void set(const IndexType ix, const AccessType value)
+	{
+		preloaded[ix>>WorkgroupSizeLog2] = value;
+	}
+
+	void preload()
+	{
+		const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+		[unroll]
+		for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+			data.template get(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
+	}
+	void unload()
+	{
+		const uint16_t invocationIndex = workgroup::SubgroupContiguousIndex();
+		[unroll]
+		for (uint16_t idx = 0; idx < PreloadedDataCount; idx++)
+			data.template set(idx * WorkgroupSize + invocationIndex, preloaded[idx]);
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+		//glsl::memoryBarrierShared(); implied by the above
+	}
+
+	DataProxy data;
+	dtype_t preloaded[PreloadedDataCount];
+};
+
+}
+}
+
+#endif
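// Why PreloadedDataProxy above can service get(ix) with preloaded[ix >> WorkgroupSizeLog2]:
// element ix = idx*WorkgroupSize + invocationIndex lands in slot idx of the
// invocation with that invocationIndex, so within one invocation
// ix >> WorkgroupSizeLog2 recovers idx exactly. A host-side check of the
// index algebra (illustrative sizes):
#include <cstdint>

constexpr uint32_t WorkgroupSizeLog2 = 8;                 // 256 invocations
constexpr uint32_t WorkgroupSize = 1u << WorkgroupSizeLog2;

constexpr bool mappingHolds()
{
	for (uint32_t idx = 0; idx < 4; idx++)                // e.g. PreloadedDataCount == 4
		for (uint32_t inv = 0; inv < WorkgroupSize; inv++)
			if (((idx * WorkgroupSize + inv) >> WorkgroupSizeLog2) != idx)
				return false;
	return true;
}
static_assert(mappingHolds(), "strided preload must round-trip");
// The idx*WorkgroupSize + invocationIndex stride also means neighbouring
// invocations touch neighbouring addresses in preload()/unload(), keeping the
// global-memory loads and stores coalesced.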