diff --git a/03_DeviceSelectionAndSharedSources/main.cpp b/03_DeviceSelectionAndSharedSources/main.cpp
index d9a8d9b1a..28872a5f5 100644
--- a/03_DeviceSelectionAndSharedSources/main.cpp
+++ b/03_DeviceSelectionAndSharedSources/main.cpp
@@ -32,7 +32,7 @@ class DeviceSelectionAndSharedSourcesApp final : public examples::MonoDeviceAppl
 	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
 	{
 		// Remember to call the base class initialization!
-		if (!device_base_t::onAppInitialized(std::move(system)))
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
 			return false;
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
diff --git a/22_CppCompat/test.hlsl.orig b/22_CppCompat/test.hlsl.orig
deleted file mode 100644
index b67b08062..000000000
--- a/22_CppCompat/test.hlsl.orig
+++ /dev/null
@@ -1,264 +0,0 @@
-//// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
-//// This file is part of the "Nabla Engine".
-//// For conditions of distribution and use, see copyright notice in nabla.h
-
-#pragma shader_stage(compute)
-
-#define STATIC_ASSERT(C) { nbl::hlsl::conditional<C, int, void>::type a = 0; }
-
-#define IS_SAME(L,R) nbl::hlsl::is_same<L,R>::value
-#define SHADER_CRASHING_ASSERT(expr) \
-{ \
-	bool con = (expr); \
-	do { \
-		[branch] if (!con) \
-			vk::RawBufferStore(0xdeadbeefBADC0FFbull,0x45u,4u); \
-	} while(!con); \
-}
-
-
-#include 
-#include 
-
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-
-#include 
-
-#include 
-#include 
-
-#include 
-
-struct PushConstants
-{
-	uint2 imgSize;
-};
-
-[[vk::push_constant]]
-PushConstants u_pushConstants;
-
-[[vk::binding(0, 0)]] Texture2D inImage;
-[[vk::binding(1, 0)]] RWTexture2D outImage;
-[[vk::binding(2, 0)]] Buffer inBuffer;
-[[vk::binding(3, 0)]] RWStructuredBuffer outBuffer;
-
-
-template<int N>
-struct Spec
-{
-	static const int value = Spec<N - 1>::value + 1;
-};
-
-template<>
-struct Spec<0>
-{
-	static const int value = 0;
-};
-
-Buffer unbounded[];
-
-template<typename T>
-bool val(T) { return nbl::hlsl::is_unbounded_array<T>::value; }
-
-
-template<typename T, uint32_t N>
-struct array
-{
-	T data[N];
-};
-
-void fill(uint3 invocationID, float val)
-{
-	outImage[invocationID.xy] = float4(val, val, val, val);
-	outBuffer[invocationID.x * invocationID.y] = float4(val, val, val, val);
-}
-
-void fill(uint3 invocationID, float4 val)
-{
-	outImage[invocationID.xy] = val;
-	outBuffer[invocationID.x * invocationID.y] = val;
-}
-
-
-[numthreads(8, 8, 1)]
-void main(uint3 invocationID : SV_DispatchThreadID)
-{
-	fill(invocationID, 1);
-	const float32_t3 TEST_VEC = float32_t3(1.0f, 2.0f, 3.0f);
-
-	fill(invocationID, 2);
-	// test functions from EOTF.hlsl
-	nbl::hlsl::colorspace::eotf::identity(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::impl_shared_2_4(TEST_VEC, 0.5f);
-	nbl::hlsl::colorspace::eotf::sRGB(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::Display_P3(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::DCI_P3_XYZ(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::SMPTE_170M(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::SMPTE_ST2084(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::HDR10_HLG(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::AdobeRGB(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::Gamma_2_2(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::ACEScc(TEST_VEC);
-	nbl::hlsl::colorspace::eotf::ACEScct(TEST_VEC);
-
-	fill(invocationID, 3);
-
-	// test functions from OETF.hlsl
-	nbl::hlsl::colorspace::oetf::identity(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::impl_shared_2_4(TEST_VEC, 0.5f);
-	nbl::hlsl::colorspace::oetf::sRGB(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::Display_P3(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::DCI_P3_XYZ(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::SMPTE_170M(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::SMPTE_ST2084(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::HDR10_HLG(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::AdobeRGB(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::Gamma_2_2(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::ACEScc(TEST_VEC);
-	nbl::hlsl::colorspace::oetf::ACEScct(TEST_VEC);
-
-	fill(invocationID, 4);
-	// xoroshiro tests
-	const uint32_t2 state = uint32_t2(12u, 34u);
-	nbl::hlsl::Xoroshiro64Star xoroshiro64Star = nbl::hlsl::Xoroshiro64Star::construct(state);
-	xoroshiro64Star();
-	nbl::hlsl::Xoroshiro64StarStar xoroshiro64StarStar = nbl::hlsl::Xoroshiro64StarStar::construct(state);
-	xoroshiro64StarStar();
-
-	//nbl::hlsl::mpl::countl_zero<2ull>::value;
-
-	// TODO: test if std::rotl/r == nbl::hlsl::rotr/l == nbl::hlsl::mpl::rotr/l
-
-	// uint32_t mplRotlResult0 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t mplRotlResult1 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t mplRotrResult0 = nbl::hlsl::mpl::rotr::value;
-	// uint32_t mplRotrResult1 = nbl::hlsl::mpl::rotr::value;
-
-	// uint32_t rotlResult0 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t rotlResult1 = nbl::hlsl::mpl::rotl::value;
-	// uint32_t rotrResult0 = nbl::hlsl::mpl::rotr::value;
-	// uint32_t rotrResult1 = nbl::hlsl::mpl::rotr::value;
-
-	// SHADER_CRASHING_ASSERT(rotlResult0 == mplRotlResult0);
-	// SHADER_CRASHING_ASSERT(rotlResult1 == mplRotlResult1);
-	// SHADER_CRASHING_ASSERT(rotrResult0 == mplRotrResult0);
-	// SHADER_CRASHING_ASSERT(rotrResult1 == mplRotrResult1);
-
-	// TODO: more tests and compare with cpp version as well
-	fill(invocationID, 5);
-	// countl_zero test
-	{
-		static const uint16_t TEST_VALUE_0 = 5;
-		static const uint32_t TEST_VALUE_1 = 0x80000000u;
-		static const uint32_t TEST_VALUE_2 = 0x8000000000000000u;
-		static const uint32_t TEST_VALUE_3 = 0x00000001u;
-		static const uint32_t TEST_VALUE_4 = 0x0000000000000001u;
-
-
-		fill(invocationID, 5.01);
-		uint16_t compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_0>::value;
-		uint16_t runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_0);
-		fill(invocationID, float4(5.1, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_1>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_1);
-		fill(invocationID, float4(5.2, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_2>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_2);
-		fill(invocationID, float4(5.3, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_3>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_3);
-		fill(invocationID, float4(5.4, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-
-		compileTimeCountLZero = nbl::hlsl::mpl::countl_zero<TEST_VALUE_4>::value;
-		runTimeCountLZero = nbl::hlsl::countl_zero(TEST_VALUE_4);
-		fill(invocationID, float4(5.5, compileTimeCountLZero, runTimeCountLZero, 0));
-		SHADER_CRASHING_ASSERT(compileTimeCountLZero == runTimeCountLZero);
-	}
-
-	{
-		bool A = Spec<3>::value == 3;
-	}
-	{
-		bool A = nbl::hlsl::is_integral::value;
-	}
-	{
-		bool A = val(unbounded);
-	}
-	fill(invocationID, 6);
-	{
-		float4 v;
-		fill(invocationID, float4(alignof(v.x), alignof(v), 0, 0));
-		SHADER_CRASHING_ASSERT(alignof(v.x) == alignof(v));
-	}
-
-	{
-		float4 v;
-		const volatile float4 u;
-
-		STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference::type));
-		STATIC_ASSERT(nbl::hlsl::impl::is_reference::value);
-		STATIC_ASSERT(IS_SAME(float, nbl::hlsl::impl::remove_reference::type));
-		STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference::type));
-		STATIC_ASSERT(IS_SAME(decltype(v.x), nbl::hlsl::impl::add_lvalue_reference::type>::type));
-
-		STATIC_ASSERT(IS_SAME(float, nbl::hlsl::remove_cvref::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv::type, nbl::hlsl::impl::add_lvalue_reference::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_cv::type, nbl::hlsl::impl::add_lvalue_reference::type>::type));
-	}
-	fill(invocationID, 7);
-	{
-		float x[4][4];
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_extent<decltype(x)>::type, float[4]));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::remove_all_extents<decltype(x)>::type, float));
-	}
-	fill(invocationID, 8);
-	{
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type));
-
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type));
-
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_signed::type, nbl::hlsl::make_signed::type));
-		STATIC_ASSERT(IS_SAME(nbl::hlsl::make_unsigned::type, nbl::hlsl::make_unsigned::type));
-	}
-
-	{
-		int Q[3][4][5];
-		STATIC_ASSERT(3 == (nbl::hlsl::extent<decltype(Q)>::value));
-		STATIC_ASSERT(4 == (nbl::hlsl::extent<decltype(Q), 1>::value));
-		STATIC_ASSERT(5 == (nbl::hlsl::extent<decltype(Q), 2>::value));
-		STATIC_ASSERT(0 == (nbl::hlsl::extent<decltype(Q), 3>::value));
-	}
-	fill(invocationID, 9);
-	{
-		float32_t a;
-
-		a = nbl::hlsl::numeric_limits<float32_t>::min;
-		a = nbl::hlsl::numeric_limits<float32_t>::max;
-		a = nbl::hlsl::numeric_limits<float32_t>::lowest;
-		a = nbl::hlsl::numeric_limits<float32_t>::epsilon;
-		a = nbl::hlsl::numeric_limits<float32_t>::round_error;
-		a = nbl::hlsl::numeric_limits<float32_t>::infinity;
-		a = nbl::hlsl::numeric_limits<float32_t>::quiet_NaN;
-		a = nbl::hlsl::numeric_limits<float32_t>::signaling_NaN;
-		a = nbl::hlsl::numeric_limits<float32_t>::denorm_min;
-	}
-
-	fill(invocationID, 10);
-
-	fill(invocationID, -1);
-}
diff --git a/63_CUDAInterop/CMakeLists.txt b/63_CUDAInterop/CMakeLists.txt
new file mode 100644
index 000000000..bc1624875
--- /dev/null
+++ b/63_CUDAInterop/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+	file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+	foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+		LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+	endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/63_CUDAInterop/app_resources/vectorAdd_kernel.cu b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu
new file mode 100644
index 000000000..99c831121
--- /dev/null
+++ b/63_CUDAInterop/app_resources/vectorAdd_kernel.cu
@@ -0,0 +1,42 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+
+extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
+                                     int numElements) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if (i < numElements) {
+    C[i] = A[i] + B[i];
+  }
+}
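For orientation before the interop code: the example drives this kernel through the CUDA driver API (NVRTC to PTX, then cuModuleLoadDataEx and cuLaunchKernel). For comparison, a minimal runtime-API sketch of the same launch, assuming nvcc compiles it together with the kernel above; the names here are illustrative, not part of the example:

    // Hypothetical standalone host program; N mirrors numElements in main.cpp.
    #include <cuda_runtime.h>
    #include <cstdio>

    extern "C" __global__ void vectorAdd(const float*, const float*, float*, int);

    int main()
    {
        const int N = 4096 * 1024;
        float *A, *B, *C;
        cudaMallocManaged(&A, N * sizeof(float)); // unified memory, reachable from host and device
        cudaMallocManaged(&B, N * sizeof(float));
        cudaMallocManaged(&C, N * sizeof(float));
        for (int i = 0; i < N; ++i) { A[i] = 1.f; B[i] = 2.f; }
        vectorAdd<<<4096, 1024>>>(A, B, C, N);    // same gridDim/blockDim as the example
        cudaDeviceSynchronize();                  // wait for the kernel to finish
        printf("C[0] = %f\n", C[0]);              // expect 3.0
        cudaFree(A); cudaFree(B); cudaFree(C);
    }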
diff --git a/63_CUDAInterop/main.cpp b/63_CUDAInterop/main.cpp
new file mode 100644
index 000000000..9508fa22a
--- /dev/null
+++ b/63_CUDAInterop/main.cpp
@@ -0,0 +1,538 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#include "nbl/video/CCUDAHandler.h"
+#include "nbl/video/CCUDASharedMemory.h"
+#include "nbl/video/CCUDASharedSemaphore.h"
+
+#include "../common/MonoDeviceApplication.hpp"
+#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace video;
+
+/*
+The main function starts like in most other examples: we ask the
+user for the desired renderer and start it up.
+*/
+
+bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line)
+{
+	if (auto re = err; CUDA_SUCCESS != re)
+	{
+		const char* name = 0, * str = 0;
+		cu.pcuGetErrorName(re, &name);
+		cu.pcuGetErrorString(re, &str);
+		logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str);
+		return false;
+	}
+	return true;
+}
+
+bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
+{
+	if (auto re = err; NVRTC_SUCCESS != re)
+	{
+		const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re);
+		logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
+		return false;
+	}
+	return true;
+}
+
+#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); }
+#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
+
+#ifndef _NBL_COMPILE_WITH_CUDA_
+static_assert(false);
+#endif
+
+class CUDA2VKApp : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = examples::MonoDeviceApplication;
+	using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;
+
+	static constexpr uint32_t gridDim[3] = { 4096,1,1 };
+	static constexpr uint32_t blockDim[3] = { 1024,1,1 };
+	static constexpr size_t numElements = gridDim[0] * blockDim[0];
+	static constexpr size_t size = sizeof(float) * numElements;
+public:
+	// Yay thanks to multiple inheritance we cannot forward ctors anymore
+	CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	smart_refctd_ptr<CCUDAHandler> cudaHandler;
+	smart_refctd_ptr<CCUDADevice> cudaDevice;
+
+	IQueue* queue;
+
+	// CPU memory which we fill with random numbers in [0,1] that will be copied to the corresponding cudaMemory
+	std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
+	// CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
+	// The kernel writes to cudaMemories[2], which we later export and read on the Nabla side
+	std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
+	// A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and the Nabla device-to-host transfer
+	smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
+
+	// our Buffer that is bound to cudaMemories[2]
+	smart_refctd_ptr<IGPUBuffer> importedBuf;
+	// our Image that is also bound to cudaMemories[2]
+	smart_refctd_ptr<IGPUImage> importedImg;
+
+	// host visible buffers that we use to copy from the resources above after the CUDA kernel is done writing
+	smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
+
+	// Nabla semaphore for sync
+	smart_refctd_ptr<ISemaphore> semaphore;
+
+	smart_refctd_ptr<IGPUCommandPool> commandPool;
+	smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
+
+	// a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc.
+	core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const override
+	{
+		auto devices = device_base_t::filterDevices(physicalDevices);
+		auto& cuDevices = cudaHandler->getAvailableDevices();
+		// only keep physical devices whose UUID matches one of the available CUDA devices
+		std::erase_if(devices, [&cuDevices](auto pdev) {
+			return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16); });
+		});
+		return devices;
+	}
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		// Remember to call the base class initialization!
+		if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+
+		cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger));
+		if (!cudaHandler)
+			return logFail("Could not create a CUDA handler!");
+
+		if (!device_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
+		if (!cudaDevice)
+			return logFail("Could not create a CUDA Device!");
+
+		queue = device_base_t::getComputeQueue();
+
+		createResources();
+
+		smart_refctd_ptr<ICPUBuffer> ptx;
+		{
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = ""; // virtual root
+			// this time we load a shader directly from a file
+			auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty())
+				return logFail("Could not load kernel!");
+
+			smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
+			std::string log;
+			auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()),
+				"app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
+			ASSERT_SUCCESS_NV(res, log);
+
+			ptx = std::move(ptx_);
+		}
+		CUmodule module;
+		CUfunction kernel;
+		CUstream stream;
+
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+
+		ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+		ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+		ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+
+		launchKernel(kernel, stream);
+
+		ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+		ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+		ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+
+		m_device->waitIdle();
+
+		testInterop();
+
+		return true;
+	}
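compileDirectlyToPTX above wraps NVRTC; a sketch of the raw calls it stands in for, with error handling elided and `src` standing for the .cu text just loaded:

    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "vectorAdd_kernel.cu", 0, nullptr, nullptr);
    nvrtcCompileProgram(prog, 0, nullptr); // options such as --gpu-architecture would go here
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::string ptxStr(ptxSize, '\0');
    nvrtcGetPTX(prog, ptxStr.data());      // this PTX is what cuModuleLoadDataEx consumes
    nvrtcDestroyProgram(&prog);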
+
+	void createResources()
+	{
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+
+		for (auto& buf : cpuBufs)
+			buf = make_smart_refctd_ptr<ICPUBuffer>(size);
+
+		for (auto j = 0; j < 2; j++)
+			for (auto i = 0; i < numElements; i++)
+				reinterpret_cast<float*>(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX);
+
+		// create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+		ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+
+		semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+		ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get()));
+		{
+			// export the CUmem we have just created into a refctd IDeviceMemoryAllocation
+			auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
+			if (!devmemory)
+				logFail("Failed to export CUDA memory!");
+
+			// create an importing external buffer on the Nabla side
+			IGPUBuffer::SCreationParams params = {};
+			params.size = devmemory->getAllocationSize();
+			params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+			params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+			importedBuf = m_device->createBuffer(std::move(params));
+			if (!importedBuf)
+				logFail("Failed to create an external buffer");
+
+			// bind that imported IDeviceMemoryAllocation to the external buffer we've just created
+			ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
+			bool re = m_device->bindBufferMemory(1, &bindInfo);
+			if (!re)
+				logFail("Failed to bind CUDA memory to buffer");
+		}
+
+		{
+			// same thing as above:
+			// we create an external image and bind the imported external memory to it,
+			// so now we have 2 different resources that are bound to the same memory
+			IImage::SCreationParams params = {};
+			params.type = IGPUImage::ET_2D;
+			params.samples = IGPUImage::ESCF_1_BIT;
+			params.format = EF_R32_SFLOAT;
+			params.extent = { gridDim[0], blockDim[0], 1 };
+			params.mipLevels = 1;
+			params.arrayLayers = 1;
+			params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
+			importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params));
+			if (!importedImg)
+				logFail("Failed to create an external image");
+		}
+
+		commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+		bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
+
+		stagingBufs[0] = createStaging();
+		stagingBufs[1] = createStaging();
+	}
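createResources binds one CUDA allocation (cudaMemories[2]) to both a buffer and an image, so the kernel's output can be read back through two different views. In raw Vulkan, the import half that exportAsMemory/bindBufferMemory wrap looks roughly like this (a sketch of the Win32 handle variant, not the Nabla API; all names are illustrative):

    VkImportMemoryWin32HandleInfoKHR importInfo = {
        .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR,
        .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
        .handle = cudaSharedHandle, // hypothetical HANDLE exported by the CUDA side
    };
    VkMemoryAllocateInfo allocInfo = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .pNext = &importInfo,
        .allocationSize = bufferSize,
        .memoryTypeIndex = memTypeIdx, // must satisfy the buffer's memory requirements
    };
    VkDeviceMemory mem;
    vkAllocateMemory(device, &allocInfo, nullptr, &mem); // then vkBindBufferMemory / vkBindImageMemory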
+
+	smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
+	{
+		IGPUBuffer::SCreationParams params = {};
+		params.size = mem->getAllocationSize();
+		params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+		params.externalHandleTypes = mem->getCreationParams().externalHandleType;
+		auto buf = m_device->createBuffer(std::move(params));
+		ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
+		m_device->bindBufferMemory(1, &bindInfo);
+		return buf;
+	}
+
+	smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz = size)
+	{
+		auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+		auto req = buf->getMemoryReqs();
+		req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+		auto allocation = m_device->allocate(req, buf.get());
+
+		void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+		if (!mapping)
+			logFail("Failed to map a staging buffer");
+		memset(mapping, 0, req.size);
+		return buf;
+	};
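launchKernel below opens by releasing queue family ownership of the shared resources to the external API, and later acquires them back. The release half of that handshake, in raw synchronization2 Vulkan terms (a sketch, not the Nabla wrapper that is actually used; names are assumptions):

    // Release half of a queue-family ownership transfer to an external API;
    // the matching ACQUIRE barrier must be recorded before Vulkan touches the data again.
    VkBufferMemoryBarrier2 release = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        .srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT,   // release side: source masks only
        .srcQueueFamilyIndex = computeFamilyIndex,       // assumption: the compute family used above
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_EXTERNAL, // hand off to CUDA
        .buffer = importedBufferHandle,                  // hypothetical VkBuffer behind importedBuf
        .offset = 0,
        .size = VK_WHOLE_SIZE,
    };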
+
+	void launchKernel(CUfunction kernel, CUstream stream)
+	{
+		// First we record a release ownership transfer to let Vulkan know that the resources are going to be used in an external API
+		{
+			IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = {
+				.barrier = {
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.range = { .buffer = importedBuf, },
+			};
+
+			IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = {
+				.barrier = {
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.image = importedImg.get(),
+				.subresourceRange = {
+					.aspectMask = IImage::EAF_COLOR_BIT,
+					.levelCount = 1u,
+					.layerCount = 1u,
+				}
+			};
+			// start recording
+			bool re = true;
+			re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+			re &= cmd[0]->end();
+
+			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 };
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
+			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
+			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+			re &= IQueue::RESULT::SUCCESS == submitRe;
+			if (!re)
+				logFail("Something went wrong readying resources for CUDA");
+		}
+
+		auto& cu = cudaHandler->getCUDAFunctionTable();
+		// Launch kernel
+		{
+			CUdeviceptr ptrs[] = {
+				cudaMemories[0]->getDeviceptr(),
+				cudaMemories[1]->getDeviceptr(),
+				cudaMemories[2]->getDeviceptr(),
+			};
+			auto numEles = numElements; // copied to a local so the launch below can take its address
+			void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
+			ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
+
+			auto semaphore = cudaSemaphore->getInternalObject();
+			CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+			ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for the release op from Vulkan
+			ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+			CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+			ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+		}
+
+		// After the CUDA kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+		{
+			IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = {
+				.barrier = {
+					.dep = {
+						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+						.dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
+					},
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.range = { .buffer = importedBuf, },
+			};
+			bool re = true;
+			re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			re &= cmd[1]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} });
+
+			IGPUCommandBuffer::SBufferCopy region = { .size = size };
+			re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
+
+			IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = {
+				.barrier = {
+					.dep = {
+						.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+						.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
+					},
+					.ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+					.otherQueueFamilyIndex = IQueue::FamilyExternal,
+				},
+				.image = importedImg.get(),
+				.subresourceRange = {
+					.aspectMask = IImage::EAF_COLOR_BIT,
+					.levelCount = 1u,
+					.layerCount = 1u,
+				},
+				.oldLayout = IImage::LAYOUT::PREINITIALIZED,
+				.newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+			};
+
+			re &= cmd[1]->pipelineBarrier(EDF_NONE, { .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+
+			IImage::SBufferCopy imgRegion = {
+				.imageSubresource = {
+					.aspectMask = imgBarrier.subresourceRange.aspectMask,
+					.layerCount = imgBarrier.subresourceRange.layerCount,
+				},
+				.imageExtent = importedImg->getCreationParameters().extent,
+			};
+
+			re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
+			re &= cmd[1]->end();
+
+			IQueue::SSubmitInfo::SSemaphoreInfo waitInfo = { .semaphore = semaphore.get(), .value = 2 };
+			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 };
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+			IQueue::SSubmitInfo submitInfo = {
+				.waitSemaphores = {&waitInfo,&waitInfo + 1},
+				.commandBuffers = {&cmdInfo, &cmdInfo + 1},
+				.signalSemaphores = {&signalInfo,&signalInfo + 1}
+			};
+			auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+			re &= IQueue::RESULT::SUCCESS == submitRe;
+			if (!re)
+				logFail("Something went wrong copying results from CUDA");
+		}
+
+		ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
+	}
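The pcuLaunchHostFunc call at the end of launchKernel hands a captureless lambda to a C callback API; such lambdas decay to plain function pointers, and all state travels through the void* cookie. The pattern in isolation (stand-in names, not the CUDA API):

    struct Ctx { bool done = false; };
    void enqueueCallback(void (*fn)(void*), void* userData) { fn(userData); } // stand-in
    void example()
    {
        Ctx ctx;
        // a captureless lambda converts to void(*)(void*); state comes in via the cookie
        enqueueCallback([](void* p) { static_cast<Ctx*>(p)->done = true; }, &ctx);
    }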
+
+	void kernelCallback()
+	{
+		// Make sure we are also done with the readback
+		auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3} };
+		m_device->waitForSemaphores(wait, true, -1);
+
+		float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
+		float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
+
+		float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
+		float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
+
+		if (memcmp(CBuf, CImg, size))
+			logFail("Buffer and Image memories do not match!");
+
+		for (auto i = 0; i < numElements; i++)
+		{
+			bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
+			if (!re)
+				logFail("Element at index %d is incorrect!", i);
+		}
+
+		std::cout << "Success\n";
+	}
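To recap the synchronization kernelCallback completes: a single timeline semaphore (the Nabla `semaphore`, imported into CUDA as `cudaSemaphore`) sequences all three parties. The value protocol, read off the submits and CUDA calls above:

    // value 1: signaled by the Vulkan ownership-release submit;
    //          cuWaitExternalSemaphoresAsync blocks the kernel on it
    // value 2: signaled by cuSignalExternalSemaphoresAsync after the kernel;
    //          the Vulkan readback submit waits on it
    // value 3: signaled by the readback submit;
    //          kernelCallback's waitForSemaphores blocks on it before reading the staging buffers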
+
+	void testInterop()
+	{
+		{
+			IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+				.size = size,
+				.memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+				.alignmentLog2 = 10,
+			};
+
+			for (size_t i = 0; i < (1 << 8); ++i)
+			{
+				auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+				assert(memory);
+				auto tmpBuf = createExternalBuffer(memory.get());
+			}
+		}
+
+		smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
+		{
+			IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+				.size = size,
+				.memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+				.alignmentLog2 = 10,
+			};
+
+			auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+
+			auto tmpBuf = createExternalBuffer(memory.get());
+			auto staging = createStaging();
+
+			auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+			for (uint32_t i = 0; i < size / 4; ++i)
+				ptr[i] = i;
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmd;
+			commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+			cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			IGPUCommandBuffer::SBufferCopy region = { .size = size };
+			assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+			cmd->end();
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+			queue->submit({ &submitInfo,&submitInfo + 1 });
+			m_device->waitIdle();
+			// re-import the same external handle into a fresh allocation that outlives 'memory'
+			escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory;
+		}
+
+		//{
+		//	constexpr size_t M = 32;
+		//	auto staging = createStaging(size * M);
+
+		//	auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+		//	for (uint32_t i = 0; i < (M * size) / 4; ++i)
+		//		ptr[i] = rand();
+
+		//	std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
+		//	commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
+
+		//	for (size_t i = 0; i < 1 << 10; ++i)
+		//	{
+		//		IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+		//			.size = size * M,
+		//			.memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+		//			.alignmentLog2 = 10,
+		//		};
+		//	RE:
+		//		auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+
+		//		if (!memory)
+		//		{
+		//			m_device->waitIdle();
+		//			for (size_t j = 0; j < i; ++j)
+		//				cmd[j] = 0;
+		//			goto END;
+		//		}
+		//		assert(memory);
+		//		auto tmpBuf = createExternalBuffer(memory.get());
+
+		//		cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		//		IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
+		//		assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+		//		cmd[i]->end();
+		//		IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
+		//		IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+		//		assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
+		//	}
+		//END:
+		//	m_device->waitIdle();
+		//}
+
+		{
+			auto tmpBuf = createExternalBuffer(escaped.get());
+			auto staging = createStaging();
+
+			smart_refctd_ptr<IGPUCommandBuffer> cmd;
+			commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+			cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			IGPUCommandBuffer::SBufferCopy region = { .size = size };
+			assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, &region));
+			cmd->end();
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+			IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+			auto qre = queue->submit({ &submitInfo,&submitInfo + 1 });
+			assert(IQueue::RESULT::SUCCESS == qre);
+			m_device->waitIdle();
+
+			auto& ptr = *(std::array<uint32_t, size / 4>*)staging->getBoundMemory().memory->getMappedPointer();
+			for (uint32_t i = 0; i < size / 4; ++i)
+				assert(ptr[i] == i);
+		}
+	}
+
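The 'escaped' allocation in testInterop probes handle lifetime: an exported external handle should keep the underlying memory alive past the original allocation. Schematically, what the blocks above exercise:

    // 1. allocate exportable device memory            -> 'memory' (+ its external handle)
    // 2. upload the pattern 0,1,2,... through a temporary external buffer bound to it
    // 3. re-import the handle into a fresh allocation -> 'escaped'
    // 4. let 'memory' and the temporary buffer die    -> the handle still owns the bytes
    // 5. bind a new buffer to 'escaped', copy back to staging, assert ptr[i] == i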
+	// Whether to keep invoking the above. In this example, because it's headless GPU compute, we do all the work in the app initialization.
+	bool keepRunning() override { return false; }
+
+	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app, you have to let the framework deal with your "game loop"
+	void workLoopBody() override {}
+};
+
+NBL_MAIN_FUNC(CUDA2VKApp)
\ No newline at end of file
diff --git a/63_CUDAInterop/pipeline.groovy b/63_CUDAInterop/pipeline.groovy
new file mode 100644
index 000000000..29909e3d1
--- /dev/null
+++ b/63_CUDAInterop/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CCUDAInteropBuilder extends IBuilder
+{
+	public CCUDAInteropBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+
+		return true
+	}
+
+	@Override
+	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CCUDAInteropBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a20a33a9..feb8307a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES)
 	#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
+	add_subdirectory(63_CUDAInterop EXCLUDE_FROM_ALL)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 endif()
\ No newline at end of file
diff --git a/common/MonoDeviceApplication.hpp b/common/MonoDeviceApplication.hpp
index ca4e6d449..4e6a6bf95 100644
--- a/common/MonoDeviceApplication.hpp
+++ b/common/MonoDeviceApplication.hpp
@@ -245,6 +245,7 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 		return retval;
 	}
 
+
 	virtual video::IQueue* getQueue(video::IQueue::FAMILY_FLAGS flags) const
 	{
 		// In the default implementation of everything I asked only for one queue from first compute family
@@ -252,7 +253,6 @@ class MonoDeviceApplication : public virtual MonoSystemMonoLoggerApplication
 		for (auto i = 0u; i < familyProperties.size(); i++)
 			if (familyProperties[i].queueFlags.hasFlags(flags))
 				return m_device->getQueue(i, 0);
-
 		return nullptr;
 	}
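For reference, a hypothetical caller of the reworked getQueue helper; COMPUTE_BIT is assumed to be one of the FAMILY_FLAGS values:

    // returns a queue from the first family whose flags contain the request;
    // the loop falls through to 'return nullptr' when nothing matches
    video::IQueue* q = getQueue(video::IQueue::FAMILY_FLAGS::COMPUTE_BIT);
    if (!q)
        return logFail("No matching queue family!");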